sched: do not set softirqs to nice +19
[deliverable/linux.git] / kernel / sched.c
1da177e4
LT
1/*
2 * kernel/sched.c
3 *
4 * Kernel scheduler and related syscalls
5 *
6 * Copyright (C) 1991-2002 Linus Torvalds
7 *
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff
11 * by Andrea Arcangeli
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin
19 */
20
21#include <linux/mm.h>
22#include <linux/module.h>
23#include <linux/nmi.h>
24#include <linux/init.h>
dff06c15 25#include <linux/uaccess.h>
1da177e4
LT
26#include <linux/highmem.h>
27#include <linux/smp_lock.h>
28#include <asm/mmu_context.h>
29#include <linux/interrupt.h>
c59ede7b 30#include <linux/capability.h>
1da177e4
LT
31#include <linux/completion.h>
32#include <linux/kernel_stat.h>
9a11b49a 33#include <linux/debug_locks.h>
1da177e4
LT
34#include <linux/security.h>
35#include <linux/notifier.h>
36#include <linux/profile.h>
7dfb7103 37#include <linux/freezer.h>
198e2f18 38#include <linux/vmalloc.h>
1da177e4
LT
39#include <linux/blkdev.h>
40#include <linux/delay.h>
41#include <linux/smp.h>
42#include <linux/threads.h>
43#include <linux/timer.h>
44#include <linux/rcupdate.h>
45#include <linux/cpu.h>
46#include <linux/cpuset.h>
47#include <linux/percpu.h>
48#include <linux/kthread.h>
49#include <linux/seq_file.h>
50#include <linux/syscalls.h>
51#include <linux/times.h>
8f0ab514 52#include <linux/tsacct_kern.h>
c6fd91f0 53#include <linux/kprobes.h>
0ff92245 54#include <linux/delayacct.h>
5517d86b 55#include <linux/reciprocal_div.h>
dff06c15 56#include <linux/unistd.h>
1da177e4 57
5517d86b 58#include <asm/tlb.h>
1da177e4 59
b035b6de
AD
60/*
61 * Scheduler clock - returns current time in nanosec units.
 62 * This is the default implementation.
63 * Architectures and sub-architectures can override this.
64 */
65unsigned long long __attribute__((weak)) sched_clock(void)
66{
67 return (unsigned long long)jiffies * (1000000000 / HZ);
68}
69
1da177e4
LT
70/*
71 * Convert user-nice values [ -20 ... 0 ... 19 ]
72 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
73 * and back.
74 */
75#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
76#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
77#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
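/*
 * Worked example (assuming the usual MAX_RT_PRIO of 100 and MAX_PRIO of
 * 140 from <linux/sched.h>, which are not shown in this file):
 * NICE_TO_PRIO(-20) maps to static priority 100, NICE_TO_PRIO(0) to 120
 * and NICE_TO_PRIO(19) to 139; PRIO_TO_NICE() simply inverts that mapping.
 */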
78
79/*
80 * 'User priority' is the nice value converted to something we
81 * can work with better when scaling various scheduler parameters,
82 * it's a [ 0 ... 39 ] range.
83 */
84#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
85#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
86#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
87
88/*
89 * Some helpers for converting nanosecond timing to jiffy resolution
90 */
91#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
92#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
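/*
 * Illustrative values only - the factor depends on the configured HZ:
 * with HZ=1000 one jiffy is 1,000,000 ns, so NS_TO_JIFFIES(2000000) is 2
 * and JIFFIES_TO_NS(5) is 5,000,000; with HZ=250 a jiffy is 4 ms.
 */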
93
6aa645ea
IM
94#define NICE_0_LOAD SCHED_LOAD_SCALE
95#define NICE_0_SHIFT SCHED_LOAD_SHIFT
96
1da177e4
LT
97/*
98 * These are the 'tuning knobs' of the scheduler:
99 *
100 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
101 * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
102 * Timeslices get refilled after they expire.
103 */
104#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
105#define DEF_TIMESLICE (100 * HZ / 1000)
2dd73a4f 106
5517d86b
ED
107#ifdef CONFIG_SMP
108/*
109 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
110 * Since cpu_power is a 'constant', we can use a reciprocal divide.
111 */
112static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
113{
114 return reciprocal_divide(load, sg->reciprocal_cpu_power);
115}
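/*
 * Rough sketch of what the reciprocal divide buys us (see
 * <linux/reciprocal_div.h>): reciprocal_value(d) precomputes roughly
 * 2^32/d, and reciprocal_divide(a, r) essentially returns
 * (u32)(((u64)a * r) >> 32). E.g. with __cpu_power == SCHED_LOAD_SCALE
 * (1024) the stored reciprocal is about 4194304, so a load of 2048
 * divides to (2048 * 4194304) >> 32 == 2 - a multiply and a shift
 * instead of a runtime division.
 */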
116
117/*
118 * Each time a sched group cpu_power is changed,
119 * we must compute its reciprocal value
120 */
121static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
122{
123 sg->__cpu_power += val;
124 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
125}
126#endif
127
634fa8c9
IM
128#define SCALE_PRIO(x, prio) \
129 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
130
91fcdd4e 131/*
634fa8c9 132 * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
91fcdd4e 133 * to time slice values: [800ms ... 100ms ... 5ms]
91fcdd4e 134 */
634fa8c9 135static unsigned int static_prio_timeslice(int static_prio)
2dd73a4f 136{
634fa8c9
IM
137 if (static_prio == NICE_TO_PRIO(19))
138 return 1;
139
140 if (static_prio < NICE_TO_PRIO(0))
141 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
142 else
143 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
2dd73a4f
PW
144}
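/*
 * Worked example, assuming HZ=1000 (so DEF_TIMESLICE is 100 jiffies and
 * MIN_TIMESLICE is 5) and the usual MAX_PRIO of 140: nice -20 (prio 100)
 * gets SCALE_PRIO(400, 100) = 800 jiffies = 800ms, nice 0 (prio 120) gets
 * SCALE_PRIO(100, 120) = 100 jiffies = 100ms, and nice +19 is
 * special-cased to a single jiffy.
 */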
145
e05606d3
IM
146static inline int rt_policy(int policy)
147{
148 if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
149 return 1;
150 return 0;
151}
152
153static inline int task_has_rt_policy(struct task_struct *p)
154{
155 return rt_policy(p->policy);
156}
157
1da177e4 158/*
6aa645ea 159 * This is the priority-queue data structure of the RT scheduling class:
1da177e4 160 */
6aa645ea
IM
161struct rt_prio_array {
162 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
163 struct list_head queue[MAX_RT_PRIO];
164};
165
166struct load_stat {
167 struct load_weight load;
168 u64 load_update_start, load_update_last;
169 unsigned long delta_fair, delta_exec, delta_stat;
170};
171
172/* CFS-related fields in a runqueue */
173struct cfs_rq {
174 struct load_weight load;
175 unsigned long nr_running;
176
177 s64 fair_clock;
178 u64 exec_clock;
179 s64 wait_runtime;
180 u64 sleeper_bonus;
181 unsigned long wait_runtime_overruns, wait_runtime_underruns;
182
183 struct rb_root tasks_timeline;
184 struct rb_node *rb_leftmost;
185 struct rb_node *rb_load_balance_curr;
186#ifdef CONFIG_FAIR_GROUP_SCHED
187 /* 'curr' points to currently running entity on this cfs_rq.
188 * It is set to NULL otherwise (i.e when none are currently running).
189 */
190 struct sched_entity *curr;
191 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
192
193 /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
194 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
195 * (like users, containers etc.)
196 *
197 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
198 * list is used during load balance.
199 */
200 struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
201#endif
202};
1da177e4 203
6aa645ea
IM
204/* Real-Time classes' related field in a runqueue: */
205struct rt_rq {
206 struct rt_prio_array active;
207 int rt_load_balance_idx;
208 struct list_head *rt_load_balance_head, *rt_load_balance_curr;
209};
210
1da177e4
LT
211/*
212 * This is the main, per-CPU runqueue data structure.
213 *
 214 * Locking rule: in places that need to lock multiple runqueues
 215 * (such as the load balancing or the thread migration code),
 216 * lock acquisitions must be ordered by ascending runqueue address.
217 */
70b97a7f 218struct rq {
6aa645ea 219 spinlock_t lock; /* runqueue lock */
1da177e4
LT
220
221 /*
222 * nr_running and cpu_load should be in the same cacheline because
223 * remote CPUs use both these fields when doing load calculation.
224 */
225 unsigned long nr_running;
6aa645ea
IM
226 #define CPU_LOAD_IDX_MAX 5
227 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
bdecea3a 228 unsigned char idle_at_tick;
46cb4b7c
SS
229#ifdef CONFIG_NO_HZ
230 unsigned char in_nohz_recently;
231#endif
6aa645ea
IM
232 struct load_stat ls; /* capture load from *all* tasks on this cpu */
233 unsigned long nr_load_updates;
234 u64 nr_switches;
235
236 struct cfs_rq cfs;
237#ifdef CONFIG_FAIR_GROUP_SCHED
238 struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
1da177e4 239#endif
6aa645ea 240 struct rt_rq rt;
1da177e4
LT
241
242 /*
243 * This is part of a global counter where only the total sum
244 * over all CPUs matters. A task can increase this counter on
245 * one CPU and if it got migrated afterwards it may decrease
246 * it on another CPU. Always updated under the runqueue lock:
247 */
248 unsigned long nr_uninterruptible;
249
36c8b586 250 struct task_struct *curr, *idle;
c9819f45 251 unsigned long next_balance;
1da177e4 252 struct mm_struct *prev_mm;
6aa645ea 253
6aa645ea
IM
254 u64 clock, prev_clock_raw;
255 s64 clock_max_delta;
256
257 unsigned int clock_warps, clock_overflows;
258 unsigned int clock_unstable_events;
259
260 struct sched_class *load_balance_class;
261
1da177e4
LT
262 atomic_t nr_iowait;
263
264#ifdef CONFIG_SMP
265 struct sched_domain *sd;
266
267 /* For active balancing */
268 int active_balance;
269 int push_cpu;
0a2966b4 270 int cpu; /* cpu of this runqueue */
1da177e4 271
36c8b586 272 struct task_struct *migration_thread;
1da177e4
LT
273 struct list_head migration_queue;
274#endif
275
276#ifdef CONFIG_SCHEDSTATS
277 /* latency stats */
278 struct sched_info rq_sched_info;
279
280 /* sys_sched_yield() stats */
281 unsigned long yld_exp_empty;
282 unsigned long yld_act_empty;
283 unsigned long yld_both_empty;
284 unsigned long yld_cnt;
285
286 /* schedule() stats */
287 unsigned long sched_switch;
288 unsigned long sched_cnt;
289 unsigned long sched_goidle;
290
291 /* try_to_wake_up() stats */
292 unsigned long ttwu_cnt;
293 unsigned long ttwu_local;
294#endif
fcb99371 295 struct lock_class_key rq_lock_key;
1da177e4
LT
296};
297
c3396620 298static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
5be9361c 299static DEFINE_MUTEX(sched_hotcpu_mutex);
1da177e4 300
dd41f596
IM
301static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
302{
303 rq->curr->sched_class->check_preempt_curr(rq, p);
304}
305
0a2966b4
CL
306static inline int cpu_of(struct rq *rq)
307{
308#ifdef CONFIG_SMP
309 return rq->cpu;
310#else
311 return 0;
312#endif
313}
314
20d315d4
IM
315/*
316 * Per-runqueue clock, as finegrained as the platform can give us:
317 */
318static unsigned long long __rq_clock(struct rq *rq)
319{
320 u64 prev_raw = rq->prev_clock_raw;
321 u64 now = sched_clock();
322 s64 delta = now - prev_raw;
323 u64 clock = rq->clock;
324
325 /*
326 * Protect against sched_clock() occasionally going backwards:
327 */
328 if (unlikely(delta < 0)) {
329 clock++;
330 rq->clock_warps++;
331 } else {
332 /*
333 * Catch too large forward jumps too:
334 */
335 if (unlikely(delta > 2*TICK_NSEC)) {
336 clock++;
337 rq->clock_overflows++;
338 } else {
339 if (unlikely(delta > rq->clock_max_delta))
340 rq->clock_max_delta = delta;
341 clock += delta;
342 }
343 }
344
345 rq->prev_clock_raw = now;
346 rq->clock = clock;
347
348 return clock;
349}
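/*
 * Illustrative numbers (assuming HZ=1000, i.e. TICK_NSEC of roughly 1ms):
 * a raw sched_clock() step of 0.3ms is added to rq->clock as-is, a
 * backwards step only bumps the clock by 1ns and is counted in
 * clock_warps, and a jump of, say, 5ms (more than two ticks) is likewise
 * clamped to a 1ns advance and counted in clock_overflows.
 */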
350
351static inline unsigned long long rq_clock(struct rq *rq)
352{
353 int this_cpu = smp_processor_id();
354
355 if (this_cpu == cpu_of(rq))
356 return __rq_clock(rq);
357
358 return rq->clock;
359}
360
674311d5
NP
361/*
362 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
1a20ff27 363 * See detach_destroy_domains: synchronize_sched for details.
674311d5
NP
364 *
365 * The domain tree of any CPU may only be accessed from within
366 * preempt-disabled sections.
367 */
48f24c4d
IM
368#define for_each_domain(cpu, __sd) \
369 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
1da177e4
LT
370
371#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
372#define this_rq() (&__get_cpu_var(runqueues))
373#define task_rq(p) cpu_rq(task_cpu(p))
374#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
375
138a8aeb
IM
376#ifdef CONFIG_FAIR_GROUP_SCHED
377/* Change a task's ->cfs_rq if it moves across CPUs */
378static inline void set_task_cfs_rq(struct task_struct *p)
379{
380 p->se.cfs_rq = &task_rq(p)->cfs;
381}
382#else
383static inline void set_task_cfs_rq(struct task_struct *p)
384{
385}
386#endif
387
1da177e4 388#ifndef prepare_arch_switch
4866cde0
NP
389# define prepare_arch_switch(next) do { } while (0)
390#endif
391#ifndef finish_arch_switch
392# define finish_arch_switch(prev) do { } while (0)
393#endif
394
395#ifndef __ARCH_WANT_UNLOCKED_CTXSW
70b97a7f 396static inline int task_running(struct rq *rq, struct task_struct *p)
4866cde0
NP
397{
398 return rq->curr == p;
399}
400
70b97a7f 401static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
4866cde0
NP
402{
403}
404
70b97a7f 405static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
4866cde0 406{
da04c035
IM
407#ifdef CONFIG_DEBUG_SPINLOCK
408 /* this is a valid case when another task releases the spinlock */
409 rq->lock.owner = current;
410#endif
8a25d5de
IM
411 /*
412 * If we are tracking spinlock dependencies then we have to
413 * fix up the runqueue lock - which gets 'carried over' from
414 * prev into current:
415 */
416 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
417
4866cde0
NP
418 spin_unlock_irq(&rq->lock);
419}
420
421#else /* __ARCH_WANT_UNLOCKED_CTXSW */
70b97a7f 422static inline int task_running(struct rq *rq, struct task_struct *p)
4866cde0
NP
423{
424#ifdef CONFIG_SMP
425 return p->oncpu;
426#else
427 return rq->curr == p;
428#endif
429}
430
70b97a7f 431static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
4866cde0
NP
432{
433#ifdef CONFIG_SMP
434 /*
435 * We can optimise this out completely for !SMP, because the
436 * SMP rebalancing from interrupt is the only thing that cares
437 * here.
438 */
439 next->oncpu = 1;
440#endif
441#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
442 spin_unlock_irq(&rq->lock);
443#else
444 spin_unlock(&rq->lock);
445#endif
446}
447
70b97a7f 448static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
4866cde0
NP
449{
450#ifdef CONFIG_SMP
451 /*
452 * After ->oncpu is cleared, the task can be moved to a different CPU.
453 * We must ensure this doesn't happen until the switch is completely
454 * finished.
455 */
456 smp_wmb();
457 prev->oncpu = 0;
458#endif
459#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
460 local_irq_enable();
1da177e4 461#endif
4866cde0
NP
462}
463#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
1da177e4 464
b29739f9
IM
465/*
466 * __task_rq_lock - lock the runqueue a given task resides on.
 467 * Must be called with interrupts disabled.
468 */
70b97a7f 469static inline struct rq *__task_rq_lock(struct task_struct *p)
b29739f9
IM
470 __acquires(rq->lock)
471{
70b97a7f 472 struct rq *rq;
b29739f9
IM
473
474repeat_lock_task:
475 rq = task_rq(p);
476 spin_lock(&rq->lock);
477 if (unlikely(rq != task_rq(p))) {
478 spin_unlock(&rq->lock);
479 goto repeat_lock_task;
480 }
481 return rq;
482}
483
1da177e4
LT
484/*
485 * task_rq_lock - lock the runqueue a given task resides on and disable
486 * interrupts. Note the ordering: we can safely lookup the task_rq without
487 * explicitly disabling preemption.
488 */
70b97a7f 489static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
1da177e4
LT
490 __acquires(rq->lock)
491{
70b97a7f 492 struct rq *rq;
1da177e4
LT
493
494repeat_lock_task:
495 local_irq_save(*flags);
496 rq = task_rq(p);
497 spin_lock(&rq->lock);
498 if (unlikely(rq != task_rq(p))) {
499 spin_unlock_irqrestore(&rq->lock, *flags);
500 goto repeat_lock_task;
501 }
502 return rq;
503}
504
70b97a7f 505static inline void __task_rq_unlock(struct rq *rq)
b29739f9
IM
506 __releases(rq->lock)
507{
508 spin_unlock(&rq->lock);
509}
510
70b97a7f 511static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
1da177e4
LT
512 __releases(rq->lock)
513{
514 spin_unlock_irqrestore(&rq->lock, *flags);
515}
516
1da177e4 517/*
cc2a73b5 518 * this_rq_lock - lock this runqueue and disable interrupts.
1da177e4 519 */
70b97a7f 520static inline struct rq *this_rq_lock(void)
1da177e4
LT
521 __acquires(rq->lock)
522{
70b97a7f 523 struct rq *rq;
1da177e4
LT
524
525 local_irq_disable();
526 rq = this_rq();
527 spin_lock(&rq->lock);
528
529 return rq;
530}
531
1b9f19c2
IM
532/*
 533 * CPU frequency is/was unstable - start afresh by setting prev_clock_raw:
534 */
535void sched_clock_unstable_event(void)
536{
537 unsigned long flags;
538 struct rq *rq;
539
540 rq = task_rq_lock(current, &flags);
541 rq->prev_clock_raw = sched_clock();
542 rq->clock_unstable_events++;
543 task_rq_unlock(rq, &flags);
544}
545
c24d20db
IM
546/*
547 * resched_task - mark a task 'to be rescheduled now'.
548 *
549 * On UP this means the setting of the need_resched flag, on SMP it
550 * might also involve a cross-CPU call to trigger the scheduler on
551 * the target CPU.
552 */
553#ifdef CONFIG_SMP
554
555#ifndef tsk_is_polling
556#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
557#endif
558
559static void resched_task(struct task_struct *p)
560{
561 int cpu;
562
563 assert_spin_locked(&task_rq(p)->lock);
564
565 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
566 return;
567
568 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
569
570 cpu = task_cpu(p);
571 if (cpu == smp_processor_id())
572 return;
573
574 /* NEED_RESCHED must be visible before we test polling */
575 smp_mb();
576 if (!tsk_is_polling(p))
577 smp_send_reschedule(cpu);
578}
579
580static void resched_cpu(int cpu)
581{
582 struct rq *rq = cpu_rq(cpu);
583 unsigned long flags;
584
585 if (!spin_trylock_irqsave(&rq->lock, flags))
586 return;
587 resched_task(cpu_curr(cpu));
588 spin_unlock_irqrestore(&rq->lock, flags);
589}
590#else
591static inline void resched_task(struct task_struct *p)
592{
593 assert_spin_locked(&task_rq(p)->lock);
594 set_tsk_need_resched(p);
595}
596#endif
597
45bf76df
IM
598static u64 div64_likely32(u64 divident, unsigned long divisor)
599{
600#if BITS_PER_LONG == 32
601 if (likely(divident <= 0xffffffffULL))
602 return (u32)divident / divisor;
603 do_div(divident, divisor);
604
605 return divident;
606#else
607 return divident / divisor;
608#endif
609}
610
611#if BITS_PER_LONG == 32
612# define WMULT_CONST (~0UL)
613#else
614# define WMULT_CONST (1UL << 32)
615#endif
616
617#define WMULT_SHIFT 32
618
619static inline unsigned long
620calc_delta_mine(unsigned long delta_exec, unsigned long weight,
621 struct load_weight *lw)
622{
623 u64 tmp;
624
625 if (unlikely(!lw->inv_weight))
626 lw->inv_weight = WMULT_CONST / lw->weight;
627
628 tmp = (u64)delta_exec * weight;
629 /*
630 * Check whether we'd overflow the 64-bit multiplication:
631 */
632 if (unlikely(tmp > WMULT_CONST)) {
633 tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
634 >> (WMULT_SHIFT/2);
635 } else {
636 tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
637 }
638
639 return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
640}
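/*
 * Worked example of the weighting math above: the result approximates
 * delta_exec * weight / lw->weight, computed with the precalculated
 * inverse so no 64-bit division is needed. For delta_exec = 1,000,000 ns,
 * weight = NICE_0_LOAD (1024) and lw->weight = 2048, inv_weight is about
 * 2^32/2048, the 64-bit product stays below WMULT_CONST, and the shift by
 * WMULT_SHIFT yields roughly 500,000 ns (then capped at
 * sysctl_sched_runtime_limit).
 */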
641
642static inline unsigned long
643calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
644{
645 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
646}
647
648static void update_load_add(struct load_weight *lw, unsigned long inc)
649{
650 lw->weight += inc;
651 lw->inv_weight = 0;
652}
653
654static void update_load_sub(struct load_weight *lw, unsigned long dec)
655{
656 lw->weight -= dec;
657 lw->inv_weight = 0;
658}
659
660static void __update_curr_load(struct rq *rq, struct load_stat *ls)
661{
662 if (rq->curr != rq->idle && ls->load.weight) {
663 ls->delta_exec += ls->delta_stat;
664 ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
665 ls->delta_stat = 0;
666 }
667}
668
669/*
670 * Update delta_exec, delta_fair fields for rq.
671 *
672 * delta_fair clock advances at a rate inversely proportional to
673 * total load (rq->ls.load.weight) on the runqueue, while
674 * delta_exec advances at the same rate as wall-clock (provided
675 * cpu is not idle).
676 *
677 * delta_exec / delta_fair is a measure of the (smoothened) load on this
678 * runqueue over any given interval. This (smoothened) load is used
679 * during load balance.
680 *
681 * This function is called /before/ updating rq->ls.load
682 * and when switching tasks.
683 */
684static void update_curr_load(struct rq *rq, u64 now)
685{
686 struct load_stat *ls = &rq->ls;
687 u64 start;
688
689 start = ls->load_update_start;
690 ls->load_update_start = now;
691 ls->delta_stat += now - start;
692 /*
693 * Stagger updates to ls->delta_fair. Very frequent updates
694 * can be expensive.
695 */
696 if (ls->delta_stat >= sysctl_sched_stat_granularity)
697 __update_curr_load(rq, ls);
698}
699
2dd73a4f
PW
700/*
701 * To aid in avoiding the subversion of "niceness" due to uneven distribution
702 * of tasks with abnormal "nice" values across CPUs the contribution that
703 * each task makes to its run queue's load is weighted according to its
704 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
705 * scaled version of the new time slice allocation that they receive on time
706 * slice expiry etc.
707 */
708
709/*
710 * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
711 * If static_prio_timeslice() is ever changed to break this assumption then
712 * this code will need modification
713 */
714#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
dd41f596 715#define load_weight(lp) \
2dd73a4f
PW
716 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
717#define PRIO_TO_LOAD_WEIGHT(prio) \
dd41f596 718 load_weight(static_prio_timeslice(prio))
2dd73a4f 719#define RTPRIO_TO_LOAD_WEIGHT(rp) \
dd41f596
IM
720 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp))
721
722#define WEIGHT_IDLEPRIO 2
723#define WMULT_IDLEPRIO (1 << 31)
724
725/*
726 * Nice levels are multiplicative, with a gentle 10% change for every
727 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
728 * nice 1, it will get ~10% less CPU time than another CPU-bound task
729 * that remained on nice 0.
730 *
731 * The "10% effect" is relative and cumulative: from _any_ nice level,
732 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
733 * it's +10% CPU usage.
734 */
735static const int prio_to_weight[40] = {
736/* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
737/* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280,
738/* 0 */ NICE_0_LOAD /* 1024 */,
739/* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137,
740/* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15,
741};
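/*
 * Each step in this table is roughly a factor of 1.25: e.g. nice 0 is
 * 1024 and nice 1 is 819 (~1024/1.25), so two CPU hogs at nice 0 and
 * nice 1 split the CPU about 1024:819, i.e. roughly 55%:45% - the ~10%
 * effect described above.
 */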
742
743static const u32 prio_to_wmult[40] = {
744 48356, 60446, 75558, 94446, 118058, 147573,
745 184467, 230589, 288233, 360285, 450347,
746 562979, 703746, 879575, 1099582, 1374389,
 747 1717986, 2147483, 2684354, 3355443, 4194304,
 748 5244160, 6557201, 8196502, 10250518, 12782640,
749 16025997, 19976592, 24970740, 31350126, 39045157,
750 49367440, 61356675, 76695844, 95443717, 119304647,
751 148102320, 186737708, 238609294, 286331153,
752};
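/*
 * Each wmult entry is approximately 2^32 / prio_to_weight[i]: e.g. the
 * nice-0 entry is 4194304 == 2^32 / 1024. calc_delta_mine() uses these
 * precomputed inverses so that dividing by a task's weight becomes a
 * multiply and a 32-bit shift.
 */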
2dd73a4f 753
36c8b586 754static inline void
dd41f596 755inc_load(struct rq *rq, const struct task_struct *p, u64 now)
2dd73a4f 756{
dd41f596
IM
757 update_curr_load(rq, now);
758 update_load_add(&rq->ls.load, p->se.load.weight);
2dd73a4f
PW
759}
760
36c8b586 761static inline void
dd41f596 762dec_load(struct rq *rq, const struct task_struct *p, u64 now)
2dd73a4f 763{
dd41f596
IM
764 update_curr_load(rq, now);
765 update_load_sub(&rq->ls.load, p->se.load.weight);
2dd73a4f
PW
766}
767
dd41f596 768static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
2dd73a4f
PW
769{
770 rq->nr_running++;
dd41f596 771 inc_load(rq, p, now);
2dd73a4f
PW
772}
773
dd41f596 774static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
2dd73a4f
PW
775{
776 rq->nr_running--;
dd41f596 777 dec_load(rq, p, now);
2dd73a4f
PW
778}
779
dd41f596
IM
780static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
781
782/*
783 * runqueue iterator, to support SMP load-balancing between different
784 * scheduling classes, without having to expose their internal data
785 * structures to the load-balancing proper:
786 */
787struct rq_iterator {
788 void *arg;
789 struct task_struct *(*start)(void *);
790 struct task_struct *(*next)(void *);
791};
792
793static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
794 unsigned long max_nr_move, unsigned long max_load_move,
795 struct sched_domain *sd, enum cpu_idle_type idle,
796 int *all_pinned, unsigned long *load_moved,
797 int this_best_prio, int best_prio, int best_prio_seen,
798 struct rq_iterator *iterator);
799
800#include "sched_stats.h"
801#include "sched_rt.c"
802#include "sched_fair.c"
803#include "sched_idletask.c"
804#ifdef CONFIG_SCHED_DEBUG
805# include "sched_debug.c"
806#endif
807
808#define sched_class_highest (&rt_sched_class)
809
45bf76df
IM
810static void set_load_weight(struct task_struct *p)
811{
dd41f596
IM
812 task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
813 p->se.wait_runtime = 0;
814
45bf76df 815 if (task_has_rt_policy(p)) {
dd41f596
IM
816 p->se.load.weight = prio_to_weight[0] * 2;
817 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
818 return;
819 }
45bf76df 820
dd41f596
IM
821 /*
822 * SCHED_IDLE tasks get minimal weight:
823 */
824 if (p->policy == SCHED_IDLE) {
825 p->se.load.weight = WEIGHT_IDLEPRIO;
826 p->se.load.inv_weight = WMULT_IDLEPRIO;
827 return;
828 }
71f8bd46 829
dd41f596
IM
830 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
831 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
71f8bd46
IM
832}
833
dd41f596
IM
834static void
835enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
71f8bd46 836{
dd41f596
IM
837 sched_info_queued(p);
838 p->sched_class->enqueue_task(rq, p, wakeup, now);
839 p->se.on_rq = 1;
71f8bd46
IM
840}
841
dd41f596
IM
842static void
843dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
71f8bd46 844{
dd41f596
IM
845 p->sched_class->dequeue_task(rq, p, sleep, now);
846 p->se.on_rq = 0;
71f8bd46
IM
847}
848
14531189 849/*
dd41f596 850 * __normal_prio - return the priority that is based on the static prio
14531189 851 */
14531189
IM
852static inline int __normal_prio(struct task_struct *p)
853{
dd41f596 854 return p->static_prio;
14531189
IM
855}
856
b29739f9
IM
857/*
858 * Calculate the expected normal priority: i.e. priority
859 * without taking RT-inheritance into account. Might be
860 * boosted by interactivity modifiers. Changes upon fork,
861 * setprio syscalls, and whenever the interactivity
862 * estimator recalculates.
863 */
36c8b586 864static inline int normal_prio(struct task_struct *p)
b29739f9
IM
865{
866 int prio;
867
e05606d3 868 if (task_has_rt_policy(p))
b29739f9
IM
869 prio = MAX_RT_PRIO-1 - p->rt_priority;
870 else
871 prio = __normal_prio(p);
872 return prio;
873}
874
875/*
876 * Calculate the current priority, i.e. the priority
877 * taken into account by the scheduler. This value might
878 * be boosted by RT tasks, or might be boosted by
879 * interactivity modifiers. Will be RT if the task got
880 * RT-boosted. If not then it returns p->normal_prio.
881 */
36c8b586 882static int effective_prio(struct task_struct *p)
b29739f9
IM
883{
884 p->normal_prio = normal_prio(p);
885 /*
886 * If we are RT tasks or we were boosted to RT priority,
887 * keep the priority unchanged. Otherwise, update priority
888 * to the normal priority:
889 */
890 if (!rt_prio(p->prio))
891 return p->normal_prio;
892 return p->prio;
893}
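/*
 * Examples (assuming the usual MAX_RT_PRIO of 100): a SCHED_FIFO task
 * with rt_priority 50 gets normal_prio 49, while a SCHED_NORMAL task at
 * nice +5 keeps its static_prio of 125. effective_prio() returns the
 * normal priority unless the task is currently PI-boosted to an RT
 * priority, in which case the boosted p->prio is preserved.
 */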
894
1da177e4 895/*
dd41f596 896 * activate_task - move a task to the runqueue.
1da177e4 897 */
dd41f596 898static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1da177e4 899{
dd41f596 900 u64 now = rq_clock(rq);
d425b274 901
dd41f596
IM
902 if (p->state == TASK_UNINTERRUPTIBLE)
903 rq->nr_uninterruptible--;
1da177e4 904
dd41f596
IM
905 enqueue_task(rq, p, wakeup, now);
906 inc_nr_running(p, rq, now);
1da177e4
LT
907}
908
909/*
dd41f596 910 * activate_idle_task - move idle task to the _front_ of runqueue.
1da177e4 911 */
dd41f596 912static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
1da177e4 913{
dd41f596 914 u64 now = rq_clock(rq);
1da177e4 915
dd41f596
IM
916 if (p->state == TASK_UNINTERRUPTIBLE)
917 rq->nr_uninterruptible--;
ece8a684 918
dd41f596
IM
919 enqueue_task(rq, p, 0, now);
920 inc_nr_running(p, rq, now);
1da177e4
LT
921}
922
923/*
924 * deactivate_task - remove a task from the runqueue.
925 */
dd41f596 926static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1da177e4 927{
dd41f596
IM
928 u64 now = rq_clock(rq);
929
930 if (p->state == TASK_UNINTERRUPTIBLE)
931 rq->nr_uninterruptible++;
932
933 dequeue_task(rq, p, sleep, now);
934 dec_nr_running(p, rq, now);
1da177e4
LT
935}
936
1da177e4
LT
937/**
938 * task_curr - is this task currently executing on a CPU?
939 * @p: the task in question.
940 */
36c8b586 941inline int task_curr(const struct task_struct *p)
1da177e4
LT
942{
943 return cpu_curr(task_cpu(p)) == p;
944}
945
2dd73a4f
PW
946/* Used instead of source_load when we know the type == 0 */
947unsigned long weighted_cpuload(const int cpu)
948{
dd41f596
IM
949 return cpu_rq(cpu)->ls.load.weight;
950}
951
952static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
953{
954#ifdef CONFIG_SMP
955 task_thread_info(p)->cpu = cpu;
956 set_task_cfs_rq(p);
957#endif
2dd73a4f
PW
958}
959
1da177e4 960#ifdef CONFIG_SMP
c65cc870 961
dd41f596 962void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
c65cc870 963{
dd41f596
IM
964 int old_cpu = task_cpu(p);
965 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
966 u64 clock_offset, fair_clock_offset;
967
968 clock_offset = old_rq->clock - new_rq->clock;
969 fair_clock_offset = old_rq->cfs.fair_clock -
970 new_rq->cfs.fair_clock;
971 if (p->se.wait_start)
972 p->se.wait_start -= clock_offset;
973 if (p->se.wait_start_fair)
974 p->se.wait_start_fair -= fair_clock_offset;
975 if (p->se.sleep_start)
976 p->se.sleep_start -= clock_offset;
977 if (p->se.block_start)
978 p->se.block_start -= clock_offset;
979 if (p->se.sleep_start_fair)
980 p->se.sleep_start_fair -= fair_clock_offset;
981
982 __set_task_cpu(p, new_cpu);
c65cc870
IM
983}
984
70b97a7f 985struct migration_req {
1da177e4 986 struct list_head list;
1da177e4 987
36c8b586 988 struct task_struct *task;
1da177e4
LT
989 int dest_cpu;
990
1da177e4 991 struct completion done;
70b97a7f 992};
1da177e4
LT
993
994/*
995 * The task's runqueue lock must be held.
996 * Returns true if you have to wait for migration thread.
997 */
36c8b586 998static int
70b97a7f 999migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1da177e4 1000{
70b97a7f 1001 struct rq *rq = task_rq(p);
1da177e4
LT
1002
1003 /*
1004 * If the task is not on a runqueue (and not running), then
1005 * it is sufficient to simply update the task's cpu field.
1006 */
dd41f596 1007 if (!p->se.on_rq && !task_running(rq, p)) {
1da177e4
LT
1008 set_task_cpu(p, dest_cpu);
1009 return 0;
1010 }
1011
1012 init_completion(&req->done);
1da177e4
LT
1013 req->task = p;
1014 req->dest_cpu = dest_cpu;
1015 list_add(&req->list, &rq->migration_queue);
48f24c4d 1016
1da177e4
LT
1017 return 1;
1018}
1019
1020/*
1021 * wait_task_inactive - wait for a thread to unschedule.
1022 *
1023 * The caller must ensure that the task *will* unschedule sometime soon,
1024 * else this function might spin for a *long* time. This function can't
1025 * be called with interrupts off, or it may introduce deadlock with
1026 * smp_call_function() if an IPI is sent by the same process we are
1027 * waiting to become inactive.
1028 */
36c8b586 1029void wait_task_inactive(struct task_struct *p)
1da177e4
LT
1030{
1031 unsigned long flags;
dd41f596 1032 int running, on_rq;
70b97a7f 1033 struct rq *rq;
1da177e4
LT
1034
1035repeat:
fa490cfd
LT
1036 /*
1037 * We do the initial early heuristics without holding
1038 * any task-queue locks at all. We'll only try to get
1039 * the runqueue lock when things look like they will
1040 * work out!
1041 */
1042 rq = task_rq(p);
1043
1044 /*
1045 * If the task is actively running on another CPU
1046 * still, just relax and busy-wait without holding
1047 * any locks.
1048 *
1049 * NOTE! Since we don't hold any locks, it's not
1050 * even sure that "rq" stays as the right runqueue!
1051 * But we don't care, since "task_running()" will
1052 * return false if the runqueue has changed and p
1053 * is actually now running somewhere else!
1054 */
1055 while (task_running(rq, p))
1056 cpu_relax();
1057
1058 /*
1059 * Ok, time to look more closely! We need the rq
1060 * lock now, to be *sure*. If we're wrong, we'll
1061 * just go back and repeat.
1062 */
1da177e4 1063 rq = task_rq_lock(p, &flags);
fa490cfd 1064 running = task_running(rq, p);
dd41f596 1065 on_rq = p->se.on_rq;
fa490cfd
LT
1066 task_rq_unlock(rq, &flags);
1067
1068 /*
1069 * Was it really running after all now that we
1070 * checked with the proper locks actually held?
1071 *
1072 * Oops. Go back and try again..
1073 */
1074 if (unlikely(running)) {
1da177e4 1075 cpu_relax();
1da177e4
LT
1076 goto repeat;
1077 }
fa490cfd
LT
1078
1079 /*
1080 * It's not enough that it's not actively running,
1081 * it must be off the runqueue _entirely_, and not
1082 * preempted!
1083 *
 1084 * So if it was still runnable (but just not actively
1085 * running right now), it's preempted, and we should
1086 * yield - it could be a while.
1087 */
dd41f596 1088 if (unlikely(on_rq)) {
fa490cfd
LT
1089 yield();
1090 goto repeat;
1091 }
1092
1093 /*
1094 * Ahh, all good. It wasn't running, and it wasn't
1095 * runnable, which means that it will never become
1096 * running in the future either. We're all done!
1097 */
1da177e4
LT
1098}
1099
1100/***
1101 * kick_process - kick a running thread to enter/exit the kernel
1102 * @p: the to-be-kicked thread
1103 *
1104 * Cause a process which is running on another CPU to enter
1105 * kernel-mode, without any delay. (to get signals handled.)
1106 *
 1107 * NOTE: this function doesn't have to take the runqueue lock,
1108 * because all it wants to ensure is that the remote task enters
1109 * the kernel. If the IPI races and the task has been migrated
1110 * to another CPU then no harm is done and the purpose has been
1111 * achieved as well.
1112 */
36c8b586 1113void kick_process(struct task_struct *p)
1da177e4
LT
1114{
1115 int cpu;
1116
1117 preempt_disable();
1118 cpu = task_cpu(p);
1119 if ((cpu != smp_processor_id()) && task_curr(p))
1120 smp_send_reschedule(cpu);
1121 preempt_enable();
1122}
1123
1124/*
2dd73a4f
PW
1125 * Return a low guess at the load of a migration-source cpu weighted
1126 * according to the scheduling class and "nice" value.
1da177e4
LT
1127 *
1128 * We want to under-estimate the load of migration sources, to
1129 * balance conservatively.
1130 */
a2000572 1131static inline unsigned long source_load(int cpu, int type)
1da177e4 1132{
70b97a7f 1133 struct rq *rq = cpu_rq(cpu);
dd41f596 1134 unsigned long total = weighted_cpuload(cpu);
2dd73a4f 1135
3b0bd9bc 1136 if (type == 0)
dd41f596 1137 return total;
b910472d 1138
dd41f596 1139 return min(rq->cpu_load[type-1], total);
1da177e4
LT
1140}
1141
1142/*
2dd73a4f
PW
1143 * Return a high guess at the load of a migration-target cpu weighted
1144 * according to the scheduling class and "nice" value.
1da177e4 1145 */
a2000572 1146static inline unsigned long target_load(int cpu, int type)
1da177e4 1147{
70b97a7f 1148 struct rq *rq = cpu_rq(cpu);
dd41f596 1149 unsigned long total = weighted_cpuload(cpu);
2dd73a4f 1150
7897986b 1151 if (type == 0)
dd41f596 1152 return total;
3b0bd9bc 1153
dd41f596 1154 return max(rq->cpu_load[type-1], total);
2dd73a4f
PW
1155}
1156
1157/*
1158 * Return the average load per task on the cpu's run queue
1159 */
1160static inline unsigned long cpu_avg_load_per_task(int cpu)
1161{
70b97a7f 1162 struct rq *rq = cpu_rq(cpu);
dd41f596 1163 unsigned long total = weighted_cpuload(cpu);
2dd73a4f
PW
1164 unsigned long n = rq->nr_running;
1165
dd41f596 1166 return n ? total / n : SCHED_LOAD_SCALE;
1da177e4
LT
1167}
1168
147cbb4b
NP
1169/*
1170 * find_idlest_group finds and returns the least busy CPU group within the
1171 * domain.
1172 */
1173static struct sched_group *
1174find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1175{
1176 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1177 unsigned long min_load = ULONG_MAX, this_load = 0;
1178 int load_idx = sd->forkexec_idx;
1179 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1180
1181 do {
1182 unsigned long load, avg_load;
1183 int local_group;
1184 int i;
1185
da5a5522
BD
1186 /* Skip over this group if it has no CPUs allowed */
1187 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
1188 goto nextgroup;
1189
147cbb4b 1190 local_group = cpu_isset(this_cpu, group->cpumask);
147cbb4b
NP
1191
1192 /* Tally up the load of all CPUs in the group */
1193 avg_load = 0;
1194
1195 for_each_cpu_mask(i, group->cpumask) {
1196 /* Bias balancing toward cpus of our domain */
1197 if (local_group)
1198 load = source_load(i, load_idx);
1199 else
1200 load = target_load(i, load_idx);
1201
1202 avg_load += load;
1203 }
1204
1205 /* Adjust by relative CPU power of the group */
5517d86b
ED
1206 avg_load = sg_div_cpu_power(group,
1207 avg_load * SCHED_LOAD_SCALE);
147cbb4b
NP
1208
1209 if (local_group) {
1210 this_load = avg_load;
1211 this = group;
1212 } else if (avg_load < min_load) {
1213 min_load = avg_load;
1214 idlest = group;
1215 }
da5a5522 1216nextgroup:
147cbb4b
NP
1217 group = group->next;
1218 } while (group != sd->groups);
1219
1220 if (!idlest || 100*this_load < imbalance*min_load)
1221 return NULL;
1222 return idlest;
1223}
1224
1225/*
0feaece9 1226 * find_idlest_cpu - find the idlest cpu among the cpus in group.
147cbb4b 1227 */
95cdf3b7
IM
1228static int
1229find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
147cbb4b 1230{
da5a5522 1231 cpumask_t tmp;
147cbb4b
NP
1232 unsigned long load, min_load = ULONG_MAX;
1233 int idlest = -1;
1234 int i;
1235
da5a5522
BD
1236 /* Traverse only the allowed CPUs */
1237 cpus_and(tmp, group->cpumask, p->cpus_allowed);
1238
1239 for_each_cpu_mask(i, tmp) {
2dd73a4f 1240 load = weighted_cpuload(i);
147cbb4b
NP
1241
1242 if (load < min_load || (load == min_load && i == this_cpu)) {
1243 min_load = load;
1244 idlest = i;
1245 }
1246 }
1247
1248 return idlest;
1249}
1250
476d139c
NP
1251/*
1252 * sched_balance_self: balance the current task (running on cpu) in domains
1253 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1254 * SD_BALANCE_EXEC.
1255 *
1256 * Balance, ie. select the least loaded group.
1257 *
1258 * Returns the target CPU number, or the same CPU if no balancing is needed.
1259 *
1260 * preempt must be disabled.
1261 */
1262static int sched_balance_self(int cpu, int flag)
1263{
1264 struct task_struct *t = current;
1265 struct sched_domain *tmp, *sd = NULL;
147cbb4b 1266
c96d145e 1267 for_each_domain(cpu, tmp) {
5c45bf27
SS
1268 /*
1269 * If power savings logic is enabled for a domain, stop there.
1270 */
1271 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1272 break;
476d139c
NP
1273 if (tmp->flags & flag)
1274 sd = tmp;
c96d145e 1275 }
476d139c
NP
1276
1277 while (sd) {
1278 cpumask_t span;
1279 struct sched_group *group;
1a848870
SS
1280 int new_cpu, weight;
1281
1282 if (!(sd->flags & flag)) {
1283 sd = sd->child;
1284 continue;
1285 }
476d139c
NP
1286
1287 span = sd->span;
1288 group = find_idlest_group(sd, t, cpu);
1a848870
SS
1289 if (!group) {
1290 sd = sd->child;
1291 continue;
1292 }
476d139c 1293
da5a5522 1294 new_cpu = find_idlest_cpu(group, t, cpu);
1a848870
SS
1295 if (new_cpu == -1 || new_cpu == cpu) {
1296 /* Now try balancing at a lower domain level of cpu */
1297 sd = sd->child;
1298 continue;
1299 }
476d139c 1300
1a848870 1301 /* Now try balancing at a lower domain level of new_cpu */
476d139c 1302 cpu = new_cpu;
476d139c
NP
1303 sd = NULL;
1304 weight = cpus_weight(span);
1305 for_each_domain(cpu, tmp) {
1306 if (weight <= cpus_weight(tmp->span))
1307 break;
1308 if (tmp->flags & flag)
1309 sd = tmp;
1310 }
1311 /* while loop will break here if sd == NULL */
1312 }
1313
1314 return cpu;
1315}
1316
1317#endif /* CONFIG_SMP */
1da177e4
LT
1318
1319/*
1320 * wake_idle() will wake a task on an idle cpu if task->cpu is
1321 * not idle and an idle cpu is available. The span of cpus to
1322 * search starts with cpus closest then further out as needed,
1323 * so we always favor a closer, idle cpu.
1324 *
1325 * Returns the CPU we should wake onto.
1326 */
1327#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
36c8b586 1328static int wake_idle(int cpu, struct task_struct *p)
1da177e4
LT
1329{
1330 cpumask_t tmp;
1331 struct sched_domain *sd;
1332 int i;
1333
4953198b
SS
1334 /*
1335 * If it is idle, then it is the best cpu to run this task.
1336 *
1337 * This cpu is also the best, if it has more than one task already.
 1338 * Siblings must also be busy (in most cases) as they didn't already
 1339 * pick up the extra load from this cpu, hence we need not check
 1340 * sibling runqueue info. This will avoid the checks and cache miss
 1341 * penalties associated with that.
1342 */
1343 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
1da177e4
LT
1344 return cpu;
1345
1346 for_each_domain(cpu, sd) {
1347 if (sd->flags & SD_WAKE_IDLE) {
e0f364f4 1348 cpus_and(tmp, sd->span, p->cpus_allowed);
1da177e4
LT
1349 for_each_cpu_mask(i, tmp) {
1350 if (idle_cpu(i))
1351 return i;
1352 }
1353 }
e0f364f4
NP
1354 else
1355 break;
1da177e4
LT
1356 }
1357 return cpu;
1358}
1359#else
36c8b586 1360static inline int wake_idle(int cpu, struct task_struct *p)
1da177e4
LT
1361{
1362 return cpu;
1363}
1364#endif
1365
1366/***
1367 * try_to_wake_up - wake up a thread
1368 * @p: the to-be-woken-up thread
1369 * @state: the mask of task states that can be woken
1370 * @sync: do a synchronous wakeup?
1371 *
1372 * Put it on the run-queue if it's not already there. The "current"
1373 * thread is always on the run-queue (except when the actual
1374 * re-schedule is in progress), and as such you're allowed to do
1375 * the simpler "current->state = TASK_RUNNING" to mark yourself
1376 * runnable without the overhead of this.
1377 *
1378 * returns failure only if the task is already active.
1379 */
36c8b586 1380static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1da177e4
LT
1381{
1382 int cpu, this_cpu, success = 0;
1383 unsigned long flags;
1384 long old_state;
70b97a7f 1385 struct rq *rq;
1da177e4 1386#ifdef CONFIG_SMP
7897986b 1387 struct sched_domain *sd, *this_sd = NULL;
70b97a7f 1388 unsigned long load, this_load;
1da177e4
LT
1389 int new_cpu;
1390#endif
1391
1392 rq = task_rq_lock(p, &flags);
1393 old_state = p->state;
1394 if (!(old_state & state))
1395 goto out;
1396
dd41f596 1397 if (p->se.on_rq)
1da177e4
LT
1398 goto out_running;
1399
1400 cpu = task_cpu(p);
1401 this_cpu = smp_processor_id();
1402
1403#ifdef CONFIG_SMP
1404 if (unlikely(task_running(rq, p)))
1405 goto out_activate;
1406
7897986b
NP
1407 new_cpu = cpu;
1408
1da177e4
LT
1409 schedstat_inc(rq, ttwu_cnt);
1410 if (cpu == this_cpu) {
1411 schedstat_inc(rq, ttwu_local);
7897986b
NP
1412 goto out_set_cpu;
1413 }
1414
1415 for_each_domain(this_cpu, sd) {
1416 if (cpu_isset(cpu, sd->span)) {
1417 schedstat_inc(sd, ttwu_wake_remote);
1418 this_sd = sd;
1419 break;
1da177e4
LT
1420 }
1421 }
1da177e4 1422
7897986b 1423 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1da177e4
LT
1424 goto out_set_cpu;
1425
1da177e4 1426 /*
7897986b 1427 * Check for affine wakeup and passive balancing possibilities.
1da177e4 1428 */
7897986b
NP
1429 if (this_sd) {
1430 int idx = this_sd->wake_idx;
1431 unsigned int imbalance;
1da177e4 1432
a3f21bce
NP
1433 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1434
7897986b
NP
1435 load = source_load(cpu, idx);
1436 this_load = target_load(this_cpu, idx);
1da177e4 1437
7897986b
NP
1438 new_cpu = this_cpu; /* Wake to this CPU if we can */
1439
a3f21bce
NP
1440 if (this_sd->flags & SD_WAKE_AFFINE) {
1441 unsigned long tl = this_load;
33859f7f
MOS
1442 unsigned long tl_per_task;
1443
1444 tl_per_task = cpu_avg_load_per_task(this_cpu);
2dd73a4f 1445
1da177e4 1446 /*
a3f21bce
NP
1447 * If sync wakeup then subtract the (maximum possible)
1448 * effect of the currently running task from the load
1449 * of the current CPU:
1da177e4 1450 */
a3f21bce 1451 if (sync)
dd41f596 1452 tl -= current->se.load.weight;
a3f21bce
NP
1453
1454 if ((tl <= load &&
2dd73a4f 1455 tl + target_load(cpu, idx) <= tl_per_task) ||
dd41f596 1456 100*(tl + p->se.load.weight) <= imbalance*load) {
a3f21bce
NP
1457 /*
1458 * This domain has SD_WAKE_AFFINE and
1459 * p is cache cold in this domain, and
1460 * there is no bad imbalance.
1461 */
1462 schedstat_inc(this_sd, ttwu_move_affine);
1463 goto out_set_cpu;
1464 }
1465 }
1466
1467 /*
1468 * Start passive balancing when half the imbalance_pct
1469 * limit is reached.
1470 */
1471 if (this_sd->flags & SD_WAKE_BALANCE) {
1472 if (imbalance*this_load <= 100*load) {
1473 schedstat_inc(this_sd, ttwu_move_balance);
1474 goto out_set_cpu;
1475 }
1da177e4
LT
1476 }
1477 }
1478
1479 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1480out_set_cpu:
1481 new_cpu = wake_idle(new_cpu, p);
1482 if (new_cpu != cpu) {
1483 set_task_cpu(p, new_cpu);
1484 task_rq_unlock(rq, &flags);
1485 /* might preempt at this point */
1486 rq = task_rq_lock(p, &flags);
1487 old_state = p->state;
1488 if (!(old_state & state))
1489 goto out;
dd41f596 1490 if (p->se.on_rq)
1da177e4
LT
1491 goto out_running;
1492
1493 this_cpu = smp_processor_id();
1494 cpu = task_cpu(p);
1495 }
1496
1497out_activate:
1498#endif /* CONFIG_SMP */
dd41f596 1499 activate_task(rq, p, 1);
1da177e4
LT
1500 /*
1501 * Sync wakeups (i.e. those types of wakeups where the waker
1502 * has indicated that it will leave the CPU in short order)
 1503 * don't trigger a preemption if the woken-up task will run on
1504 * this cpu. (in this case the 'I will reschedule' promise of
1505 * the waker guarantees that the freshly woken up task is going
1506 * to be considered on this CPU.)
1507 */
dd41f596
IM
1508 if (!sync || cpu != this_cpu)
1509 check_preempt_curr(rq, p);
1da177e4
LT
1510 success = 1;
1511
1512out_running:
1513 p->state = TASK_RUNNING;
1514out:
1515 task_rq_unlock(rq, &flags);
1516
1517 return success;
1518}
1519
36c8b586 1520int fastcall wake_up_process(struct task_struct *p)
1da177e4
LT
1521{
1522 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1523 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1524}
1da177e4
LT
1525EXPORT_SYMBOL(wake_up_process);
1526
36c8b586 1527int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1da177e4
LT
1528{
1529 return try_to_wake_up(p, state, 0);
1530}
1531
1da177e4
LT
1532/*
1533 * Perform scheduler related setup for a newly forked process p.
1534 * p is forked by current.
dd41f596
IM
1535 *
1536 * __sched_fork() is basic setup used by init_idle() too:
1537 */
1538static void __sched_fork(struct task_struct *p)
1539{
1540 p->se.wait_start_fair = 0;
1541 p->se.wait_start = 0;
1542 p->se.exec_start = 0;
1543 p->se.sum_exec_runtime = 0;
1544 p->se.delta_exec = 0;
1545 p->se.delta_fair_run = 0;
1546 p->se.delta_fair_sleep = 0;
1547 p->se.wait_runtime = 0;
1548 p->se.sum_wait_runtime = 0;
1549 p->se.sum_sleep_runtime = 0;
1550 p->se.sleep_start = 0;
1551 p->se.sleep_start_fair = 0;
1552 p->se.block_start = 0;
1553 p->se.sleep_max = 0;
1554 p->se.block_max = 0;
1555 p->se.exec_max = 0;
1556 p->se.wait_max = 0;
1557 p->se.wait_runtime_overruns = 0;
1558 p->se.wait_runtime_underruns = 0;
476d139c 1559
dd41f596
IM
1560 INIT_LIST_HEAD(&p->run_list);
1561 p->se.on_rq = 0;
476d139c 1562
1da177e4
LT
1563 /*
1564 * We mark the process as running here, but have not actually
1565 * inserted it onto the runqueue yet. This guarantees that
1566 * nobody will actually run it, and a signal or other external
1567 * event cannot wake it up and insert it on the runqueue either.
1568 */
1569 p->state = TASK_RUNNING;
dd41f596
IM
1570}
1571
1572/*
1573 * fork()/clone()-time setup:
1574 */
1575void sched_fork(struct task_struct *p, int clone_flags)
1576{
1577 int cpu = get_cpu();
1578
1579 __sched_fork(p);
1580
1581#ifdef CONFIG_SMP
1582 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1583#endif
1584 __set_task_cpu(p, cpu);
b29739f9
IM
1585
1586 /*
1587 * Make sure we do not leak PI boosting priority to the child:
1588 */
1589 p->prio = current->normal_prio;
1590
52f17b6c 1591#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
dd41f596 1592 if (likely(sched_info_on()))
52f17b6c 1593 memset(&p->sched_info, 0, sizeof(p->sched_info));
1da177e4 1594#endif
d6077cb8 1595#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
4866cde0
NP
1596 p->oncpu = 0;
1597#endif
1da177e4 1598#ifdef CONFIG_PREEMPT
4866cde0 1599 /* Want to start with kernel preemption disabled. */
a1261f54 1600 task_thread_info(p)->preempt_count = 1;
1da177e4 1601#endif
476d139c 1602 put_cpu();
1da177e4
LT
1603}
1604
dd41f596
IM
1605/*
1606 * After fork, child runs first. (default) If set to 0 then
1607 * parent will (try to) run first.
1608 */
1609unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
1610
1da177e4
LT
1611/*
1612 * wake_up_new_task - wake up a newly created task for the first time.
1613 *
1614 * This function will do some initial scheduler statistics housekeeping
1615 * that must be done for every newly created context, then puts the task
1616 * on the runqueue and wakes it.
1617 */
36c8b586 1618void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1da177e4
LT
1619{
1620 unsigned long flags;
dd41f596
IM
1621 struct rq *rq;
1622 int this_cpu;
1da177e4
LT
1623
1624 rq = task_rq_lock(p, &flags);
147cbb4b 1625 BUG_ON(p->state != TASK_RUNNING);
dd41f596 1626 this_cpu = smp_processor_id(); /* parent's CPU */
1da177e4
LT
1627
1628 p->prio = effective_prio(p);
1629
dd41f596
IM
1630 if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
1631 task_cpu(p) != this_cpu || !current->se.on_rq) {
1632 activate_task(rq, p, 0);
1da177e4 1633 } else {
1da177e4 1634 /*
dd41f596
IM
1635 * Let the scheduling class do new task startup
1636 * management (if any):
1da177e4 1637 */
dd41f596 1638 p->sched_class->task_new(rq, p);
1da177e4 1639 }
dd41f596
IM
1640 check_preempt_curr(rq, p);
1641 task_rq_unlock(rq, &flags);
1da177e4
LT
1642}
1643
4866cde0
NP
1644/**
1645 * prepare_task_switch - prepare to switch tasks
1646 * @rq: the runqueue preparing to switch
1647 * @next: the task we are going to switch to.
1648 *
1649 * This is called with the rq lock held and interrupts off. It must
1650 * be paired with a subsequent finish_task_switch after the context
1651 * switch.
1652 *
1653 * prepare_task_switch sets up locking and calls architecture specific
1654 * hooks.
1655 */
70b97a7f 1656static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
4866cde0
NP
1657{
1658 prepare_lock_switch(rq, next);
1659 prepare_arch_switch(next);
1660}
1661
1da177e4
LT
1662/**
1663 * finish_task_switch - clean up after a task-switch
344babaa 1664 * @rq: runqueue associated with task-switch
1da177e4
LT
1665 * @prev: the thread we just switched away from.
1666 *
4866cde0
NP
1667 * finish_task_switch must be called after the context switch, paired
1668 * with a prepare_task_switch call before the context switch.
1669 * finish_task_switch will reconcile locking set up by prepare_task_switch,
1670 * and do any other architecture-specific cleanup actions.
1da177e4
LT
1671 *
1672 * Note that we may have delayed dropping an mm in context_switch(). If
1673 * so, we finish that here outside of the runqueue lock. (Doing it
1674 * with the lock held can cause deadlocks; see schedule() for
1675 * details.)
1676 */
70b97a7f 1677static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
1da177e4
LT
1678 __releases(rq->lock)
1679{
1da177e4 1680 struct mm_struct *mm = rq->prev_mm;
55a101f8 1681 long prev_state;
1da177e4
LT
1682
1683 rq->prev_mm = NULL;
1684
1685 /*
1686 * A task struct has one reference for the use as "current".
c394cc9f 1687 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
55a101f8
ON
1688 * schedule one last time. The schedule call will never return, and
1689 * the scheduled task must drop that reference.
c394cc9f 1690 * The test for TASK_DEAD must occur while the runqueue locks are
1da177e4
LT
1691 * still held, otherwise prev could be scheduled on another cpu, die
1692 * there before we look at prev->state, and then the reference would
1693 * be dropped twice.
1694 * Manfred Spraul <manfred@colorfullife.com>
1695 */
55a101f8 1696 prev_state = prev->state;
4866cde0
NP
1697 finish_arch_switch(prev);
1698 finish_lock_switch(rq, prev);
1da177e4
LT
1699 if (mm)
1700 mmdrop(mm);
c394cc9f 1701 if (unlikely(prev_state == TASK_DEAD)) {
c6fd91f0 1702 /*
1703 * Remove function-return probe instances associated with this
1704 * task and put them back on the free list.
1705 */
1706 kprobe_flush_task(prev);
1da177e4 1707 put_task_struct(prev);
c6fd91f0 1708 }
1da177e4
LT
1709}
1710
1711/**
1712 * schedule_tail - first thing a freshly forked thread must call.
1713 * @prev: the thread we just switched away from.
1714 */
36c8b586 1715asmlinkage void schedule_tail(struct task_struct *prev)
1da177e4
LT
1716 __releases(rq->lock)
1717{
70b97a7f
IM
1718 struct rq *rq = this_rq();
1719
4866cde0
NP
1720 finish_task_switch(rq, prev);
1721#ifdef __ARCH_WANT_UNLOCKED_CTXSW
1722 /* In this case, finish_task_switch does not reenable preemption */
1723 preempt_enable();
1724#endif
1da177e4
LT
1725 if (current->set_child_tid)
1726 put_user(current->pid, current->set_child_tid);
1727}
1728
1729/*
1730 * context_switch - switch to the new MM and the new
1731 * thread's register state.
1732 */
dd41f596 1733static inline void
70b97a7f 1734context_switch(struct rq *rq, struct task_struct *prev,
36c8b586 1735 struct task_struct *next)
1da177e4 1736{
dd41f596 1737 struct mm_struct *mm, *oldmm;
1da177e4 1738
dd41f596
IM
1739 prepare_task_switch(rq, next);
1740 mm = next->mm;
1741 oldmm = prev->active_mm;
9226d125
ZA
1742 /*
1743 * For paravirt, this is coupled with an exit in switch_to to
1744 * combine the page table reload and the switch backend into
1745 * one hypercall.
1746 */
1747 arch_enter_lazy_cpu_mode();
1748
dd41f596 1749 if (unlikely(!mm)) {
1da177e4
LT
1750 next->active_mm = oldmm;
1751 atomic_inc(&oldmm->mm_count);
1752 enter_lazy_tlb(oldmm, next);
1753 } else
1754 switch_mm(oldmm, mm, next);
1755
dd41f596 1756 if (unlikely(!prev->mm)) {
1da177e4 1757 prev->active_mm = NULL;
1da177e4
LT
1758 rq->prev_mm = oldmm;
1759 }
3a5f5e48
IM
1760 /*
 1761 * The runqueue lock will be released by the next
 1762 * task (which is an invalid locking op, but in the case
 1763 * of the scheduler it's an obvious special-case), so we
 1764 * do an early lockdep release here:
1765 */
1766#ifndef __ARCH_WANT_UNLOCKED_CTXSW
8a25d5de 1767 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
3a5f5e48 1768#endif
1da177e4
LT
1769
1770 /* Here we just switch the register state and the stack. */
1771 switch_to(prev, next, prev);
1772
dd41f596
IM
1773 barrier();
1774 /*
1775 * this_rq must be evaluated again because prev may have moved
1776 * CPUs since it called schedule(), thus the 'rq' on its stack
1777 * frame will be invalid.
1778 */
1779 finish_task_switch(this_rq(), prev);
1da177e4
LT
1780}
1781
1782/*
1783 * nr_running, nr_uninterruptible and nr_context_switches:
1784 *
1785 * externally visible scheduler statistics: current number of runnable
1786 * threads, current number of uninterruptible-sleeping threads, total
1787 * number of context switches performed since bootup.
1788 */
1789unsigned long nr_running(void)
1790{
1791 unsigned long i, sum = 0;
1792
1793 for_each_online_cpu(i)
1794 sum += cpu_rq(i)->nr_running;
1795
1796 return sum;
1797}
1798
1799unsigned long nr_uninterruptible(void)
1800{
1801 unsigned long i, sum = 0;
1802
0a945022 1803 for_each_possible_cpu(i)
1da177e4
LT
1804 sum += cpu_rq(i)->nr_uninterruptible;
1805
1806 /*
1807 * Since we read the counters lockless, it might be slightly
1808 * inaccurate. Do not allow it to go below zero though:
1809 */
1810 if (unlikely((long)sum < 0))
1811 sum = 0;
1812
1813 return sum;
1814}
1815
1816unsigned long long nr_context_switches(void)
1817{
cc94abfc
SR
1818 int i;
1819 unsigned long long sum = 0;
1da177e4 1820
0a945022 1821 for_each_possible_cpu(i)
1da177e4
LT
1822 sum += cpu_rq(i)->nr_switches;
1823
1824 return sum;
1825}
1826
1827unsigned long nr_iowait(void)
1828{
1829 unsigned long i, sum = 0;
1830
0a945022 1831 for_each_possible_cpu(i)
1da177e4
LT
1832 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1833
1834 return sum;
1835}
1836
db1b1fef
JS
1837unsigned long nr_active(void)
1838{
1839 unsigned long i, running = 0, uninterruptible = 0;
1840
1841 for_each_online_cpu(i) {
1842 running += cpu_rq(i)->nr_running;
1843 uninterruptible += cpu_rq(i)->nr_uninterruptible;
1844 }
1845
1846 if (unlikely((long)uninterruptible < 0))
1847 uninterruptible = 0;
1848
1849 return running + uninterruptible;
1850}
1851
48f24c4d 1852/*
dd41f596
IM
1853 * Update rq->cpu_load[] statistics. This function is usually called every
1854 * scheduler tick (TICK_NSEC).
48f24c4d 1855 */
dd41f596 1856static void update_cpu_load(struct rq *this_rq)
48f24c4d 1857{
dd41f596
IM
1858 u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64;
1859 unsigned long total_load = this_rq->ls.load.weight;
1860 unsigned long this_load = total_load;
1861 struct load_stat *ls = &this_rq->ls;
1862 u64 now = __rq_clock(this_rq);
1863 int i, scale;
1864
1865 this_rq->nr_load_updates++;
1866 if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
1867 goto do_avg;
1868
1869 /* Update delta_fair/delta_exec fields first */
1870 update_curr_load(this_rq, now);
1871
1872 fair_delta64 = ls->delta_fair + 1;
1873 ls->delta_fair = 0;
1874
1875 exec_delta64 = ls->delta_exec + 1;
1876 ls->delta_exec = 0;
1877
1878 sample_interval64 = now - ls->load_update_last;
1879 ls->load_update_last = now;
1880
1881 if ((s64)sample_interval64 < (s64)TICK_NSEC)
1882 sample_interval64 = TICK_NSEC;
1883
1884 if (exec_delta64 > sample_interval64)
1885 exec_delta64 = sample_interval64;
1886
1887 idle_delta64 = sample_interval64 - exec_delta64;
1888
1889 tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
1890 tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
1891
1892 this_load = (unsigned long)tmp64;
1893
1894do_avg:
1895
1896 /* Update our load: */
1897 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
1898 unsigned long old_load, new_load;
1899
1900 /* scale is effectively 1 << i now, and >> i divides by scale */
1901
1902 old_load = this_rq->cpu_load[i];
1903 new_load = this_load;
1904
1905 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
1906 }
48f24c4d
IM
1907}
1908
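/*
 * Illustrative sketch (not part of this file): the decay applied by the
 * loop in update_cpu_load() above.  For index i the new sample gets
 * weight 1/2^i and the old average keeps weight (2^i - 1)/2^i, so higher
 * indices react more slowly.  Worked example with old_load = 1024 and
 * new_load = 0:
 *
 *   i = 0:  (1024*0 + 0) >> 0 = 0    (tracks the sample instantly)
 *   i = 1:  (1024*1 + 0) >> 1 = 512
 *   i = 2:  (1024*3 + 0) >> 2 = 768
 *   i = 3:  (1024*7 + 0) >> 3 = 896
 */
static unsigned long decay_cpu_load_sketch(unsigned long old_load,
					   unsigned long new_load, int i)
{
	unsigned long scale = 1UL << i;		/* same scaling as the loop */

	return (old_load * (scale - 1) + new_load) >> i;
}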
dd41f596
IM
1909#ifdef CONFIG_SMP
1910
1da177e4
LT
1911/*
1912 * double_rq_lock - safely lock two runqueues
1913 *
1914 * Note this does not disable interrupts like task_rq_lock,
1915 * you need to do so manually before calling.
1916 */
70b97a7f 1917static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1da177e4
LT
1918 __acquires(rq1->lock)
1919 __acquires(rq2->lock)
1920{
054b9108 1921 BUG_ON(!irqs_disabled());
1da177e4
LT
1922 if (rq1 == rq2) {
1923 spin_lock(&rq1->lock);
1924 __acquire(rq2->lock); /* Fake it out ;) */
1925 } else {
c96d145e 1926 if (rq1 < rq2) {
1da177e4
LT
1927 spin_lock(&rq1->lock);
1928 spin_lock(&rq2->lock);
1929 } else {
1930 spin_lock(&rq2->lock);
1931 spin_lock(&rq1->lock);
1932 }
1933 }
1934}
1935
1936/*
1937 * double_rq_unlock - safely unlock two runqueues
1938 *
1939 * Note this does not restore interrupts like task_rq_unlock,
1940 * you need to do so manually after calling.
1941 */
70b97a7f 1942static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1da177e4
LT
1943 __releases(rq1->lock)
1944 __releases(rq2->lock)
1945{
1946 spin_unlock(&rq1->lock);
1947 if (rq1 != rq2)
1948 spin_unlock(&rq2->lock);
1949 else
1950 __release(rq2->lock);
1951}
1952
1953/*
1954 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1955 */
70b97a7f 1956static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
1da177e4
LT
1957 __releases(this_rq->lock)
1958 __acquires(busiest->lock)
1959 __acquires(this_rq->lock)
1960{
054b9108
KK
1961 if (unlikely(!irqs_disabled())) {
 1962		/* printk() doesn't work well under rq->lock */
1963 spin_unlock(&this_rq->lock);
1964 BUG_ON(1);
1965 }
1da177e4 1966 if (unlikely(!spin_trylock(&busiest->lock))) {
c96d145e 1967 if (busiest < this_rq) {
1da177e4
LT
1968 spin_unlock(&this_rq->lock);
1969 spin_lock(&busiest->lock);
1970 spin_lock(&this_rq->lock);
1971 } else
1972 spin_lock(&busiest->lock);
1973 }
1974}
1975
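/*
 * Illustrative sketch (not part of this file): the deadlock-avoidance
 * rule used by double_rq_lock()/double_lock_balance() above, applied to
 * an arbitrary pair of spinlocks.  Whichever lock has the lower address
 * is always taken first, so two CPUs locking the same pair can never end
 * up each holding one lock while waiting for the other.
 */
static void lock_pair_sketch(spinlock_t *a, spinlock_t *b)
{
	if (a == b) {
		spin_lock(a);
	} else if (a < b) {
		spin_lock(a);
		spin_lock(b);
	} else {
		spin_lock(b);
		spin_lock(a);
	}
}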
1da177e4
LT
1976/*
1977 * If dest_cpu is allowed for this process, migrate the task to it.
1978 * This is accomplished by forcing the cpu_allowed mask to only
 1979 * allow dest_cpu, which will force the task onto dest_cpu. Then
1980 * the cpu_allowed mask is restored.
1981 */
36c8b586 1982static void sched_migrate_task(struct task_struct *p, int dest_cpu)
1da177e4 1983{
70b97a7f 1984 struct migration_req req;
1da177e4 1985 unsigned long flags;
70b97a7f 1986 struct rq *rq;
1da177e4
LT
1987
1988 rq = task_rq_lock(p, &flags);
1989 if (!cpu_isset(dest_cpu, p->cpus_allowed)
1990 || unlikely(cpu_is_offline(dest_cpu)))
1991 goto out;
1992
1993 /* force the process onto the specified CPU */
1994 if (migrate_task(p, dest_cpu, &req)) {
1995 /* Need to wait for migration thread (might exit: take ref). */
1996 struct task_struct *mt = rq->migration_thread;
36c8b586 1997
1da177e4
LT
1998 get_task_struct(mt);
1999 task_rq_unlock(rq, &flags);
2000 wake_up_process(mt);
2001 put_task_struct(mt);
2002 wait_for_completion(&req.done);
36c8b586 2003
1da177e4
LT
2004 return;
2005 }
2006out:
2007 task_rq_unlock(rq, &flags);
2008}
2009
2010/*
476d139c
NP
2011 * sched_exec - execve() is a valuable balancing opportunity, because at
2012 * this point the task has the smallest effective memory and cache footprint.
1da177e4
LT
2013 */
2014void sched_exec(void)
2015{
1da177e4 2016 int new_cpu, this_cpu = get_cpu();
476d139c 2017 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
1da177e4 2018 put_cpu();
476d139c
NP
2019 if (new_cpu != this_cpu)
2020 sched_migrate_task(current, new_cpu);
1da177e4
LT
2021}
2022
2023/*
2024 * pull_task - move a task from a remote runqueue to the local runqueue.
2025 * Both runqueues must be locked.
2026 */
dd41f596
IM
2027static void pull_task(struct rq *src_rq, struct task_struct *p,
2028 struct rq *this_rq, int this_cpu)
1da177e4 2029{
dd41f596 2030 deactivate_task(src_rq, p, 0);
1da177e4 2031 set_task_cpu(p, this_cpu);
dd41f596 2032 activate_task(this_rq, p, 0);
1da177e4
LT
2033 /*
 2034	 * Note that idle threads have a prio of MAX_PRIO, so this test
 2035	 * is always true for them.
2036 */
dd41f596 2037 check_preempt_curr(this_rq, p);
1da177e4
LT
2038}
2039
2040/*
2041 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2042 */
858119e1 2043static
70b97a7f 2044int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
d15bcfdb 2045 struct sched_domain *sd, enum cpu_idle_type idle,
95cdf3b7 2046 int *all_pinned)
1da177e4
LT
2047{
2048 /*
2049 * We do not migrate tasks that are:
2050 * 1) running (obviously), or
2051 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2052 * 3) are cache-hot on their current CPU.
2053 */
1da177e4
LT
2054 if (!cpu_isset(this_cpu, p->cpus_allowed))
2055 return 0;
81026794
NP
2056 *all_pinned = 0;
2057
2058 if (task_running(rq, p))
2059 return 0;
1da177e4
LT
2060
2061 /*
dd41f596 2062 * Aggressive migration if too many balance attempts have failed:
1da177e4 2063 */
dd41f596 2064 if (sd->nr_balance_failed > sd->cache_nice_tries)
1da177e4
LT
2065 return 1;
2066
1da177e4
LT
2067 return 1;
2068}
2069
dd41f596 2070static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2dd73a4f 2071 unsigned long max_nr_move, unsigned long max_load_move,
d15bcfdb 2072 struct sched_domain *sd, enum cpu_idle_type idle,
dd41f596
IM
2073 int *all_pinned, unsigned long *load_moved,
2074 int this_best_prio, int best_prio, int best_prio_seen,
2075 struct rq_iterator *iterator)
1da177e4 2076{
dd41f596
IM
2077 int pulled = 0, pinned = 0, skip_for_load;
2078 struct task_struct *p;
2079 long rem_load_move = max_load_move;
1da177e4 2080
2dd73a4f 2081 if (max_nr_move == 0 || max_load_move == 0)
1da177e4
LT
2082 goto out;
2083
81026794
NP
2084 pinned = 1;
2085
1da177e4 2086 /*
dd41f596 2087 * Start the load-balancing iterator:
1da177e4 2088 */
dd41f596
IM
2089 p = iterator->start(iterator->arg);
2090next:
2091 if (!p)
1da177e4 2092 goto out;
50ddd969
PW
2093 /*
 2094	 * To help distribute high-priority tasks across CPUs we don't
 2095	 * skip a task if it will be the highest-priority task (i.e. smallest
 2096	 * prio value) on its new queue, regardless of its load weight.
2097 */
dd41f596
IM
2098 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2099 SCHED_LOAD_SCALE_FUZZ;
2100 if (skip_for_load && p->prio < this_best_prio)
2101 skip_for_load = !best_prio_seen && p->prio == best_prio;
615052dc 2102 if (skip_for_load ||
dd41f596 2103 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
48f24c4d 2104
dd41f596
IM
2105 best_prio_seen |= p->prio == best_prio;
2106 p = iterator->next(iterator->arg);
2107 goto next;
1da177e4
LT
2108 }
2109
dd41f596 2110 pull_task(busiest, p, this_rq, this_cpu);
1da177e4 2111 pulled++;
dd41f596 2112 rem_load_move -= p->se.load.weight;
1da177e4 2113
2dd73a4f
PW
2114 /*
2115 * We only want to steal up to the prescribed number of tasks
2116 * and the prescribed amount of weighted load.
2117 */
2118 if (pulled < max_nr_move && rem_load_move > 0) {
dd41f596
IM
2119 if (p->prio < this_best_prio)
2120 this_best_prio = p->prio;
2121 p = iterator->next(iterator->arg);
2122 goto next;
1da177e4
LT
2123 }
2124out:
2125 /*
2126 * Right now, this is the only place pull_task() is called,
2127 * so we can safely collect pull_task() stats here rather than
2128 * inside pull_task().
2129 */
2130 schedstat_add(sd, lb_gained[idle], pulled);
81026794
NP
2131
2132 if (all_pinned)
2133 *all_pinned = pinned;
dd41f596 2134 *load_moved = max_load_move - rem_load_move;
1da177e4
LT
2135 return pulled;
2136}
2137
dd41f596
IM
2138/*
2139 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2140 * load from busiest to this_rq, as part of a balancing operation within
2141 * "domain". Returns the number of tasks moved.
2142 *
2143 * Called with both runqueues locked.
2144 */
2145static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2146 unsigned long max_nr_move, unsigned long max_load_move,
2147 struct sched_domain *sd, enum cpu_idle_type idle,
2148 int *all_pinned)
2149{
2150 struct sched_class *class = sched_class_highest;
2151 unsigned long load_moved, total_nr_moved = 0, nr_moved;
2152 long rem_load_move = max_load_move;
2153
2154 do {
2155 nr_moved = class->load_balance(this_rq, this_cpu, busiest,
2156 max_nr_move, (unsigned long)rem_load_move,
2157 sd, idle, all_pinned, &load_moved);
2158 total_nr_moved += nr_moved;
2159 max_nr_move -= nr_moved;
2160 rem_load_move -= load_moved;
2161 class = class->next;
2162 } while (class && max_nr_move && rem_load_move > 0);
2163
2164 return total_nr_moved;
2165}
2166
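/*
 * Illustrative sketch (not part of this file): the shape of the loop in
 * move_tasks() above.  Each scheduling class is asked to move load until
 * either the task budget or the weighted-load budget runs out; the class
 * list is walked from the highest-priority class downwards.  sketch_class
 * and its load_balance hook are hypothetical stand-ins for
 * struct sched_class.
 */
struct sketch_class {
	struct sketch_class *next;
	int (*load_balance)(unsigned long max_nr, unsigned long max_load,
			    unsigned long *moved);
};

static int drain_budgets_sketch(struct sketch_class *class,
				unsigned long max_nr, unsigned long max_load)
{
	long rem_load = max_load;
	unsigned long moved;
	int total = 0, nr;

	while (class && max_nr && rem_load > 0) {
		nr = class->load_balance(max_nr, rem_load, &moved);
		total += nr;
		max_nr -= nr;
		rem_load -= moved;
		class = class->next;
	}
	return total;
}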
1da177e4
LT
2167/*
2168 * find_busiest_group finds and returns the busiest CPU group within the
48f24c4d
IM
2169 * domain. It calculates and returns the amount of weighted load which
2170 * should be moved to restore balance via the imbalance parameter.
1da177e4
LT
2171 */
2172static struct sched_group *
2173find_busiest_group(struct sched_domain *sd, int this_cpu,
dd41f596
IM
2174 unsigned long *imbalance, enum cpu_idle_type idle,
2175 int *sd_idle, cpumask_t *cpus, int *balance)
1da177e4
LT
2176{
2177 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2178 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
0c117f1b 2179 unsigned long max_pull;
2dd73a4f
PW
2180 unsigned long busiest_load_per_task, busiest_nr_running;
2181 unsigned long this_load_per_task, this_nr_running;
7897986b 2182 int load_idx;
5c45bf27
SS
2183#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2184 int power_savings_balance = 1;
2185 unsigned long leader_nr_running = 0, min_load_per_task = 0;
2186 unsigned long min_nr_running = ULONG_MAX;
2187 struct sched_group *group_min = NULL, *group_leader = NULL;
2188#endif
1da177e4
LT
2189
2190 max_load = this_load = total_load = total_pwr = 0;
2dd73a4f
PW
2191 busiest_load_per_task = busiest_nr_running = 0;
2192 this_load_per_task = this_nr_running = 0;
d15bcfdb 2193 if (idle == CPU_NOT_IDLE)
7897986b 2194 load_idx = sd->busy_idx;
d15bcfdb 2195 else if (idle == CPU_NEWLY_IDLE)
7897986b
NP
2196 load_idx = sd->newidle_idx;
2197 else
2198 load_idx = sd->idle_idx;
1da177e4
LT
2199
2200 do {
5c45bf27 2201 unsigned long load, group_capacity;
1da177e4
LT
2202 int local_group;
2203 int i;
783609c6 2204 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2dd73a4f 2205 unsigned long sum_nr_running, sum_weighted_load;
1da177e4
LT
2206
2207 local_group = cpu_isset(this_cpu, group->cpumask);
2208
783609c6
SS
2209 if (local_group)
2210 balance_cpu = first_cpu(group->cpumask);
2211
1da177e4 2212 /* Tally up the load of all CPUs in the group */
2dd73a4f 2213 sum_weighted_load = sum_nr_running = avg_load = 0;
1da177e4
LT
2214
2215 for_each_cpu_mask(i, group->cpumask) {
0a2966b4
CL
2216 struct rq *rq;
2217
2218 if (!cpu_isset(i, *cpus))
2219 continue;
2220
2221 rq = cpu_rq(i);
2dd73a4f 2222
5969fe06
NP
2223 if (*sd_idle && !idle_cpu(i))
2224 *sd_idle = 0;
2225
1da177e4 2226 /* Bias balancing toward cpus of our domain */
783609c6
SS
2227 if (local_group) {
2228 if (idle_cpu(i) && !first_idle_cpu) {
2229 first_idle_cpu = 1;
2230 balance_cpu = i;
2231 }
2232
a2000572 2233 load = target_load(i, load_idx);
783609c6 2234 } else
a2000572 2235 load = source_load(i, load_idx);
1da177e4
LT
2236
2237 avg_load += load;
2dd73a4f 2238 sum_nr_running += rq->nr_running;
dd41f596 2239 sum_weighted_load += weighted_cpuload(i);
1da177e4
LT
2240 }
2241
783609c6
SS
2242 /*
 2243		 * The first idle cpu, or the first cpu (busiest) in this sched group,
 2244		 * is eligible for doing load balancing at this and higher
 2245		 * domains.
2246 */
2247 if (local_group && balance_cpu != this_cpu && balance) {
2248 *balance = 0;
2249 goto ret;
2250 }
2251
1da177e4 2252 total_load += avg_load;
5517d86b 2253 total_pwr += group->__cpu_power;
1da177e4
LT
2254
2255 /* Adjust by relative CPU power of the group */
5517d86b
ED
2256 avg_load = sg_div_cpu_power(group,
2257 avg_load * SCHED_LOAD_SCALE);
1da177e4 2258
5517d86b 2259 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
5c45bf27 2260
1da177e4
LT
2261 if (local_group) {
2262 this_load = avg_load;
2263 this = group;
2dd73a4f
PW
2264 this_nr_running = sum_nr_running;
2265 this_load_per_task = sum_weighted_load;
2266 } else if (avg_load > max_load &&
5c45bf27 2267 sum_nr_running > group_capacity) {
1da177e4
LT
2268 max_load = avg_load;
2269 busiest = group;
2dd73a4f
PW
2270 busiest_nr_running = sum_nr_running;
2271 busiest_load_per_task = sum_weighted_load;
1da177e4 2272 }
5c45bf27
SS
2273
2274#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2275 /*
2276 * Busy processors will not participate in power savings
2277 * balance.
2278 */
dd41f596
IM
2279 if (idle == CPU_NOT_IDLE ||
2280 !(sd->flags & SD_POWERSAVINGS_BALANCE))
2281 goto group_next;
5c45bf27
SS
2282
2283 /*
 2285		 * If the local group is idle or completely loaded, there is
 2286		 * no need to do power savings balancing at this domain.
2286 */
2287 if (local_group && (this_nr_running >= group_capacity ||
2288 !this_nr_running))
2289 power_savings_balance = 0;
2290
dd41f596 2291 /*
5c45bf27
SS
2292 * If a group is already running at full capacity or idle,
2293 * don't include that group in power savings calculations
dd41f596
IM
2294 */
2295 if (!power_savings_balance || sum_nr_running >= group_capacity
5c45bf27 2296 || !sum_nr_running)
dd41f596 2297 goto group_next;
5c45bf27 2298
dd41f596 2299 /*
5c45bf27 2300 * Calculate the group which has the least non-idle load.
dd41f596
IM
 2301		 * This is the group from which we need to pick up load
 2302		 * to save power.
2303 */
2304 if ((sum_nr_running < min_nr_running) ||
2305 (sum_nr_running == min_nr_running &&
5c45bf27
SS
2306 first_cpu(group->cpumask) <
2307 first_cpu(group_min->cpumask))) {
dd41f596
IM
2308 group_min = group;
2309 min_nr_running = sum_nr_running;
5c45bf27
SS
2310 min_load_per_task = sum_weighted_load /
2311 sum_nr_running;
dd41f596 2312 }
5c45bf27 2313
dd41f596 2314 /*
5c45bf27 2315		 * Calculate the group which is nearly at its
dd41f596
IM
 2316		 * capacity but still has some room to pick up load
 2317		 * from another group and save more power.
2318 */
2319 if (sum_nr_running <= group_capacity - 1) {
2320 if (sum_nr_running > leader_nr_running ||
2321 (sum_nr_running == leader_nr_running &&
2322 first_cpu(group->cpumask) >
2323 first_cpu(group_leader->cpumask))) {
2324 group_leader = group;
2325 leader_nr_running = sum_nr_running;
2326 }
48f24c4d 2327 }
5c45bf27
SS
2328group_next:
2329#endif
1da177e4
LT
2330 group = group->next;
2331 } while (group != sd->groups);
2332
2dd73a4f 2333 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
1da177e4
LT
2334 goto out_balanced;
2335
2336 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
2337
2338 if (this_load >= avg_load ||
2339 100*max_load <= sd->imbalance_pct*this_load)
2340 goto out_balanced;
2341
2dd73a4f 2342 busiest_load_per_task /= busiest_nr_running;
1da177e4
LT
2343 /*
2344 * We're trying to get all the cpus to the average_load, so we don't
2345 * want to push ourselves above the average load, nor do we wish to
2346 * reduce the max loaded cpu below the average load, as either of these
2347 * actions would just result in more rebalancing later, and ping-pong
2348 * tasks around. Thus we look for the minimum possible imbalance.
2349 * Negative imbalances (*we* are more loaded than anyone else) will
2350 * be counted as no imbalance for these purposes -- we can't fix that
2351 * by pulling tasks to us. Be careful of negative numbers as they'll
2352 * appear as very large values with unsigned longs.
2353 */
2dd73a4f
PW
2354 if (max_load <= busiest_load_per_task)
2355 goto out_balanced;
2356
2357 /*
2358 * In the presence of smp nice balancing, certain scenarios can have
2359 * max load less than avg load(as we skip the groups at or below
2360 * its cpu_power, while calculating max_load..)
2361 */
2362 if (max_load < avg_load) {
2363 *imbalance = 0;
2364 goto small_imbalance;
2365 }
0c117f1b
SS
2366
2367 /* Don't want to pull so many tasks that a group would go idle */
2dd73a4f 2368 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
0c117f1b 2369
1da177e4 2370 /* How much load to actually move to equalise the imbalance */
5517d86b
ED
2371 *imbalance = min(max_pull * busiest->__cpu_power,
2372 (avg_load - this_load) * this->__cpu_power)
1da177e4
LT
2373 / SCHED_LOAD_SCALE;
2374
2dd73a4f
PW
2375 /*
 2376	 * If *imbalance is less than the average load per runnable task,
 2377	 * there is no guarantee that any tasks will be moved, so we consider
 2378	 * bumping its value to force at least one task to be
 2379	 * moved.
2380 */
dd41f596 2381 if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
48f24c4d 2382 unsigned long tmp, pwr_now, pwr_move;
2dd73a4f
PW
2383 unsigned int imbn;
2384
2385small_imbalance:
2386 pwr_move = pwr_now = 0;
2387 imbn = 2;
2388 if (this_nr_running) {
2389 this_load_per_task /= this_nr_running;
2390 if (busiest_load_per_task > this_load_per_task)
2391 imbn = 1;
2392 } else
2393 this_load_per_task = SCHED_LOAD_SCALE;
1da177e4 2394
dd41f596
IM
2395 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
2396 busiest_load_per_task * imbn) {
2dd73a4f 2397 *imbalance = busiest_load_per_task;
1da177e4
LT
2398 return busiest;
2399 }
2400
2401 /*
2402 * OK, we don't have enough imbalance to justify moving tasks,
2403 * however we may be able to increase total CPU power used by
2404 * moving them.
2405 */
2406
5517d86b
ED
2407 pwr_now += busiest->__cpu_power *
2408 min(busiest_load_per_task, max_load);
2409 pwr_now += this->__cpu_power *
2410 min(this_load_per_task, this_load);
1da177e4
LT
2411 pwr_now /= SCHED_LOAD_SCALE;
2412
2413 /* Amount of load we'd subtract */
5517d86b
ED
2414 tmp = sg_div_cpu_power(busiest,
2415 busiest_load_per_task * SCHED_LOAD_SCALE);
1da177e4 2416 if (max_load > tmp)
5517d86b 2417 pwr_move += busiest->__cpu_power *
2dd73a4f 2418 min(busiest_load_per_task, max_load - tmp);
1da177e4
LT
2419
2420 /* Amount of load we'd add */
5517d86b 2421 if (max_load * busiest->__cpu_power <
33859f7f 2422 busiest_load_per_task * SCHED_LOAD_SCALE)
5517d86b
ED
2423 tmp = sg_div_cpu_power(this,
2424 max_load * busiest->__cpu_power);
1da177e4 2425 else
5517d86b
ED
2426 tmp = sg_div_cpu_power(this,
2427 busiest_load_per_task * SCHED_LOAD_SCALE);
2428 pwr_move += this->__cpu_power *
2429 min(this_load_per_task, this_load + tmp);
1da177e4
LT
2430 pwr_move /= SCHED_LOAD_SCALE;
2431
2432 /* Move if we gain throughput */
2433 if (pwr_move <= pwr_now)
2434 goto out_balanced;
2435
2dd73a4f 2436 *imbalance = busiest_load_per_task;
1da177e4
LT
2437 }
2438
1da177e4
LT
2439 return busiest;
2440
2441out_balanced:
5c45bf27 2442#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
d15bcfdb 2443 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
5c45bf27 2444 goto ret;
1da177e4 2445
5c45bf27
SS
2446 if (this == group_leader && group_leader != group_min) {
2447 *imbalance = min_load_per_task;
2448 return group_min;
2449 }
5c45bf27 2450#endif
783609c6 2451ret:
1da177e4
LT
2452 *imbalance = 0;
2453 return NULL;
2454}
2455
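/*
 * Illustrative sketch (not part of this file): the imbalance computation
 * performed by find_busiest_group() above, with hypothetical numbers.
 * Assuming SCHED_LOAD_SCALE = 1024, both group powers = 1024,
 * max_load = 3072, avg_load = 2048, this_load = 1024 and
 * busiest_load_per_task = 1024:
 *
 *   max_pull   = min(3072 - 2048, 3072 - 1024)           = 1024
 *   *imbalance = min(1024*1024, (2048-1024)*1024) / 1024 = 1024
 *
 * i.e. roughly one nice-0 task's worth of weighted load gets pulled.
 */
static unsigned long imbalance_sketch(unsigned long max_load,
				      unsigned long avg_load,
				      unsigned long this_load,
				      unsigned long busiest_load_per_task,
				      unsigned long busiest_power,
				      unsigned long this_power)
{
	unsigned long max_pull = min(max_load - avg_load,
				     max_load - busiest_load_per_task);

	return min(max_pull * busiest_power,
		   (avg_load - this_load) * this_power) / SCHED_LOAD_SCALE;
}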
2456/*
2457 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2458 */
70b97a7f 2459static struct rq *
d15bcfdb 2460find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
0a2966b4 2461 unsigned long imbalance, cpumask_t *cpus)
1da177e4 2462{
70b97a7f 2463 struct rq *busiest = NULL, *rq;
2dd73a4f 2464 unsigned long max_load = 0;
1da177e4
LT
2465 int i;
2466
2467 for_each_cpu_mask(i, group->cpumask) {
dd41f596 2468 unsigned long wl;
0a2966b4
CL
2469
2470 if (!cpu_isset(i, *cpus))
2471 continue;
2472
48f24c4d 2473 rq = cpu_rq(i);
dd41f596 2474 wl = weighted_cpuload(i);
2dd73a4f 2475
dd41f596 2476 if (rq->nr_running == 1 && wl > imbalance)
2dd73a4f 2477 continue;
1da177e4 2478
dd41f596
IM
2479 if (wl > max_load) {
2480 max_load = wl;
48f24c4d 2481 busiest = rq;
1da177e4
LT
2482 }
2483 }
2484
2485 return busiest;
2486}
2487
77391d71
NP
2488/*
 2489 * Max backoff if we encounter pinned tasks. Pretty arbitrary value; the
 2490 * exact number doesn't matter so long as it is large enough.
2491 */
2492#define MAX_PINNED_INTERVAL 512
2493
48f24c4d
IM
2494static inline unsigned long minus_1_or_zero(unsigned long n)
2495{
2496 return n > 0 ? n - 1 : 0;
2497}
2498
1da177e4
LT
2499/*
2500 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2501 * tasks if there is an imbalance.
1da177e4 2502 */
70b97a7f 2503static int load_balance(int this_cpu, struct rq *this_rq,
d15bcfdb 2504 struct sched_domain *sd, enum cpu_idle_type idle,
783609c6 2505 int *balance)
1da177e4 2506{
48f24c4d 2507 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
1da177e4 2508 struct sched_group *group;
1da177e4 2509 unsigned long imbalance;
70b97a7f 2510 struct rq *busiest;
0a2966b4 2511 cpumask_t cpus = CPU_MASK_ALL;
fe2eea3f 2512 unsigned long flags;
5969fe06 2513
89c4710e
SS
2514 /*
2515 * When power savings policy is enabled for the parent domain, idle
2516 * sibling can pick up load irrespective of busy siblings. In this case,
dd41f596 2517 * let the state of idle sibling percolate up as CPU_IDLE, instead of
d15bcfdb 2518 * portraying it as CPU_NOT_IDLE.
89c4710e 2519 */
d15bcfdb 2520 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
89c4710e 2521 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
5969fe06 2522 sd_idle = 1;
1da177e4 2523
1da177e4
LT
2524 schedstat_inc(sd, lb_cnt[idle]);
2525
0a2966b4
CL
2526redo:
2527 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
783609c6
SS
2528 &cpus, balance);
2529
06066714 2530 if (*balance == 0)
783609c6 2531 goto out_balanced;
783609c6 2532
1da177e4
LT
2533 if (!group) {
2534 schedstat_inc(sd, lb_nobusyg[idle]);
2535 goto out_balanced;
2536 }
2537
0a2966b4 2538 busiest = find_busiest_queue(group, idle, imbalance, &cpus);
1da177e4
LT
2539 if (!busiest) {
2540 schedstat_inc(sd, lb_nobusyq[idle]);
2541 goto out_balanced;
2542 }
2543
db935dbd 2544 BUG_ON(busiest == this_rq);
1da177e4
LT
2545
2546 schedstat_add(sd, lb_imbalance[idle], imbalance);
2547
2548 nr_moved = 0;
2549 if (busiest->nr_running > 1) {
2550 /*
2551 * Attempt to move tasks. If find_busiest_group has found
2552 * an imbalance but busiest->nr_running <= 1, the group is
2553 * still unbalanced. nr_moved simply stays zero, so it is
2554 * correctly treated as an imbalance.
2555 */
fe2eea3f 2556 local_irq_save(flags);
e17224bf 2557 double_rq_lock(this_rq, busiest);
1da177e4 2558 nr_moved = move_tasks(this_rq, this_cpu, busiest,
48f24c4d
IM
2559 minus_1_or_zero(busiest->nr_running),
2560 imbalance, sd, idle, &all_pinned);
e17224bf 2561 double_rq_unlock(this_rq, busiest);
fe2eea3f 2562 local_irq_restore(flags);
81026794 2563
46cb4b7c
SS
2564 /*
2565 * some other cpu did the load balance for us.
2566 */
2567 if (nr_moved && this_cpu != smp_processor_id())
2568 resched_cpu(this_cpu);
2569
81026794 2570 /* All tasks on this runqueue were pinned by CPU affinity */
0a2966b4
CL
2571 if (unlikely(all_pinned)) {
2572 cpu_clear(cpu_of(busiest), cpus);
2573 if (!cpus_empty(cpus))
2574 goto redo;
81026794 2575 goto out_balanced;
0a2966b4 2576 }
1da177e4 2577 }
81026794 2578
1da177e4
LT
2579 if (!nr_moved) {
2580 schedstat_inc(sd, lb_failed[idle]);
2581 sd->nr_balance_failed++;
2582
2583 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
1da177e4 2584
fe2eea3f 2585 spin_lock_irqsave(&busiest->lock, flags);
fa3b6ddc
SS
2586
 2587			/* don't kick the migration_thread if the curr
 2588			 * task on the busiest cpu can't be moved to this_cpu
2589 */
2590 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
fe2eea3f 2591 spin_unlock_irqrestore(&busiest->lock, flags);
fa3b6ddc
SS
2592 all_pinned = 1;
2593 goto out_one_pinned;
2594 }
2595
1da177e4
LT
2596 if (!busiest->active_balance) {
2597 busiest->active_balance = 1;
2598 busiest->push_cpu = this_cpu;
81026794 2599 active_balance = 1;
1da177e4 2600 }
fe2eea3f 2601 spin_unlock_irqrestore(&busiest->lock, flags);
81026794 2602 if (active_balance)
1da177e4
LT
2603 wake_up_process(busiest->migration_thread);
2604
2605 /*
2606 * We've kicked active balancing, reset the failure
2607 * counter.
2608 */
39507451 2609 sd->nr_balance_failed = sd->cache_nice_tries+1;
1da177e4 2610 }
81026794 2611 } else
1da177e4
LT
2612 sd->nr_balance_failed = 0;
2613
81026794 2614 if (likely(!active_balance)) {
1da177e4
LT
2615 /* We were unbalanced, so reset the balancing interval */
2616 sd->balance_interval = sd->min_interval;
81026794
NP
2617 } else {
2618 /*
2619 * If we've begun active balancing, start to back off. This
2620 * case may not be covered by the all_pinned logic if there
2621 * is only 1 task on the busy runqueue (because we don't call
2622 * move_tasks).
2623 */
2624 if (sd->balance_interval < sd->max_interval)
2625 sd->balance_interval *= 2;
1da177e4
LT
2626 }
2627
5c45bf27 2628 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
89c4710e 2629 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
5969fe06 2630 return -1;
1da177e4
LT
2631 return nr_moved;
2632
2633out_balanced:
1da177e4
LT
2634 schedstat_inc(sd, lb_balanced[idle]);
2635
16cfb1c0 2636 sd->nr_balance_failed = 0;
fa3b6ddc
SS
2637
2638out_one_pinned:
1da177e4 2639 /* tune up the balancing interval */
77391d71
NP
2640 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2641 (sd->balance_interval < sd->max_interval))
1da177e4
LT
2642 sd->balance_interval *= 2;
2643
48f24c4d 2644 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
89c4710e 2645 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
5969fe06 2646 return -1;
1da177e4
LT
2647 return 0;
2648}
2649
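/*
 * Illustrative sketch (not part of this file): the "redo" pattern used by
 * load_balance() above when every task on the chosen busiest runqueue
 * turns out to be pinned.  The offending CPU is dropped from the
 * candidate mask and the selection is retried until the mask is empty.
 * Both *_sketch helpers are hypothetical.
 */
static int pick_busiest_cpu_sketch(cpumask_t *cpus);	/* hypothetical */
static int try_pull_tasks_sketch(int cpu);		/* hypothetical */

static int retry_without_pinned_sketch(cpumask_t cpus)
{
	int busiest_cpu, moved;

	for (;;) {
		busiest_cpu = pick_busiest_cpu_sketch(&cpus);
		if (busiest_cpu < 0)
			return 0;			/* nothing left to try */

		moved = try_pull_tasks_sketch(busiest_cpu);
		if (moved)
			return moved;

		cpu_clear(busiest_cpu, cpus);		/* all pinned: drop it */
		if (cpus_empty(cpus))
			return 0;
	}
}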
2650/*
2651 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2652 * tasks if there is an imbalance.
2653 *
d15bcfdb 2654 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
1da177e4
LT
2655 * this_rq is locked.
2656 */
48f24c4d 2657static int
70b97a7f 2658load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
1da177e4
LT
2659{
2660 struct sched_group *group;
70b97a7f 2661 struct rq *busiest = NULL;
1da177e4
LT
2662 unsigned long imbalance;
2663 int nr_moved = 0;
5969fe06 2664 int sd_idle = 0;
0a2966b4 2665 cpumask_t cpus = CPU_MASK_ALL;
5969fe06 2666
89c4710e
SS
2667 /*
2668 * When power savings policy is enabled for the parent domain, idle
2669 * sibling can pick up load irrespective of busy siblings. In this case,
2670 * let the state of idle sibling percolate up as IDLE, instead of
d15bcfdb 2671 * portraying it as CPU_NOT_IDLE.
89c4710e
SS
2672 */
2673 if (sd->flags & SD_SHARE_CPUPOWER &&
2674 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
5969fe06 2675 sd_idle = 1;
1da177e4 2676
d15bcfdb 2677 schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]);
0a2966b4 2678redo:
d15bcfdb 2679 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
783609c6 2680 &sd_idle, &cpus, NULL);
1da177e4 2681 if (!group) {
d15bcfdb 2682 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
16cfb1c0 2683 goto out_balanced;
1da177e4
LT
2684 }
2685
d15bcfdb 2686 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
0a2966b4 2687 &cpus);
db935dbd 2688 if (!busiest) {
d15bcfdb 2689 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
16cfb1c0 2690 goto out_balanced;
1da177e4
LT
2691 }
2692
db935dbd
NP
2693 BUG_ON(busiest == this_rq);
2694
d15bcfdb 2695 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
d6d5cfaf
NP
2696
2697 nr_moved = 0;
2698 if (busiest->nr_running > 1) {
2699 /* Attempt to move tasks */
2700 double_lock_balance(this_rq, busiest);
2701 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2dd73a4f 2702 minus_1_or_zero(busiest->nr_running),
d15bcfdb 2703 imbalance, sd, CPU_NEWLY_IDLE, NULL);
d6d5cfaf 2704 spin_unlock(&busiest->lock);
0a2966b4
CL
2705
2706 if (!nr_moved) {
2707 cpu_clear(cpu_of(busiest), cpus);
2708 if (!cpus_empty(cpus))
2709 goto redo;
2710 }
d6d5cfaf
NP
2711 }
2712
5969fe06 2713 if (!nr_moved) {
d15bcfdb 2714 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
89c4710e
SS
2715 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2716 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
5969fe06
NP
2717 return -1;
2718 } else
16cfb1c0 2719 sd->nr_balance_failed = 0;
1da177e4 2720
1da177e4 2721 return nr_moved;
16cfb1c0
NP
2722
2723out_balanced:
d15bcfdb 2724 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
48f24c4d 2725 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
89c4710e 2726 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
5969fe06 2727 return -1;
16cfb1c0 2728 sd->nr_balance_failed = 0;
48f24c4d 2729
16cfb1c0 2730 return 0;
1da177e4
LT
2731}
2732
2733/*
2734 * idle_balance is called by schedule() if this_cpu is about to become
2735 * idle. Attempts to pull tasks from other CPUs.
2736 */
70b97a7f 2737static void idle_balance(int this_cpu, struct rq *this_rq)
1da177e4
LT
2738{
2739 struct sched_domain *sd;
dd41f596
IM
2740 int pulled_task = -1;
2741 unsigned long next_balance = jiffies + HZ;
1da177e4
LT
2742
2743 for_each_domain(this_cpu, sd) {
92c4ca5c
CL
2744 unsigned long interval;
2745
2746 if (!(sd->flags & SD_LOAD_BALANCE))
2747 continue;
2748
2749 if (sd->flags & SD_BALANCE_NEWIDLE)
48f24c4d 2750 /* If we've pulled tasks over stop searching: */
1bd77f2d 2751 pulled_task = load_balance_newidle(this_cpu,
92c4ca5c
CL
2752 this_rq, sd);
2753
2754 interval = msecs_to_jiffies(sd->balance_interval);
2755 if (time_after(next_balance, sd->last_balance + interval))
2756 next_balance = sd->last_balance + interval;
2757 if (pulled_task)
2758 break;
1da177e4 2759 }
dd41f596 2760 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
1bd77f2d
CL
2761 /*
2762 * We are going idle. next_balance may be set based on
2763 * a busy processor. So reset next_balance.
2764 */
2765 this_rq->next_balance = next_balance;
dd41f596 2766 }
1da177e4
LT
2767}
2768
2769/*
2770 * active_load_balance is run by migration threads. It pushes running tasks
2771 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
2772 * running on each physical CPU where possible, and avoids physical /
2773 * logical imbalances.
2774 *
2775 * Called with busiest_rq locked.
2776 */
70b97a7f 2777static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
1da177e4 2778{
39507451 2779 int target_cpu = busiest_rq->push_cpu;
70b97a7f
IM
2780 struct sched_domain *sd;
2781 struct rq *target_rq;
39507451 2782
48f24c4d 2783 /* Is there any task to move? */
39507451 2784 if (busiest_rq->nr_running <= 1)
39507451
NP
2785 return;
2786
2787 target_rq = cpu_rq(target_cpu);
1da177e4
LT
2788
2789 /*
39507451
NP
 2790	 * This condition is "impossible"; if it occurs
2791 * we need to fix it. Originally reported by
2792 * Bjorn Helgaas on a 128-cpu setup.
1da177e4 2793 */
39507451 2794 BUG_ON(busiest_rq == target_rq);
1da177e4 2795
39507451
NP
2796 /* move a task from busiest_rq to target_rq */
2797 double_lock_balance(busiest_rq, target_rq);
2798
2799 /* Search for an sd spanning us and the target CPU. */
c96d145e 2800 for_each_domain(target_cpu, sd) {
39507451 2801 if ((sd->flags & SD_LOAD_BALANCE) &&
48f24c4d 2802 cpu_isset(busiest_cpu, sd->span))
39507451 2803 break;
c96d145e 2804 }
39507451 2805
48f24c4d
IM
2806 if (likely(sd)) {
2807 schedstat_inc(sd, alb_cnt);
39507451 2808
48f24c4d 2809 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
d15bcfdb 2810 RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE,
48f24c4d
IM
2811 NULL))
2812 schedstat_inc(sd, alb_pushed);
2813 else
2814 schedstat_inc(sd, alb_failed);
2815 }
39507451 2816 spin_unlock(&target_rq->lock);
1da177e4
LT
2817}
2818
46cb4b7c
SS
2819#ifdef CONFIG_NO_HZ
2820static struct {
2821 atomic_t load_balancer;
2822 cpumask_t cpu_mask;
2823} nohz ____cacheline_aligned = {
2824 .load_balancer = ATOMIC_INIT(-1),
2825 .cpu_mask = CPU_MASK_NONE,
2826};
2827
7835b98b 2828/*
46cb4b7c
SS
2829 * This routine will try to nominate the ilb (idle load balancing)
2830 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
2831 * load balancing on behalf of all those cpus. If all the cpus in the system
2832 * go into this tickless mode, then there will be no ilb owner (as there is
2833 * no need for one) and all the cpus will sleep till the next wakeup event
2834 * arrives...
2835 *
 2836 * For the ilb owner, the tick is not stopped, and this tick will be used
 2837 * for idle load balancing. The ilb owner will still be part of
 2838 * nohz.cpu_mask.
7835b98b 2839 *
46cb4b7c
SS
 2840 * While stopping the tick, this cpu will become the ilb owner if there
 2841 * is no other owner, and will remain the owner until that cpu becomes busy
 2842 * or until all cpus in the system stop their ticks, at which point
 2843 * there is no need for an ilb owner.
2844 *
2845 * When the ilb owner becomes busy, it nominates another owner, during the
2846 * next busy scheduler_tick()
2847 */
2848int select_nohz_load_balancer(int stop_tick)
2849{
2850 int cpu = smp_processor_id();
2851
2852 if (stop_tick) {
2853 cpu_set(cpu, nohz.cpu_mask);
2854 cpu_rq(cpu)->in_nohz_recently = 1;
2855
2856 /*
2857 * If we are going offline and still the leader, give up!
2858 */
2859 if (cpu_is_offline(cpu) &&
2860 atomic_read(&nohz.load_balancer) == cpu) {
2861 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2862 BUG();
2863 return 0;
2864 }
2865
2866 /* time for ilb owner also to sleep */
2867 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
2868 if (atomic_read(&nohz.load_balancer) == cpu)
2869 atomic_set(&nohz.load_balancer, -1);
2870 return 0;
2871 }
2872
2873 if (atomic_read(&nohz.load_balancer) == -1) {
2874 /* make me the ilb owner */
2875 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
2876 return 1;
2877 } else if (atomic_read(&nohz.load_balancer) == cpu)
2878 return 1;
2879 } else {
2880 if (!cpu_isset(cpu, nohz.cpu_mask))
2881 return 0;
2882
2883 cpu_clear(cpu, nohz.cpu_mask);
2884
2885 if (atomic_read(&nohz.load_balancer) == cpu)
2886 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2887 BUG();
2888 }
2889 return 0;
2890}
2891#endif
2892
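/*
 * Illustrative sketch (not part of this file): the lock-free "claim the
 * ilb owner slot" idiom used by select_nohz_load_balancer() above.  -1
 * means "no owner"; atomic_cmpxchg() lets exactly one CPU win the claim.
 * owner_sketch and the helpers are hypothetical.
 */
static atomic_t owner_sketch = ATOMIC_INIT(-1);

static int claim_owner_sketch(int cpu)
{
	/* succeeds (returns 1) for exactly one caller while unowned */
	return atomic_cmpxchg(&owner_sketch, -1, cpu) == -1;
}

static void release_owner_sketch(int cpu)
{
	/* only the current owner can clear the slot */
	atomic_cmpxchg(&owner_sketch, cpu, -1);
}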
2893static DEFINE_SPINLOCK(balancing);
2894
2895/*
7835b98b
CL
2896 * It checks each scheduling domain to see if it is due to be balanced,
2897 * and initiates a balancing operation if so.
2898 *
2899 * Balancing parameters are set up in arch_init_sched_domains.
2900 */
d15bcfdb 2901static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
7835b98b 2902{
46cb4b7c
SS
2903 int balance = 1;
2904 struct rq *rq = cpu_rq(cpu);
7835b98b
CL
2905 unsigned long interval;
2906 struct sched_domain *sd;
46cb4b7c 2907 /* Earliest time when we have to do rebalance again */
c9819f45 2908 unsigned long next_balance = jiffies + 60*HZ;
1da177e4 2909
46cb4b7c 2910 for_each_domain(cpu, sd) {
1da177e4
LT
2911 if (!(sd->flags & SD_LOAD_BALANCE))
2912 continue;
2913
2914 interval = sd->balance_interval;
d15bcfdb 2915 if (idle != CPU_IDLE)
1da177e4
LT
2916 interval *= sd->busy_factor;
2917
2918 /* scale ms to jiffies */
2919 interval = msecs_to_jiffies(interval);
2920 if (unlikely(!interval))
2921 interval = 1;
dd41f596
IM
2922 if (interval > HZ*NR_CPUS/10)
2923 interval = HZ*NR_CPUS/10;
2924
1da177e4 2925
08c183f3
CL
2926 if (sd->flags & SD_SERIALIZE) {
2927 if (!spin_trylock(&balancing))
2928 goto out;
2929 }
2930
c9819f45 2931 if (time_after_eq(jiffies, sd->last_balance + interval)) {
46cb4b7c 2932 if (load_balance(cpu, rq, sd, idle, &balance)) {
fa3b6ddc
SS
2933 /*
2934 * We've pulled tasks over so either we're no
5969fe06
NP
2935 * longer idle, or one of our SMT siblings is
2936 * not idle.
2937 */
d15bcfdb 2938 idle = CPU_NOT_IDLE;
1da177e4 2939 }
1bd77f2d 2940 sd->last_balance = jiffies;
1da177e4 2941 }
08c183f3
CL
2942 if (sd->flags & SD_SERIALIZE)
2943 spin_unlock(&balancing);
2944out:
c9819f45
CL
2945 if (time_after(next_balance, sd->last_balance + interval))
2946 next_balance = sd->last_balance + interval;
783609c6
SS
2947
2948 /*
2949 * Stop the load balance at this level. There is another
2950 * CPU in our sched group which is doing load balancing more
2951 * actively.
2952 */
2953 if (!balance)
2954 break;
1da177e4 2955 }
46cb4b7c
SS
2956 rq->next_balance = next_balance;
2957}
2958
2959/*
2960 * run_rebalance_domains is triggered when needed from the scheduler tick.
2961 * In CONFIG_NO_HZ case, the idle load balance owner will do the
2962 * rebalancing for all the cpus for whom scheduler ticks are stopped.
2963 */
2964static void run_rebalance_domains(struct softirq_action *h)
2965{
dd41f596
IM
2966 int this_cpu = smp_processor_id();
2967 struct rq *this_rq = cpu_rq(this_cpu);
2968 enum cpu_idle_type idle = this_rq->idle_at_tick ?
2969 CPU_IDLE : CPU_NOT_IDLE;
46cb4b7c 2970
dd41f596 2971 rebalance_domains(this_cpu, idle);
46cb4b7c
SS
2972
2973#ifdef CONFIG_NO_HZ
2974 /*
2975 * If this cpu is the owner for idle load balancing, then do the
2976 * balancing on behalf of the other idle cpus whose ticks are
2977 * stopped.
2978 */
dd41f596
IM
2979 if (this_rq->idle_at_tick &&
2980 atomic_read(&nohz.load_balancer) == this_cpu) {
46cb4b7c
SS
2981 cpumask_t cpus = nohz.cpu_mask;
2982 struct rq *rq;
2983 int balance_cpu;
2984
dd41f596 2985 cpu_clear(this_cpu, cpus);
46cb4b7c
SS
2986 for_each_cpu_mask(balance_cpu, cpus) {
2987 /*
2988 * If this cpu gets work to do, stop the load balancing
 2989			 * work being done for other cpus. The next load-
 2990			 * balancing owner will pick it up.
2991 */
2992 if (need_resched())
2993 break;
2994
dd41f596 2995			rebalance_domains(balance_cpu, CPU_IDLE);
46cb4b7c
SS
2996
2997 rq = cpu_rq(balance_cpu);
dd41f596
IM
2998 if (time_after(this_rq->next_balance, rq->next_balance))
2999 this_rq->next_balance = rq->next_balance;
46cb4b7c
SS
3000 }
3001 }
3002#endif
3003}
3004
3005/*
3006 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3007 *
3008 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3009 * idle load balancing owner or decide to stop the periodic load balancing,
3010 * if the whole system is idle.
3011 */
dd41f596 3012static inline void trigger_load_balance(struct rq *rq, int cpu)
46cb4b7c 3013{
46cb4b7c
SS
3014#ifdef CONFIG_NO_HZ
3015 /*
3016 * If we were in the nohz mode recently and busy at the current
3017 * scheduler tick, then check if we need to nominate new idle
3018 * load balancer.
3019 */
3020 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3021 rq->in_nohz_recently = 0;
3022
3023 if (atomic_read(&nohz.load_balancer) == cpu) {
3024 cpu_clear(cpu, nohz.cpu_mask);
3025 atomic_set(&nohz.load_balancer, -1);
3026 }
3027
3028 if (atomic_read(&nohz.load_balancer) == -1) {
3029 /*
3030 * simple selection for now: Nominate the
3031 * first cpu in the nohz list to be the next
3032 * ilb owner.
3033 *
3034 * TBD: Traverse the sched domains and nominate
3035 * the nearest cpu in the nohz.cpu_mask.
3036 */
3037 int ilb = first_cpu(nohz.cpu_mask);
3038
3039 if (ilb != NR_CPUS)
3040 resched_cpu(ilb);
3041 }
3042 }
3043
3044 /*
3045 * If this cpu is idle and doing idle load balancing for all the
3046 * cpus with ticks stopped, is it time for that to stop?
3047 */
3048 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3049 cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3050 resched_cpu(cpu);
3051 return;
3052 }
3053
3054 /*
3055 * If this cpu is idle and the idle load balancing is done by
 3056	 * someone else, then there is no need to raise the SCHED_SOFTIRQ
3057 */
3058 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3059 cpu_isset(cpu, nohz.cpu_mask))
3060 return;
3061#endif
3062 if (time_after_eq(jiffies, rq->next_balance))
3063 raise_softirq(SCHED_SOFTIRQ);
1da177e4 3064}
dd41f596
IM
3065
3066#else /* CONFIG_SMP */
3067
1da177e4
LT
3068/*
3069 * on UP we do not need to balance between CPUs:
3070 */
70b97a7f 3071static inline void idle_balance(int cpu, struct rq *rq)
1da177e4
LT
3072{
3073}
dd41f596
IM
3074
3075/* Avoid "used but not defined" warning on UP */
3076static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3077 unsigned long max_nr_move, unsigned long max_load_move,
3078 struct sched_domain *sd, enum cpu_idle_type idle,
3079 int *all_pinned, unsigned long *load_moved,
3080 int this_best_prio, int best_prio, int best_prio_seen,
3081 struct rq_iterator *iterator)
3082{
3083 *load_moved = 0;
3084
3085 return 0;
3086}
3087
1da177e4
LT
3088#endif
3089
1da177e4
LT
3090DEFINE_PER_CPU(struct kernel_stat, kstat);
3091
3092EXPORT_PER_CPU_SYMBOL(kstat);
3093
3094/*
41b86e9c
IM
3095 * Return p->sum_exec_runtime plus any more ns on the sched_clock
3096 * that have not yet been banked in case the task is currently running.
1da177e4 3097 */
41b86e9c 3098unsigned long long task_sched_runtime(struct task_struct *p)
1da177e4 3099{
1da177e4 3100 unsigned long flags;
41b86e9c
IM
3101 u64 ns, delta_exec;
3102 struct rq *rq;
48f24c4d 3103
41b86e9c
IM
3104 rq = task_rq_lock(p, &flags);
3105 ns = p->se.sum_exec_runtime;
3106 if (rq->curr == p) {
3107 delta_exec = rq_clock(rq) - p->se.exec_start;
3108 if ((s64)delta_exec > 0)
3109 ns += delta_exec;
3110 }
3111 task_rq_unlock(rq, &flags);
48f24c4d 3112
1da177e4
LT
3113 return ns;
3114}
3115
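/*
 * Illustrative sketch (not part of this file): the "banked time plus the
 * currently running delta" pattern used by task_sched_runtime() above.
 * struct sketch_task and now_ns are hypothetical stand-ins for the
 * sched_entity fields and rq_clock().
 */
struct sketch_task {
	u64 sum_exec_runtime;	/* nanoseconds already banked */
	u64 exec_start;		/* clock stamp when it went on-CPU */
	int on_cpu;
};

static u64 task_runtime_sketch(struct sketch_task *t, u64 now_ns)
{
	u64 ns = t->sum_exec_runtime;

	if (t->on_cpu && (s64)(now_ns - t->exec_start) > 0)
		ns += now_ns - t->exec_start;	/* add the unbanked slice */

	return ns;
}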
1da177e4
LT
3116/*
3117 * Account user cpu time to a process.
3118 * @p: the process that the cpu time gets accounted to
3119 * @hardirq_offset: the offset to subtract from hardirq_count()
3120 * @cputime: the cpu time spent in user space since the last update
3121 */
3122void account_user_time(struct task_struct *p, cputime_t cputime)
3123{
3124 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3125 cputime64_t tmp;
3126
3127 p->utime = cputime_add(p->utime, cputime);
3128
3129 /* Add user time to cpustat. */
3130 tmp = cputime_to_cputime64(cputime);
3131 if (TASK_NICE(p) > 0)
3132 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3133 else
3134 cpustat->user = cputime64_add(cpustat->user, tmp);
3135}
3136
3137/*
3138 * Account system cpu time to a process.
3139 * @p: the process that the cpu time gets accounted to
3140 * @hardirq_offset: the offset to subtract from hardirq_count()
3141 * @cputime: the cpu time spent in kernel space since the last update
3142 */
3143void account_system_time(struct task_struct *p, int hardirq_offset,
3144 cputime_t cputime)
3145{
3146 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
70b97a7f 3147 struct rq *rq = this_rq();
1da177e4
LT
3148 cputime64_t tmp;
3149
3150 p->stime = cputime_add(p->stime, cputime);
3151
3152 /* Add system time to cpustat. */
3153 tmp = cputime_to_cputime64(cputime);
3154 if (hardirq_count() - hardirq_offset)
3155 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3156 else if (softirq_count())
3157 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3158 else if (p != rq->idle)
3159 cpustat->system = cputime64_add(cpustat->system, tmp);
3160 else if (atomic_read(&rq->nr_iowait) > 0)
3161 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3162 else
3163 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3164 /* Account for system time used */
3165 acct_update_integrals(p);
1da177e4
LT
3166}
3167
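/*
 * Illustrative sketch (not part of this file): the priority order in
 * which account_system_time() above classifies one tick of system time.
 * Exactly one bucket is charged per tick; the enum and helper are
 * hypothetical.
 */
enum cpustat_bucket_sketch {
	BUCKET_IRQ, BUCKET_SOFTIRQ, BUCKET_SYSTEM, BUCKET_IOWAIT, BUCKET_IDLE
};

static enum cpustat_bucket_sketch
classify_system_tick_sketch(int in_hardirq, int in_softirq,
			    int curr_is_idle, int nr_iowait)
{
	if (in_hardirq)
		return BUCKET_IRQ;	/* hard interrupt context */
	if (in_softirq)
		return BUCKET_SOFTIRQ;	/* softirq context */
	if (!curr_is_idle)
		return BUCKET_SYSTEM;	/* ordinary kernel execution */
	if (nr_iowait > 0)
		return BUCKET_IOWAIT;	/* idle, but tasks wait on I/O */
	return BUCKET_IDLE;
}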
3168/*
3169 * Account for involuntary wait time.
3170 * @p: the process from which the cpu time has been stolen
3171 * @steal: the cpu time spent in involuntary wait
3172 */
3173void account_steal_time(struct task_struct *p, cputime_t steal)
3174{
3175 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3176 cputime64_t tmp = cputime_to_cputime64(steal);
70b97a7f 3177 struct rq *rq = this_rq();
1da177e4
LT
3178
3179 if (p == rq->idle) {
3180 p->stime = cputime_add(p->stime, steal);
3181 if (atomic_read(&rq->nr_iowait) > 0)
3182 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3183 else
3184 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3185 } else
3186 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3187}
3188
7835b98b
CL
3189/*
3190 * This function gets called by the timer code, with HZ frequency.
3191 * We call it with interrupts disabled.
3192 *
3193 * It also gets called by the fork code, when changing the parent's
3194 * timeslices.
3195 */
3196void scheduler_tick(void)
3197{
7835b98b
CL
3198 int cpu = smp_processor_id();
3199 struct rq *rq = cpu_rq(cpu);
dd41f596
IM
3200 struct task_struct *curr = rq->curr;
3201
3202 spin_lock(&rq->lock);
3203 if (curr != rq->idle) /* FIXME: needed? */
3204 curr->sched_class->task_tick(rq, curr);
3205 update_cpu_load(rq);
3206 spin_unlock(&rq->lock);
7835b98b 3207
e418e1c2 3208#ifdef CONFIG_SMP
dd41f596
IM
3209 rq->idle_at_tick = idle_cpu(cpu);
3210 trigger_load_balance(rq, cpu);
e418e1c2 3211#endif
1da177e4
LT
3212}
3213
1da177e4
LT
3214#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
3215
3216void fastcall add_preempt_count(int val)
3217{
3218 /*
3219 * Underflow?
3220 */
9a11b49a
IM
3221 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3222 return;
1da177e4
LT
3223 preempt_count() += val;
3224 /*
3225 * Spinlock count overflowing soon?
3226 */
33859f7f
MOS
3227 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3228 PREEMPT_MASK - 10);
1da177e4
LT
3229}
3230EXPORT_SYMBOL(add_preempt_count);
3231
3232void fastcall sub_preempt_count(int val)
3233{
3234 /*
3235 * Underflow?
3236 */
9a11b49a
IM
3237 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3238 return;
1da177e4
LT
3239 /*
3240 * Is the spinlock portion underflowing?
3241 */
9a11b49a
IM
3242 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3243 !(preempt_count() & PREEMPT_MASK)))
3244 return;
3245
1da177e4
LT
3246 preempt_count() -= val;
3247}
3248EXPORT_SYMBOL(sub_preempt_count);
3249
3250#endif
3251
3252/*
dd41f596 3253 * Print scheduling while atomic bug:
1da177e4 3254 */
dd41f596 3255static noinline void __schedule_bug(struct task_struct *prev)
1da177e4 3256{
dd41f596
IM
3257 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
3258 prev->comm, preempt_count(), prev->pid);
3259 debug_show_held_locks(prev);
3260 if (irqs_disabled())
3261 print_irqtrace_events(prev);
3262 dump_stack();
3263}
1da177e4 3264
dd41f596
IM
3265/*
3266 * Various schedule()-time debugging checks and statistics:
3267 */
3268static inline void schedule_debug(struct task_struct *prev)
3269{
1da177e4
LT
3270 /*
3271 * Test if we are atomic. Since do_exit() needs to call into
3272 * schedule() atomically, we ignore that path for now.
3273 * Otherwise, whine if we are scheduling when we should not be.
3274 */
dd41f596
IM
3275 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
3276 __schedule_bug(prev);
3277
1da177e4
LT
3278 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3279
dd41f596
IM
3280 schedstat_inc(this_rq(), sched_cnt);
3281}
3282
3283/*
3284 * Pick up the highest-prio task:
3285 */
3286static inline struct task_struct *
3287pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
3288{
3289 struct sched_class *class;
3290 struct task_struct *p;
1da177e4
LT
3291
3292 /*
dd41f596
IM
3293 * Optimization: we know that if all tasks are in
3294 * the fair class we can call that function directly:
1da177e4 3295 */
dd41f596
IM
3296 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3297 p = fair_sched_class.pick_next_task(rq, now);
3298 if (likely(p))
3299 return p;
1da177e4
LT
3300 }
3301
dd41f596
IM
3302 class = sched_class_highest;
3303 for ( ; ; ) {
3304 p = class->pick_next_task(rq, now);
3305 if (p)
3306 return p;
3307 /*
3308 * Will never be NULL as the idle class always
3309 * returns a non-NULL p:
3310 */
3311 class = class->next;
3312 }
3313}
1da177e4 3314
dd41f596
IM
3315/*
3316 * schedule() is the main scheduler function.
3317 */
3318asmlinkage void __sched schedule(void)
3319{
3320 struct task_struct *prev, *next;
3321 long *switch_count;
3322 struct rq *rq;
3323 u64 now;
3324 int cpu;
3325
3326need_resched:
3327 preempt_disable();
3328 cpu = smp_processor_id();
3329 rq = cpu_rq(cpu);
3330 rcu_qsctr_inc(cpu);
3331 prev = rq->curr;
3332 switch_count = &prev->nivcsw;
3333
3334 release_kernel_lock(prev);
3335need_resched_nonpreemptible:
3336
3337 schedule_debug(prev);
1da177e4
LT
3338
3339 spin_lock_irq(&rq->lock);
dd41f596 3340 clear_tsk_need_resched(prev);
1da177e4 3341
1da177e4 3342 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
1da177e4 3343 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
dd41f596 3344 unlikely(signal_pending(prev)))) {
1da177e4 3345 prev->state = TASK_RUNNING;
dd41f596
IM
3346 } else {
3347 deactivate_task(rq, prev, 1);
1da177e4 3348 }
dd41f596 3349 switch_count = &prev->nvcsw;
1da177e4
LT
3350 }
3351
dd41f596 3352 if (unlikely(!rq->nr_running))
1da177e4 3353 idle_balance(cpu, rq);
1da177e4 3354
dd41f596
IM
3355 now = __rq_clock(rq);
3356 prev->sched_class->put_prev_task(rq, prev, now);
3357 next = pick_next_task(rq, prev, now);
1da177e4
LT
3358
3359 sched_info_switch(prev, next);
dd41f596 3360
1da177e4 3361 if (likely(prev != next)) {
1da177e4
LT
3362 rq->nr_switches++;
3363 rq->curr = next;
3364 ++*switch_count;
3365
dd41f596 3366 context_switch(rq, prev, next); /* unlocks the rq */
1da177e4
LT
3367 } else
3368 spin_unlock_irq(&rq->lock);
3369
dd41f596
IM
3370 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3371 cpu = smp_processor_id();
3372 rq = cpu_rq(cpu);
1da177e4 3373 goto need_resched_nonpreemptible;
dd41f596 3374 }
1da177e4
LT
3375 preempt_enable_no_resched();
3376 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3377 goto need_resched;
3378}
1da177e4
LT
3379EXPORT_SYMBOL(schedule);
3380
3381#ifdef CONFIG_PREEMPT
3382/*
2ed6e34f 3383 * this is the entry point to schedule() from in-kernel preemption
1da177e4
LT
 3384 * off of preempt_enable. Kernel preemptions off of return-from-interrupt
 3385 * occur in preempt_schedule_irq() below, which calls schedule() directly.
3386 */
3387asmlinkage void __sched preempt_schedule(void)
3388{
3389 struct thread_info *ti = current_thread_info();
3390#ifdef CONFIG_PREEMPT_BKL
3391 struct task_struct *task = current;
3392 int saved_lock_depth;
3393#endif
3394 /*
3395 * If there is a non-zero preempt_count or interrupts are disabled,
3396 * we do not want to preempt the current task. Just return..
3397 */
beed33a8 3398 if (likely(ti->preempt_count || irqs_disabled()))
1da177e4
LT
3399 return;
3400
3401need_resched:
3402 add_preempt_count(PREEMPT_ACTIVE);
3403 /*
3404 * We keep the big kernel semaphore locked, but we
 3405	 * clear ->lock_depth so that schedule() doesn't
3406 * auto-release the semaphore:
3407 */
3408#ifdef CONFIG_PREEMPT_BKL
3409 saved_lock_depth = task->lock_depth;
3410 task->lock_depth = -1;
3411#endif
3412 schedule();
3413#ifdef CONFIG_PREEMPT_BKL
3414 task->lock_depth = saved_lock_depth;
3415#endif
3416 sub_preempt_count(PREEMPT_ACTIVE);
3417
3418 /* we could miss a preemption opportunity between schedule and now */
3419 barrier();
3420 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3421 goto need_resched;
3422}
1da177e4
LT
3423EXPORT_SYMBOL(preempt_schedule);
3424
3425/*
2ed6e34f 3426 * this is the entry point to schedule() from kernel preemption
1da177e4
LT
3427 * off of irq context.
 3428 * Note that this is called and returns with irqs disabled. This will
3429 * protect us against recursive calling from irq.
3430 */
3431asmlinkage void __sched preempt_schedule_irq(void)
3432{
3433 struct thread_info *ti = current_thread_info();
3434#ifdef CONFIG_PREEMPT_BKL
3435 struct task_struct *task = current;
3436 int saved_lock_depth;
3437#endif
2ed6e34f 3438 /* Catch callers which need to be fixed */
1da177e4
LT
3439 BUG_ON(ti->preempt_count || !irqs_disabled());
3440
3441need_resched:
3442 add_preempt_count(PREEMPT_ACTIVE);
3443 /*
3444 * We keep the big kernel semaphore locked, but we
 3445	 * clear ->lock_depth so that schedule() doesn't
3446 * auto-release the semaphore:
3447 */
3448#ifdef CONFIG_PREEMPT_BKL
3449 saved_lock_depth = task->lock_depth;
3450 task->lock_depth = -1;
3451#endif
3452 local_irq_enable();
3453 schedule();
3454 local_irq_disable();
3455#ifdef CONFIG_PREEMPT_BKL
3456 task->lock_depth = saved_lock_depth;
3457#endif
3458 sub_preempt_count(PREEMPT_ACTIVE);
3459
3460 /* we could miss a preemption opportunity between schedule and now */
3461 barrier();
3462 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3463 goto need_resched;
3464}
3465
3466#endif /* CONFIG_PREEMPT */
3467
95cdf3b7
IM
3468int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3469 void *key)
1da177e4 3470{
48f24c4d 3471 return try_to_wake_up(curr->private, mode, sync);
1da177e4 3472}
1da177e4
LT
3473EXPORT_SYMBOL(default_wake_function);
3474
3475/*
3476 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
3477 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
3478 * number) then we wake all the non-exclusive tasks and one exclusive task.
3479 *
3480 * There are circumstances in which we can try to wake a task which has already
3481 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
3482 * zero in this (rare) case, and we handle it by continuing to scan the queue.
3483 */
3484static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3485 int nr_exclusive, int sync, void *key)
3486{
3487 struct list_head *tmp, *next;
3488
3489 list_for_each_safe(tmp, next, &q->task_list) {
48f24c4d
IM
3490 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
3491 unsigned flags = curr->flags;
3492
1da177e4 3493 if (curr->func(curr, mode, sync, key) &&
48f24c4d 3494 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
1da177e4
LT
3495 break;
3496 }
3497}
3498
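/*
 * Illustrative sketch (not part of this file): how __wake_up_common()
 * above limits exclusive wakeups.  Non-exclusive waiters are always
 * woken; each successfully woken WQ_FLAG_EXCLUSIVE waiter decrements
 * nr_exclusive and the scan stops once it hits zero.  The waiter array
 * is a hypothetical stand-in for the wait-queue list.
 */
struct sketch_waiter {
	int exclusive;				/* WQ_FLAG_EXCLUSIVE analogue */
	int (*wake)(struct sketch_waiter *w);	/* like wait_queue_t->func */
};

static void wake_up_sketch(struct sketch_waiter *w, int n, int nr_exclusive)
{
	int i;

	for (i = 0; i < n; i++) {
		if (w[i].wake(&w[i]) && w[i].exclusive && !--nr_exclusive)
			break;		/* enough exclusive waiters woken */
	}
}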
3499/**
3500 * __wake_up - wake up threads blocked on a waitqueue.
3501 * @q: the waitqueue
3502 * @mode: which threads
3503 * @nr_exclusive: how many wake-one or wake-many threads to wake up
67be2dd1 3504 * @key: is directly passed to the wakeup function
1da177e4
LT
3505 */
3506void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
95cdf3b7 3507 int nr_exclusive, void *key)
1da177e4
LT
3508{
3509 unsigned long flags;
3510
3511 spin_lock_irqsave(&q->lock, flags);
3512 __wake_up_common(q, mode, nr_exclusive, 0, key);
3513 spin_unlock_irqrestore(&q->lock, flags);
3514}
1da177e4
LT
3515EXPORT_SYMBOL(__wake_up);
3516
3517/*
3518 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3519 */
3520void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3521{
3522 __wake_up_common(q, mode, 1, 0, NULL);
3523}
3524
3525/**
67be2dd1 3526 * __wake_up_sync - wake up threads blocked on a waitqueue.
1da177e4
LT
3527 * @q: the waitqueue
3528 * @mode: which threads
3529 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3530 *
 3531 * The sync wakeup differs in that the waker knows that it will schedule
3532 * away soon, so while the target thread will be woken up, it will not
3533 * be migrated to another CPU - ie. the two threads are 'synchronized'
3534 * with each other. This can prevent needless bouncing between CPUs.
3535 *
3536 * On UP it can prevent extra preemption.
3537 */
95cdf3b7
IM
3538void fastcall
3539__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
1da177e4
LT
3540{
3541 unsigned long flags;
3542 int sync = 1;
3543
3544 if (unlikely(!q))
3545 return;
3546
3547 if (unlikely(!nr_exclusive))
3548 sync = 0;
3549
3550 spin_lock_irqsave(&q->lock, flags);
3551 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
3552 spin_unlock_irqrestore(&q->lock, flags);
3553}
3554EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
3555
3556void fastcall complete(struct completion *x)
3557{
3558 unsigned long flags;
3559
3560 spin_lock_irqsave(&x->wait.lock, flags);
3561 x->done++;
3562 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3563 1, 0, NULL);
3564 spin_unlock_irqrestore(&x->wait.lock, flags);
3565}
3566EXPORT_SYMBOL(complete);
3567
3568void fastcall complete_all(struct completion *x)
3569{
3570 unsigned long flags;
3571
3572 spin_lock_irqsave(&x->wait.lock, flags);
3573 x->done += UINT_MAX/2;
3574 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3575 0, 0, NULL);
3576 spin_unlock_irqrestore(&x->wait.lock, flags);
3577}
3578EXPORT_SYMBOL(complete_all);
3579
3580void fastcall __sched wait_for_completion(struct completion *x)
3581{
3582 might_sleep();
48f24c4d 3583
1da177e4
LT
3584 spin_lock_irq(&x->wait.lock);
3585 if (!x->done) {
3586 DECLARE_WAITQUEUE(wait, current);
3587
3588 wait.flags |= WQ_FLAG_EXCLUSIVE;
3589 __add_wait_queue_tail(&x->wait, &wait);
3590 do {
3591 __set_current_state(TASK_UNINTERRUPTIBLE);
3592 spin_unlock_irq(&x->wait.lock);
3593 schedule();
3594 spin_lock_irq(&x->wait.lock);
3595 } while (!x->done);
3596 __remove_wait_queue(&x->wait, &wait);
3597 }
3598 x->done--;
3599 spin_unlock_irq(&x->wait.lock);
3600}
3601EXPORT_SYMBOL(wait_for_completion);
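/*
 * Usage sketch (illustrative only; my_thread_fn, my_worker and setup_done
 * are made-up names): a typical completion handshake between two contexts.
 *
 *	struct completion setup_done;
 *
 *	init_completion(&setup_done);
 *	kthread_run(my_thread_fn, &setup_done, "my_worker");
 *	wait_for_completion(&setup_done);	sleeps until complete()
 *
 *	inside my_thread_fn(void *arg), once initialisation has finished:
 *		complete((struct completion *)arg);	wakes exactly one waiter
 */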
3602
3603unsigned long fastcall __sched
3604wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3605{
3606 might_sleep();
3607
3608 spin_lock_irq(&x->wait.lock);
3609 if (!x->done) {
3610 DECLARE_WAITQUEUE(wait, current);
3611
3612 wait.flags |= WQ_FLAG_EXCLUSIVE;
3613 __add_wait_queue_tail(&x->wait, &wait);
3614 do {
3615 __set_current_state(TASK_UNINTERRUPTIBLE);
3616 spin_unlock_irq(&x->wait.lock);
3617 timeout = schedule_timeout(timeout);
3618 spin_lock_irq(&x->wait.lock);
3619 if (!timeout) {
3620 __remove_wait_queue(&x->wait, &wait);
3621 goto out;
3622 }
3623 } while (!x->done);
3624 __remove_wait_queue(&x->wait, &wait);
3625 }
3626 x->done--;
3627out:
3628 spin_unlock_irq(&x->wait.lock);
3629 return timeout;
3630}
3631EXPORT_SYMBOL(wait_for_completion_timeout);
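/*
 * Usage sketch (illustrative only, reusing the hypothetical setup_done
 * completion from the sketch above): the return value is 0 if the timeout
 * elapsed, otherwise the number of jiffies left, so a bounded wait of
 * roughly 100ms could look like:
 *
 *	unsigned long left;
 *
 *	left = wait_for_completion_timeout(&setup_done,
 *					   msecs_to_jiffies(100));
 *	if (!left)
 *		return -ETIMEDOUT;
 */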
3632
3633int fastcall __sched wait_for_completion_interruptible(struct completion *x)
3634{
3635 int ret = 0;
3636
3637 might_sleep();
3638
3639 spin_lock_irq(&x->wait.lock);
3640 if (!x->done) {
3641 DECLARE_WAITQUEUE(wait, current);
3642
3643 wait.flags |= WQ_FLAG_EXCLUSIVE;
3644 __add_wait_queue_tail(&x->wait, &wait);
3645 do {
3646 if (signal_pending(current)) {
3647 ret = -ERESTARTSYS;
3648 __remove_wait_queue(&x->wait, &wait);
3649 goto out;
3650 }
3651 __set_current_state(TASK_INTERRUPTIBLE);
3652 spin_unlock_irq(&x->wait.lock);
3653 schedule();
3654 spin_lock_irq(&x->wait.lock);
3655 } while (!x->done);
3656 __remove_wait_queue(&x->wait, &wait);
3657 }
3658 x->done--;
3659out:
3660 spin_unlock_irq(&x->wait.lock);
3661
3662 return ret;
3663}
3664EXPORT_SYMBOL(wait_for_completion_interruptible);
3665
3666unsigned long fastcall __sched
3667wait_for_completion_interruptible_timeout(struct completion *x,
3668 unsigned long timeout)
3669{
3670 might_sleep();
3671
3672 spin_lock_irq(&x->wait.lock);
3673 if (!x->done) {
3674 DECLARE_WAITQUEUE(wait, current);
3675
3676 wait.flags |= WQ_FLAG_EXCLUSIVE;
3677 __add_wait_queue_tail(&x->wait, &wait);
3678 do {
3679 if (signal_pending(current)) {
3680 timeout = -ERESTARTSYS;
3681 __remove_wait_queue(&x->wait, &wait);
3682 goto out;
3683 }
3684 __set_current_state(TASK_INTERRUPTIBLE);
3685 spin_unlock_irq(&x->wait.lock);
3686 timeout = schedule_timeout(timeout);
3687 spin_lock_irq(&x->wait.lock);
3688 if (!timeout) {
3689 __remove_wait_queue(&x->wait, &wait);
3690 goto out;
3691 }
3692 } while (!x->done);
3693 __remove_wait_queue(&x->wait, &wait);
3694 }
3695 x->done--;
3696out:
3697 spin_unlock_irq(&x->wait.lock);
3698 return timeout;
3699}
3700EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3701
3702
3703#define SLEEP_ON_VAR \
3704 unsigned long flags; \
3705 wait_queue_t wait; \
3706 init_waitqueue_entry(&wait, current);
3707
3708#define SLEEP_ON_HEAD \
3709 spin_lock_irqsave(&q->lock,flags); \
3710 __add_wait_queue(q, &wait); \
3711 spin_unlock(&q->lock);
3712
3713#define SLEEP_ON_TAIL \
3714 spin_lock_irq(&q->lock); \
3715 __remove_wait_queue(q, &wait); \
3716 spin_unlock_irqrestore(&q->lock, flags);
3717
3718void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
3719{
3720 SLEEP_ON_VAR
3721
3722 current->state = TASK_INTERRUPTIBLE;
3723
3724 SLEEP_ON_HEAD
3725 schedule();
3726 SLEEP_ON_TAIL
3727}
1da177e4
LT
3728EXPORT_SYMBOL(interruptible_sleep_on);
3729
95cdf3b7
IM
3730long fastcall __sched
3731interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
1da177e4
LT
3732{
3733 SLEEP_ON_VAR
3734
3735 current->state = TASK_INTERRUPTIBLE;
3736
3737 SLEEP_ON_HEAD
3738 timeout = schedule_timeout(timeout);
3739 SLEEP_ON_TAIL
3740
3741 return timeout;
3742}
1da177e4
LT
3743EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3744
3745void fastcall __sched sleep_on(wait_queue_head_t *q)
3746{
3747 SLEEP_ON_VAR
3748
3749 current->state = TASK_UNINTERRUPTIBLE;
3750
3751 SLEEP_ON_HEAD
3752 schedule();
3753 SLEEP_ON_TAIL
3754}
1da177e4
LT
3755EXPORT_SYMBOL(sleep_on);
3756
3757long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3758{
3759 SLEEP_ON_VAR
3760
3761 current->state = TASK_UNINTERRUPTIBLE;
3762
3763 SLEEP_ON_HEAD
3764 timeout = schedule_timeout(timeout);
3765 SLEEP_ON_TAIL
3766
3767 return timeout;
3768}
3769
3770EXPORT_SYMBOL(sleep_on_timeout);
3771
b29739f9
IM
3772#ifdef CONFIG_RT_MUTEXES
3773
3774/*
3775 * rt_mutex_setprio - set the current priority of a task
3776 * @p: task
3777 * @prio: prio value (kernel-internal form)
3778 *
3779 * This function changes the 'effective' priority of a task. It does
3780 * not touch ->normal_prio like __setscheduler().
3781 *
3782 * Used by the rt_mutex code to implement priority inheritance logic.
3783 */
36c8b586 3784void rt_mutex_setprio(struct task_struct *p, int prio)
b29739f9
IM
3785{
3786 unsigned long flags;
dd41f596 3787 int oldprio, on_rq;
70b97a7f 3788 struct rq *rq;
dd41f596 3789 u64 now;
b29739f9
IM
3790
3791 BUG_ON(prio < 0 || prio > MAX_PRIO);
3792
3793 rq = task_rq_lock(p, &flags);
dd41f596 3794 now = rq_clock(rq);
b29739f9 3795
d5f9f942 3796 oldprio = p->prio;
dd41f596
IM
3797 on_rq = p->se.on_rq;
3798 if (on_rq)
3799 dequeue_task(rq, p, 0, now);
3800
3801 if (rt_prio(prio))
3802 p->sched_class = &rt_sched_class;
3803 else
3804 p->sched_class = &fair_sched_class;
3805
b29739f9
IM
3806 p->prio = prio;
3807
dd41f596
IM
3808 if (on_rq) {
3809 enqueue_task(rq, p, 0, now);
b29739f9
IM
3810 /*
3811 * Reschedule if we are currently running on this runqueue and
d5f9f942
AM
3812 * our priority decreased, or if we are not currently running on
3813 * this runqueue and our priority is higher than the current's
b29739f9 3814 */
d5f9f942
AM
3815 if (task_running(rq, p)) {
3816 if (p->prio > oldprio)
3817 resched_task(rq->curr);
dd41f596
IM
3818 } else {
3819 check_preempt_curr(rq, p);
3820 }
b29739f9
IM
3821 }
3822 task_rq_unlock(rq, &flags);
3823}
3824
3825#endif
3826
36c8b586 3827void set_user_nice(struct task_struct *p, long nice)
1da177e4 3828{
dd41f596 3829 int old_prio, delta, on_rq;
1da177e4 3830 unsigned long flags;
70b97a7f 3831 struct rq *rq;
dd41f596 3832 u64 now;
1da177e4
LT
3833
3834 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3835 return;
3836 /*
3837 * We have to be careful, if called from sys_setpriority(),
3838 * the task might be in the middle of scheduling on another CPU.
3839 */
3840 rq = task_rq_lock(p, &flags);
dd41f596 3841 now = rq_clock(rq);
1da177e4
LT
3842 /*
3843 * The RT priorities are set via sched_setscheduler(), but we still
3844 * allow the 'normal' nice value to be set - but as expected
3845 * it won't have any effect on scheduling as long as the task is
dd41f596 3846 * SCHED_FIFO/SCHED_RR:
1da177e4 3847 */
e05606d3 3848 if (task_has_rt_policy(p)) {
1da177e4
LT
3849 p->static_prio = NICE_TO_PRIO(nice);
3850 goto out_unlock;
3851 }
dd41f596
IM
3852 on_rq = p->se.on_rq;
3853 if (on_rq) {
3854 dequeue_task(rq, p, 0, now);
3855 dec_load(rq, p, now);
2dd73a4f 3856 }
1da177e4 3857
1da177e4 3858 p->static_prio = NICE_TO_PRIO(nice);
2dd73a4f 3859 set_load_weight(p);
b29739f9
IM
3860 old_prio = p->prio;
3861 p->prio = effective_prio(p);
3862 delta = p->prio - old_prio;
1da177e4 3863
dd41f596
IM
3864 if (on_rq) {
3865 enqueue_task(rq, p, 0, now);
3866 inc_load(rq, p, now);
1da177e4 3867 /*
d5f9f942
AM
3868 * If the task increased its priority or is running and
3869 * lowered its priority, then reschedule its CPU:
1da177e4 3870 */
d5f9f942 3871 if (delta < 0 || (delta > 0 && task_running(rq, p)))
1da177e4
LT
3872 resched_task(rq->curr);
3873 }
3874out_unlock:
3875 task_rq_unlock(rq, &flags);
3876}
1da177e4
LT
3877EXPORT_SYMBOL(set_user_nice);
3878
e43379f1
MM
3879/*
3880 * can_nice - check if a task can reduce its nice value
3881 * @p: task
3882 * @nice: nice value
3883 */
36c8b586 3884int can_nice(const struct task_struct *p, const int nice)
e43379f1 3885{
024f4747
MM
3886 /* convert nice value [19,-20] to rlimit style value [1,40] */
3887 int nice_rlim = 20 - nice;
48f24c4d 3888
e43379f1
MM
3889 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3890 capable(CAP_SYS_NICE));
3891}
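/*
 * Worked example: for a requested nice of -5 the rlimit-style value is
 * 20 - (-5) = 25, so the change is allowed only if RLIMIT_NICE's soft
 * limit is at least 25 or the caller has CAP_SYS_NICE. Callers such as
 * sys_nice() below only consult this check when lowering the nice value.
 */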
3892
1da177e4
LT
3893#ifdef __ARCH_WANT_SYS_NICE
3894
3895/*
3896 * sys_nice - change the priority of the current process.
3897 * @increment: priority increment
3898 *
3899 * sys_setpriority is a more generic, but much slower function that
3900 * does similar things.
3901 */
3902asmlinkage long sys_nice(int increment)
3903{
48f24c4d 3904 long nice, retval;
1da177e4
LT
3905
3906 /*
3907 * Setpriority might change our priority at the same moment.
3908 * We don't have to worry. Conceptually one call occurs first
3909 * and we have a single winner.
3910 */
e43379f1
MM
3911 if (increment < -40)
3912 increment = -40;
1da177e4
LT
3913 if (increment > 40)
3914 increment = 40;
3915
3916 nice = PRIO_TO_NICE(current->static_prio) + increment;
3917 if (nice < -20)
3918 nice = -20;
3919 if (nice > 19)
3920 nice = 19;
3921
e43379f1
MM
3922 if (increment < 0 && !can_nice(current, nice))
3923 return -EPERM;
3924
1da177e4
LT
3925 retval = security_task_setnice(current, nice);
3926 if (retval)
3927 return retval;
3928
3929 set_user_nice(current, nice);
3930 return 0;
3931}
3932
3933#endif
3934
3935/**
3936 * task_prio - return the priority value of a given task.
3937 * @p: the task in question.
3938 *
3939 * This is the priority value as seen by users in /proc.
3940 * RT tasks are offset by -200. Normal tasks are centered
3941 * around 0, value goes from -16 to +15.
3942 */
36c8b586 3943int task_prio(const struct task_struct *p)
1da177e4
LT
3944{
3945 return p->prio - MAX_RT_PRIO;
3946}
3947
3948/**
3949 * task_nice - return the nice value of a given task.
3950 * @p: the task in question.
3951 */
36c8b586 3952int task_nice(const struct task_struct *p)
1da177e4
LT
3953{
3954 return TASK_NICE(p);
3955}
1da177e4 3956EXPORT_SYMBOL_GPL(task_nice);
1da177e4
LT
3957
3958/**
3959 * idle_cpu - is a given cpu idle currently?
3960 * @cpu: the processor in question.
3961 */
3962int idle_cpu(int cpu)
3963{
3964 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
3965}
3966
1da177e4
LT
3967/**
3968 * idle_task - return the idle task for a given cpu.
3969 * @cpu: the processor in question.
3970 */
36c8b586 3971struct task_struct *idle_task(int cpu)
1da177e4
LT
3972{
3973 return cpu_rq(cpu)->idle;
3974}
3975
3976/**
3977 * find_process_by_pid - find a process with a matching PID value.
3978 * @pid: the pid in question.
3979 */
36c8b586 3980static inline struct task_struct *find_process_by_pid(pid_t pid)
1da177e4
LT
3981{
3982 return pid ? find_task_by_pid(pid) : current;
3983}
3984
3985/* Actually do priority change: must hold rq lock. */
dd41f596
IM
3986static void
3987__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
1da177e4 3988{
dd41f596 3989 BUG_ON(p->se.on_rq);
48f24c4d 3990
1da177e4 3991 p->policy = policy;
dd41f596
IM
3992 switch (p->policy) {
3993 case SCHED_NORMAL:
3994 case SCHED_BATCH:
3995 case SCHED_IDLE:
3996 p->sched_class = &fair_sched_class;
3997 break;
3998 case SCHED_FIFO:
3999 case SCHED_RR:
4000 p->sched_class = &rt_sched_class;
4001 break;
4002 }
4003
1da177e4 4004 p->rt_priority = prio;
b29739f9
IM
4005 p->normal_prio = normal_prio(p);
4006 /* we are holding p->pi_lock already */
4007 p->prio = rt_mutex_getprio(p);
2dd73a4f 4008 set_load_weight(p);
1da177e4
LT
4009}
4010
4011/**
72fd4a35 4012 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
1da177e4
LT
4013 * @p: the task in question.
4014 * @policy: new policy.
4015 * @param: structure containing the new RT priority.
5fe1d75f 4016 *
72fd4a35 4017 * NOTE that the task may already be dead.
1da177e4 4018 */
95cdf3b7
IM
4019int sched_setscheduler(struct task_struct *p, int policy,
4020 struct sched_param *param)
1da177e4 4021{
dd41f596 4022 int retval, oldprio, oldpolicy = -1, on_rq;
1da177e4 4023 unsigned long flags;
70b97a7f 4024 struct rq *rq;
1da177e4 4025
66e5393a
SR
4026 /* may grab non-irq protected spin_locks */
4027 BUG_ON(in_interrupt());
1da177e4
LT
4028recheck:
4029 /* double check policy once rq lock held */
4030 if (policy < 0)
4031 policy = oldpolicy = p->policy;
4032 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
dd41f596
IM
4033 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4034 policy != SCHED_IDLE)
b0a9499c 4035 return -EINVAL;
1da177e4
LT
4036 /*
4037 * Valid priorities for SCHED_FIFO and SCHED_RR are
dd41f596
IM
4038 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4039 * SCHED_BATCH and SCHED_IDLE is 0.
1da177e4
LT
4040 */
4041 if (param->sched_priority < 0 ||
95cdf3b7 4042 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
d46523ea 4043 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
1da177e4 4044 return -EINVAL;
e05606d3 4045 if (rt_policy(policy) != (param->sched_priority != 0))
1da177e4
LT
4046 return -EINVAL;
4047
37e4ab3f
OC
4048 /*
4049 * Allow unprivileged RT tasks to decrease priority:
4050 */
4051 if (!capable(CAP_SYS_NICE)) {
e05606d3 4052 if (rt_policy(policy)) {
8dc3e909 4053 unsigned long rlim_rtprio;
8dc3e909
ON
4054
4055 if (!lock_task_sighand(p, &flags))
4056 return -ESRCH;
4057 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
4058 unlock_task_sighand(p, &flags);
4059
4060 /* can't set/change the rt policy */
4061 if (policy != p->policy && !rlim_rtprio)
4062 return -EPERM;
4063
4064 /* can't increase priority */
4065 if (param->sched_priority > p->rt_priority &&
4066 param->sched_priority > rlim_rtprio)
4067 return -EPERM;
4068 }
dd41f596
IM
4069 /*
4070 * Like positive nice levels, don't allow tasks to
4071 * move out of SCHED_IDLE either:
4072 */
4073 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4074 return -EPERM;
5fe1d75f 4075
37e4ab3f
OC
4076 /* can't change other user's priorities */
4077 if ((current->euid != p->euid) &&
4078 (current->euid != p->uid))
4079 return -EPERM;
4080 }
1da177e4
LT
4081
4082 retval = security_task_setscheduler(p, policy, param);
4083 if (retval)
4084 return retval;
b29739f9
IM
4085 /*
4086 * make sure no PI-waiters arrive (or leave) while we are
4087 * changing the priority of the task:
4088 */
4089 spin_lock_irqsave(&p->pi_lock, flags);
1da177e4
LT
4090 /*
4091 * To be able to change p->policy safely, the appropriate
4092 * runqueue lock must be held.
4093 */
b29739f9 4094 rq = __task_rq_lock(p);
1da177e4
LT
4095 /* recheck policy now with rq lock held */
4096 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4097 policy = oldpolicy = -1;
b29739f9
IM
4098 __task_rq_unlock(rq);
4099 spin_unlock_irqrestore(&p->pi_lock, flags);
1da177e4
LT
4100 goto recheck;
4101 }
dd41f596
IM
4102 on_rq = p->se.on_rq;
4103 if (on_rq)
4104 deactivate_task(rq, p, 0);
1da177e4 4105 oldprio = p->prio;
dd41f596
IM
4106 __setscheduler(rq, p, policy, param->sched_priority);
4107 if (on_rq) {
4108 activate_task(rq, p, 0);
1da177e4
LT
4109 /*
4110 * Reschedule if we are currently running on this runqueue and
d5f9f942
AM
4111 * our priority decreased, or if we are not currently running on
4112 * this runqueue and our priority is higher than the current's
1da177e4 4113 */
d5f9f942
AM
4114 if (task_running(rq, p)) {
4115 if (p->prio > oldprio)
4116 resched_task(rq->curr);
dd41f596
IM
4117 } else {
4118 check_preempt_curr(rq, p);
4119 }
1da177e4 4120 }
b29739f9
IM
4121 __task_rq_unlock(rq);
4122 spin_unlock_irqrestore(&p->pi_lock, flags);
4123
95e02ca9
TG
4124 rt_mutex_adjust_pi(p);
4125
1da177e4
LT
4126 return 0;
4127}
4128EXPORT_SYMBOL_GPL(sched_setscheduler);
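/*
 * Usage sketch (illustrative only; "p" stands for a task the caller
 * already holds a reference to): an in-kernel user promoting a helper
 * thread to realtime might do:
 *
 *	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
 *
 *	if (sched_setscheduler(p, SCHED_FIFO, &param))
 *		printk(KERN_WARNING "could not set SCHED_FIFO\n");
 */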
4129
95cdf3b7
IM
4130static int
4131do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
1da177e4 4132{
1da177e4
LT
4133 struct sched_param lparam;
4134 struct task_struct *p;
36c8b586 4135 int retval;
1da177e4
LT
4136
4137 if (!param || pid < 0)
4138 return -EINVAL;
4139 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4140 return -EFAULT;
5fe1d75f
ON
4141
4142 rcu_read_lock();
4143 retval = -ESRCH;
1da177e4 4144 p = find_process_by_pid(pid);
5fe1d75f
ON
4145 if (p != NULL)
4146 retval = sched_setscheduler(p, policy, &lparam);
4147 rcu_read_unlock();
36c8b586 4148
1da177e4
LT
4149 return retval;
4150}
4151
4152/**
4153 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4154 * @pid: the pid in question.
4155 * @policy: new policy.
4156 * @param: structure containing the new RT priority.
4157 */
4158asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
4159 struct sched_param __user *param)
4160{
c21761f1
JB
4161 /* negative values for policy are not valid */
4162 if (policy < 0)
4163 return -EINVAL;
4164
1da177e4
LT
4165 return do_sched_setscheduler(pid, policy, param);
4166}
4167
4168/**
4169 * sys_sched_setparam - set/change the RT priority of a thread
4170 * @pid: the pid in question.
4171 * @param: structure containing the new RT priority.
4172 */
4173asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
4174{
4175 return do_sched_setscheduler(pid, -1, param);
4176}
4177
4178/**
4179 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4180 * @pid: the pid in question.
4181 */
4182asmlinkage long sys_sched_getscheduler(pid_t pid)
4183{
36c8b586 4184 struct task_struct *p;
1da177e4 4185 int retval = -EINVAL;
1da177e4
LT
4186
4187 if (pid < 0)
4188 goto out_nounlock;
4189
4190 retval = -ESRCH;
4191 read_lock(&tasklist_lock);
4192 p = find_process_by_pid(pid);
4193 if (p) {
4194 retval = security_task_getscheduler(p);
4195 if (!retval)
4196 retval = p->policy;
4197 }
4198 read_unlock(&tasklist_lock);
4199
4200out_nounlock:
4201 return retval;
4202}
4203
4204/**
4205 * sys_sched_getparam - get the RT priority of a thread
4206 * @pid: the pid in question.
4207 * @param: structure containing the RT priority.
4208 */
4209asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4210{
4211 struct sched_param lp;
36c8b586 4212 struct task_struct *p;
1da177e4 4213 int retval = -EINVAL;
1da177e4
LT
4214
4215 if (!param || pid < 0)
4216 goto out_nounlock;
4217
4218 read_lock(&tasklist_lock);
4219 p = find_process_by_pid(pid);
4220 retval = -ESRCH;
4221 if (!p)
4222 goto out_unlock;
4223
4224 retval = security_task_getscheduler(p);
4225 if (retval)
4226 goto out_unlock;
4227
4228 lp.sched_priority = p->rt_priority;
4229 read_unlock(&tasklist_lock);
4230
4231 /*
4232 * This one might sleep, we cannot do it with a spinlock held ...
4233 */
4234 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4235
4236out_nounlock:
4237 return retval;
4238
4239out_unlock:
4240 read_unlock(&tasklist_lock);
4241 return retval;
4242}
4243
4244long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4245{
1da177e4 4246 cpumask_t cpus_allowed;
36c8b586
IM
4247 struct task_struct *p;
4248 int retval;
1da177e4 4249
5be9361c 4250 mutex_lock(&sched_hotcpu_mutex);
1da177e4
LT
4251 read_lock(&tasklist_lock);
4252
4253 p = find_process_by_pid(pid);
4254 if (!p) {
4255 read_unlock(&tasklist_lock);
5be9361c 4256 mutex_unlock(&sched_hotcpu_mutex);
1da177e4
LT
4257 return -ESRCH;
4258 }
4259
4260 /*
4261 * It is not safe to call set_cpus_allowed with the
4262 * tasklist_lock held. We will bump the task_struct's
4263 * usage count and then drop tasklist_lock.
4264 */
4265 get_task_struct(p);
4266 read_unlock(&tasklist_lock);
4267
4268 retval = -EPERM;
4269 if ((current->euid != p->euid) && (current->euid != p->uid) &&
4270 !capable(CAP_SYS_NICE))
4271 goto out_unlock;
4272
e7834f8f
DQ
4273 retval = security_task_setscheduler(p, 0, NULL);
4274 if (retval)
4275 goto out_unlock;
4276
1da177e4
LT
4277 cpus_allowed = cpuset_cpus_allowed(p);
4278 cpus_and(new_mask, new_mask, cpus_allowed);
4279 retval = set_cpus_allowed(p, new_mask);
4280
4281out_unlock:
4282 put_task_struct(p);
5be9361c 4283 mutex_unlock(&sched_hotcpu_mutex);
1da177e4
LT
4284 return retval;
4285}
4286
4287static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4288 cpumask_t *new_mask)
4289{
4290 if (len < sizeof(cpumask_t)) {
4291 memset(new_mask, 0, sizeof(cpumask_t));
4292 } else if (len > sizeof(cpumask_t)) {
4293 len = sizeof(cpumask_t);
4294 }
4295 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4296}
4297
4298/**
4299 * sys_sched_setaffinity - set the cpu affinity of a process
4300 * @pid: pid of the process
4301 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4302 * @user_mask_ptr: user-space pointer to the new cpu mask
4303 */
4304asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
4305 unsigned long __user *user_mask_ptr)
4306{
4307 cpumask_t new_mask;
4308 int retval;
4309
4310 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
4311 if (retval)
4312 return retval;
4313
4314 return sched_setaffinity(pid, new_mask);
4315}
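/*
 * Userspace view (illustrative only): this syscall is normally reached
 * through glibc's sched_setaffinity(); pinning the calling process to
 * CPU 2 could look like:
 *
 *	cpu_set_t set;
 *
 *	CPU_ZERO(&set);
 *	CPU_SET(2, &set);
 *	if (sched_setaffinity(0, sizeof(set), &set) < 0)
 *		perror("sched_setaffinity");
 */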
4316
4317/*
4318 * Represents all CPUs present in the system.
4319 * In systems capable of hotplug, this map could dynamically grow
4320 * as new CPUs are detected in the system via any platform-specific
4321 * method, such as ACPI.
4322 */
4323
4cef0c61 4324cpumask_t cpu_present_map __read_mostly;
1da177e4
LT
4325EXPORT_SYMBOL(cpu_present_map);
4326
4327#ifndef CONFIG_SMP
4cef0c61 4328cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
e16b38f7
GB
4329EXPORT_SYMBOL(cpu_online_map);
4330
4cef0c61 4331cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
e16b38f7 4332EXPORT_SYMBOL(cpu_possible_map);
1da177e4
LT
4333#endif
4334
4335long sched_getaffinity(pid_t pid, cpumask_t *mask)
4336{
36c8b586 4337 struct task_struct *p;
1da177e4 4338 int retval;
1da177e4 4339
5be9361c 4340 mutex_lock(&sched_hotcpu_mutex);
1da177e4
LT
4341 read_lock(&tasklist_lock);
4342
4343 retval = -ESRCH;
4344 p = find_process_by_pid(pid);
4345 if (!p)
4346 goto out_unlock;
4347
e7834f8f
DQ
4348 retval = security_task_getscheduler(p);
4349 if (retval)
4350 goto out_unlock;
4351
2f7016d9 4352 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
1da177e4
LT
4353
4354out_unlock:
4355 read_unlock(&tasklist_lock);
5be9361c 4356 mutex_unlock(&sched_hotcpu_mutex);
1da177e4
LT
4357 if (retval)
4358 return retval;
4359
4360 return 0;
4361}
4362
4363/**
4364 * sys_sched_getaffinity - get the cpu affinity of a process
4365 * @pid: pid of the process
4366 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4367 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4368 */
4369asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4370 unsigned long __user *user_mask_ptr)
4371{
4372 int ret;
4373 cpumask_t mask;
4374
4375 if (len < sizeof(cpumask_t))
4376 return -EINVAL;
4377
4378 ret = sched_getaffinity(pid, &mask);
4379 if (ret < 0)
4380 return ret;
4381
4382 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
4383 return -EFAULT;
4384
4385 return sizeof(cpumask_t);
4386}
4387
4388/**
4389 * sys_sched_yield - yield the current processor to other threads.
4390 *
dd41f596
IM
4391 * This function yields the current CPU to other tasks. If there are no
4392 * other threads running on this CPU then this function will return.
1da177e4
LT
4393 */
4394asmlinkage long sys_sched_yield(void)
4395{
70b97a7f 4396 struct rq *rq = this_rq_lock();
1da177e4
LT
4397
4398 schedstat_inc(rq, yld_cnt);
dd41f596 4399 if (unlikely(rq->nr_running == 1))
1da177e4 4400 schedstat_inc(rq, yld_act_empty);
dd41f596
IM
4401 else
4402 current->sched_class->yield_task(rq, current);
1da177e4
LT
4403
4404 /*
4405 * Since we are going to call schedule() anyway, there's
4406 * no need to preempt or enable interrupts:
4407 */
4408 __release(rq->lock);
8a25d5de 4409 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
1da177e4
LT
4410 _raw_spin_unlock(&rq->lock);
4411 preempt_enable_no_resched();
4412
4413 schedule();
4414
4415 return 0;
4416}
4417
e7b38404 4418static void __cond_resched(void)
1da177e4 4419{
8e0a43d8
IM
4420#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4421 __might_sleep(__FILE__, __LINE__);
4422#endif
5bbcfd90
IM
4423 /*
4424 * The BKS might be reacquired before we have dropped
4425 * PREEMPT_ACTIVE, which could trigger a second
4426 * cond_resched() call.
4427 */
1da177e4
LT
4428 do {
4429 add_preempt_count(PREEMPT_ACTIVE);
4430 schedule();
4431 sub_preempt_count(PREEMPT_ACTIVE);
4432 } while (need_resched());
4433}
4434
4435int __sched cond_resched(void)
4436{
9414232f
IM
4437 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
4438 system_state == SYSTEM_RUNNING) {
1da177e4
LT
4439 __cond_resched();
4440 return 1;
4441 }
4442 return 0;
4443}
1da177e4
LT
4444EXPORT_SYMBOL(cond_resched);
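/*
 * Usage sketch (illustrative only; process_item() is a made-up helper):
 * long-running kernel loops that hold no locks sprinkle cond_resched()
 * so that, on !CONFIG_PREEMPT kernels, higher-priority tasks still get
 * to run in a timely fashion:
 *
 *	for (i = 0; i < nr_items; i++) {
 *		process_item(i);
 *		cond_resched();
 *	}
 */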
4445
4446/*
4447 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
4448 * call schedule, and on return reacquire the lock.
4449 *
4450 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4451 * operations here to prevent schedule() from being called twice (once via
4452 * spin_unlock(), once by hand).
4453 */
95cdf3b7 4454int cond_resched_lock(spinlock_t *lock)
1da177e4 4455{
6df3cecb
JK
4456 int ret = 0;
4457
1da177e4
LT
4458 if (need_lockbreak(lock)) {
4459 spin_unlock(lock);
4460 cpu_relax();
6df3cecb 4461 ret = 1;
1da177e4
LT
4462 spin_lock(lock);
4463 }
9414232f 4464 if (need_resched() && system_state == SYSTEM_RUNNING) {
8a25d5de 4465 spin_release(&lock->dep_map, 1, _THIS_IP_);
1da177e4
LT
4466 _raw_spin_unlock(lock);
4467 preempt_enable_no_resched();
4468 __cond_resched();
6df3cecb 4469 ret = 1;
1da177e4 4470 spin_lock(lock);
1da177e4 4471 }
6df3cecb 4472 return ret;
1da177e4 4473}
1da177e4
LT
4474EXPORT_SYMBOL(cond_resched_lock);
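/*
 * Usage sketch (illustrative only; my_lock, more_work_to_do(),
 * do_one_chunk() and revalidate_state() are made-up names): because a
 * non-zero return means the lock was dropped and re-taken, anything
 * derived from the protected data must be revalidated afterwards:
 *
 *	spin_lock(&my_lock);
 *	while (more_work_to_do()) {
 *		do_one_chunk();
 *		if (cond_resched_lock(&my_lock))
 *			revalidate_state();	lock was dropped meanwhile
 *	}
 *	spin_unlock(&my_lock);
 */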
4475
4476int __sched cond_resched_softirq(void)
4477{
4478 BUG_ON(!in_softirq());
4479
9414232f 4480 if (need_resched() && system_state == SYSTEM_RUNNING) {
98d82567 4481 local_bh_enable();
1da177e4
LT
4482 __cond_resched();
4483 local_bh_disable();
4484 return 1;
4485 }
4486 return 0;
4487}
1da177e4
LT
4488EXPORT_SYMBOL(cond_resched_softirq);
4489
1da177e4
LT
4490/**
4491 * yield - yield the current processor to other threads.
4492 *
72fd4a35 4493 * This is a shortcut for kernel-space yielding - it marks the
1da177e4
LT
4494 * thread runnable and calls sys_sched_yield().
4495 */
4496void __sched yield(void)
4497{
4498 set_current_state(TASK_RUNNING);
4499 sys_sched_yield();
4500}
1da177e4
LT
4501EXPORT_SYMBOL(yield);
4502
4503/*
4504 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4505 * that process accounting knows that this is a task in IO wait state.
4506 *
4507 * But don't do that if it is a deliberate, throttling IO wait (this task
4508 * has set its backing_dev_info: the queue against which it should throttle)
4509 */
4510void __sched io_schedule(void)
4511{
70b97a7f 4512 struct rq *rq = &__raw_get_cpu_var(runqueues);
1da177e4 4513
0ff92245 4514 delayacct_blkio_start();
1da177e4
LT
4515 atomic_inc(&rq->nr_iowait);
4516 schedule();
4517 atomic_dec(&rq->nr_iowait);
0ff92245 4518 delayacct_blkio_end();
1da177e4 4519}
1da177e4
LT
4520EXPORT_SYMBOL(io_schedule);
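/*
 * Usage sketch (illustrative only; my_io_wq and my_io_done() are made
 * up): callers waiting for disk I/O use io_schedule() instead of a bare
 * schedule() so the sleep time shows up in iowait accounting:
 *
 *	DEFINE_WAIT(wait);
 *
 *	while (!my_io_done()) {
 *		prepare_to_wait(&my_io_wq, &wait, TASK_UNINTERRUPTIBLE);
 *		if (!my_io_done())
 *			io_schedule();
 *	}
 *	finish_wait(&my_io_wq, &wait);
 */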
4521
4522long __sched io_schedule_timeout(long timeout)
4523{
70b97a7f 4524 struct rq *rq = &__raw_get_cpu_var(runqueues);
1da177e4
LT
4525 long ret;
4526
0ff92245 4527 delayacct_blkio_start();
1da177e4
LT
4528 atomic_inc(&rq->nr_iowait);
4529 ret = schedule_timeout(timeout);
4530 atomic_dec(&rq->nr_iowait);
0ff92245 4531 delayacct_blkio_end();
1da177e4
LT
4532 return ret;
4533}
4534
4535/**
4536 * sys_sched_get_priority_max - return maximum RT priority.
4537 * @policy: scheduling class.
4538 *
4539 * this syscall returns the maximum rt_priority that can be used
4540 * by a given scheduling class.
4541 */
4542asmlinkage long sys_sched_get_priority_max(int policy)
4543{
4544 int ret = -EINVAL;
4545
4546 switch (policy) {
4547 case SCHED_FIFO:
4548 case SCHED_RR:
4549 ret = MAX_USER_RT_PRIO-1;
4550 break;
4551 case SCHED_NORMAL:
b0a9499c 4552 case SCHED_BATCH:
dd41f596 4553 case SCHED_IDLE:
1da177e4
LT
4554 ret = 0;
4555 break;
4556 }
4557 return ret;
4558}
4559
4560/**
4561 * sys_sched_get_priority_min - return minimum RT priority.
4562 * @policy: scheduling class.
4563 *
4564 * this syscall returns the minimum rt_priority that can be used
4565 * by a given scheduling class.
4566 */
4567asmlinkage long sys_sched_get_priority_min(int policy)
4568{
4569 int ret = -EINVAL;
4570
4571 switch (policy) {
4572 case SCHED_FIFO:
4573 case SCHED_RR:
4574 ret = 1;
4575 break;
4576 case SCHED_NORMAL:
b0a9499c 4577 case SCHED_BATCH:
dd41f596 4578 case SCHED_IDLE:
1da177e4
LT
4579 ret = 0;
4580 }
4581 return ret;
4582}
4583
4584/**
4585 * sys_sched_rr_get_interval - return the default timeslice of a process.
4586 * @pid: pid of the process.
4587 * @interval: userspace pointer to the timeslice value.
4588 *
4589 * this syscall writes the default timeslice value of a given process
4590 * into the user-space timespec buffer. A value of '0' means infinity.
4591 */
4592asmlinkage
4593long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4594{
36c8b586 4595 struct task_struct *p;
1da177e4
LT
4596 int retval = -EINVAL;
4597 struct timespec t;
1da177e4
LT
4598
4599 if (pid < 0)
4600 goto out_nounlock;
4601
4602 retval = -ESRCH;
4603 read_lock(&tasklist_lock);
4604 p = find_process_by_pid(pid);
4605 if (!p)
4606 goto out_unlock;
4607
4608 retval = security_task_getscheduler(p);
4609 if (retval)
4610 goto out_unlock;
4611
b78709cf 4612 jiffies_to_timespec(p->policy == SCHED_FIFO ?
dd41f596 4613 0 : static_prio_timeslice(p->static_prio), &t);
1da177e4
LT
4614 read_unlock(&tasklist_lock);
4615 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4616out_nounlock:
4617 return retval;
4618out_unlock:
4619 read_unlock(&tasklist_lock);
4620 return retval;
4621}
4622
2ed6e34f 4623static const char stat_nam[] = "RSDTtZX";
36c8b586
IM
4624
4625static void show_task(struct task_struct *p)
1da177e4 4626{
1da177e4 4627 unsigned long free = 0;
36c8b586 4628 unsigned state;
1da177e4 4629
1da177e4 4630 state = p->state ? __ffs(p->state) + 1 : 0;
2ed6e34f
AM
4631 printk("%-13.13s %c", p->comm,
4632 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
1da177e4
LT
4633#if (BITS_PER_LONG == 32)
4634 if (state == TASK_RUNNING)
4635 printk(" running ");
4636 else
4637 printk(" %08lX ", thread_saved_pc(p));
4638#else
4639 if (state == TASK_RUNNING)
4640 printk(" running task ");
4641 else
4642 printk(" %016lx ", thread_saved_pc(p));
4643#endif
4644#ifdef CONFIG_DEBUG_STACK_USAGE
4645 {
10ebffde 4646 unsigned long *n = end_of_stack(p);
1da177e4
LT
4647 while (!*n)
4648 n++;
10ebffde 4649 free = (unsigned long)n - (unsigned long)end_of_stack(p);
1da177e4
LT
4650 }
4651#endif
35f6f753 4652 printk("%5lu %5d %6d", free, p->pid, p->parent->pid);
1da177e4
LT
4653 if (!p->mm)
4654 printk(" (L-TLB)\n");
4655 else
4656 printk(" (NOTLB)\n");
4657
4658 if (state != TASK_RUNNING)
4659 show_stack(p, NULL);
4660}
4661
e59e2ae2 4662void show_state_filter(unsigned long state_filter)
1da177e4 4663{
36c8b586 4664 struct task_struct *g, *p;
1da177e4
LT
4665
4666#if (BITS_PER_LONG == 32)
4667 printk("\n"
301827ac
CC
4668 " free sibling\n");
4669 printk(" task PC stack pid father child younger older\n");
1da177e4
LT
4670#else
4671 printk("\n"
301827ac
CC
4672 " free sibling\n");
4673 printk(" task PC stack pid father child younger older\n");
1da177e4
LT
4674#endif
4675 read_lock(&tasklist_lock);
4676 do_each_thread(g, p) {
4677 /*
4678 * reset the NMI-timeout, listing all files on a slow
4679 * console might take alot of time:
4680 */
4681 touch_nmi_watchdog();
39bc89fd 4682 if (!state_filter || (p->state & state_filter))
e59e2ae2 4683 show_task(p);
1da177e4
LT
4684 } while_each_thread(g, p);
4685
04c9167f
JF
4686 touch_all_softlockup_watchdogs();
4687
dd41f596
IM
4688#ifdef CONFIG_SCHED_DEBUG
4689 sysrq_sched_debug_show();
4690#endif
1da177e4 4691 read_unlock(&tasklist_lock);
e59e2ae2
IM
4692 /*
4693 * Only show locks if all tasks are dumped:
4694 */
4695 if (state_filter == -1)
4696 debug_show_all_locks();
1da177e4
LT
4697}
4698
1df21055
IM
4699void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4700{
dd41f596 4701 idle->sched_class = &idle_sched_class;
1df21055
IM
4702}
4703
f340c0d1
IM
4704/**
4705 * init_idle - set up an idle thread for a given CPU
4706 * @idle: task in question
4707 * @cpu: cpu the idle task belongs to
4708 *
4709 * NOTE: this function does not set the idle thread's NEED_RESCHED
4710 * flag, to make booting more robust.
4711 */
5c1e1767 4712void __cpuinit init_idle(struct task_struct *idle, int cpu)
1da177e4 4713{
70b97a7f 4714 struct rq *rq = cpu_rq(cpu);
1da177e4
LT
4715 unsigned long flags;
4716
dd41f596
IM
4717 __sched_fork(idle);
4718 idle->se.exec_start = sched_clock();
4719
b29739f9 4720 idle->prio = idle->normal_prio = MAX_PRIO;
1da177e4 4721 idle->cpus_allowed = cpumask_of_cpu(cpu);
dd41f596 4722 __set_task_cpu(idle, cpu);
1da177e4
LT
4723
4724 spin_lock_irqsave(&rq->lock, flags);
4725 rq->curr = rq->idle = idle;
4866cde0
NP
4726#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
4727 idle->oncpu = 1;
4728#endif
1da177e4
LT
4729 spin_unlock_irqrestore(&rq->lock, flags);
4730
4731 /* Set the preempt count _outside_ the spinlocks! */
4732#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
a1261f54 4733 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
1da177e4 4734#else
a1261f54 4735 task_thread_info(idle)->preempt_count = 0;
1da177e4 4736#endif
dd41f596
IM
4737 /*
4738 * The idle tasks have their own, simple scheduling class:
4739 */
4740 idle->sched_class = &idle_sched_class;
1da177e4
LT
4741}
4742
4743/*
4744 * In a system that switches off the HZ timer nohz_cpu_mask
4745 * indicates which cpus entered this state. This is used
4747 * in the rcu update to wait only for active cpus. For systems
4747 * which do not switch off the HZ timer nohz_cpu_mask should
4748 * always be CPU_MASK_NONE.
4749 */
4750cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4751
dd41f596
IM
4752/*
4753 * Increase the granularity value when there are more CPUs,
4754 * because with more CPUs the 'effective latency' as visible
4755 * to users decreases. But the relationship is not linear,
4756 * so pick a second-best guess by going with the log2 of the
4757 * number of CPUs.
4758 *
4759 * This idea comes from the SD scheduler of Con Kolivas:
4760 */
4761static inline void sched_init_granularity(void)
4762{
4763 unsigned int factor = 1 + ilog2(num_online_cpus());
4764 const unsigned long gran_limit = 10000000;
4765
4766 sysctl_sched_granularity *= factor;
4767 if (sysctl_sched_granularity > gran_limit)
4768 sysctl_sched_granularity = gran_limit;
4769
4770 sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
4771 sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
4772}
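/*
 * Worked example: on an 8-CPU box ilog2(8) == 3, so factor == 4 and the
 * base granularity is multiplied by four (then clamped to the 10ms
 * gran_limit of 10000000ns); the runtime limit and wakeup granularity
 * are then derived from that scaled value as 4x and 1/2x respectively.
 */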
4773
1da177e4
LT
4774#ifdef CONFIG_SMP
4775/*
4776 * This is how migration works:
4777 *
70b97a7f 4778 * 1) we queue a struct migration_req structure in the source CPU's
1da177e4
LT
4779 * runqueue and wake up that CPU's migration thread.
4780 * 2) we wait on the request's completion => thread blocks.
4781 * 3) migration thread wakes up (implicitly it forces the migrated
4782 * thread off the CPU)
4783 * 4) it gets the migration request and checks whether the migrated
4784 * task is still in the wrong runqueue.
4785 * 5) if it's in the wrong runqueue then the migration thread removes
4786 * it and puts it into the right queue.
4787 * 6) migration thread signals the request's completion.
4788 * 7) we wake up and the migration is done.
4789 */
4790
4791/*
4792 * Change a given task's CPU affinity. Migrate the thread to a
4793 * proper CPU and schedule it away if the CPU it's executing on
4794 * is removed from the allowed bitmask.
4795 *
4796 * NOTE: the caller must have a valid reference to the task, the
4797 * task must not exit() & deallocate itself prematurely. The
4798 * call is not atomic; no spinlocks may be held.
4799 */
36c8b586 4800int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
1da177e4 4801{
70b97a7f 4802 struct migration_req req;
1da177e4 4803 unsigned long flags;
70b97a7f 4804 struct rq *rq;
48f24c4d 4805 int ret = 0;
1da177e4
LT
4806
4807 rq = task_rq_lock(p, &flags);
4808 if (!cpus_intersects(new_mask, cpu_online_map)) {
4809 ret = -EINVAL;
4810 goto out;
4811 }
4812
4813 p->cpus_allowed = new_mask;
4814 /* Can the task run on the task's current CPU? If so, we're done */
4815 if (cpu_isset(task_cpu(p), new_mask))
4816 goto out;
4817
4818 if (migrate_task(p, any_online_cpu(new_mask), &req)) {
4819 /* Need help from migration thread: drop lock and wait. */
4820 task_rq_unlock(rq, &flags);
4821 wake_up_process(rq->migration_thread);
4822 wait_for_completion(&req.done);
4823 tlb_migrate_finish(p->mm);
4824 return 0;
4825 }
4826out:
4827 task_rq_unlock(rq, &flags);
48f24c4d 4828
1da177e4
LT
4829 return ret;
4830}
1da177e4
LT
4831EXPORT_SYMBOL_GPL(set_cpus_allowed);
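/*
 * Usage sketch (illustrative only; "p" is a task the caller holds a
 * reference to): restricting an already-running kernel thread to a
 * single CPU could look like:
 *
 *	if (set_cpus_allowed(p, cpumask_of_cpu(cpu)))
 *		printk(KERN_WARNING "CPU %d not online?\n", cpu);
 */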
4832
4833/*
4834 * Move (not current) task off this cpu, onto dest cpu. We're doing
4835 * this because either it can't run here any more (set_cpus_allowed()
4836 * away from this CPU, or CPU going down), or because we're
4837 * attempting to rebalance this task on exec (sched_exec).
4838 *
4839 * So we race with normal scheduler movements, but that's OK, as long
4840 * as the task is no longer on this CPU.
efc30814
KK
4841 *
4842 * Returns non-zero if task was successfully migrated.
1da177e4 4843 */
efc30814 4844static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
1da177e4 4845{
70b97a7f 4846 struct rq *rq_dest, *rq_src;
dd41f596 4847 int ret = 0, on_rq;
1da177e4
LT
4848
4849 if (unlikely(cpu_is_offline(dest_cpu)))
efc30814 4850 return ret;
1da177e4
LT
4851
4852 rq_src = cpu_rq(src_cpu);
4853 rq_dest = cpu_rq(dest_cpu);
4854
4855 double_rq_lock(rq_src, rq_dest);
4856 /* Already moved. */
4857 if (task_cpu(p) != src_cpu)
4858 goto out;
4859 /* Affinity changed (again). */
4860 if (!cpu_isset(dest_cpu, p->cpus_allowed))
4861 goto out;
4862
dd41f596
IM
4863 on_rq = p->se.on_rq;
4864 if (on_rq)
4865 deactivate_task(rq_src, p, 0);
1da177e4 4866 set_task_cpu(p, dest_cpu);
dd41f596
IM
4867 if (on_rq) {
4868 activate_task(rq_dest, p, 0);
4869 check_preempt_curr(rq_dest, p);
1da177e4 4870 }
efc30814 4871 ret = 1;
1da177e4
LT
4872out:
4873 double_rq_unlock(rq_src, rq_dest);
efc30814 4874 return ret;
1da177e4
LT
4875}
4876
4877/*
4878 * migration_thread - this is a highprio system thread that performs
4879 * thread migration by bumping thread off CPU then 'pushing' onto
4880 * another runqueue.
4881 */
95cdf3b7 4882static int migration_thread(void *data)
1da177e4 4883{
1da177e4 4884 int cpu = (long)data;
70b97a7f 4885 struct rq *rq;
1da177e4
LT
4886
4887 rq = cpu_rq(cpu);
4888 BUG_ON(rq->migration_thread != current);
4889
4890 set_current_state(TASK_INTERRUPTIBLE);
4891 while (!kthread_should_stop()) {
70b97a7f 4892 struct migration_req *req;
1da177e4 4893 struct list_head *head;
1da177e4 4894
3e1d1d28 4895 try_to_freeze();
1da177e4
LT
4896
4897 spin_lock_irq(&rq->lock);
4898
4899 if (cpu_is_offline(cpu)) {
4900 spin_unlock_irq(&rq->lock);
4901 goto wait_to_die;
4902 }
4903
4904 if (rq->active_balance) {
4905 active_load_balance(rq, cpu);
4906 rq->active_balance = 0;
4907 }
4908
4909 head = &rq->migration_queue;
4910
4911 if (list_empty(head)) {
4912 spin_unlock_irq(&rq->lock);
4913 schedule();
4914 set_current_state(TASK_INTERRUPTIBLE);
4915 continue;
4916 }
70b97a7f 4917 req = list_entry(head->next, struct migration_req, list);
1da177e4
LT
4918 list_del_init(head->next);
4919
674311d5
NP
4920 spin_unlock(&rq->lock);
4921 __migrate_task(req->task, cpu, req->dest_cpu);
4922 local_irq_enable();
1da177e4
LT
4923
4924 complete(&req->done);
4925 }
4926 __set_current_state(TASK_RUNNING);
4927 return 0;
4928
4929wait_to_die:
4930 /* Wait for kthread_stop */
4931 set_current_state(TASK_INTERRUPTIBLE);
4932 while (!kthread_should_stop()) {
4933 schedule();
4934 set_current_state(TASK_INTERRUPTIBLE);
4935 }
4936 __set_current_state(TASK_RUNNING);
4937 return 0;
4938}
4939
4940#ifdef CONFIG_HOTPLUG_CPU
054b9108
KK
4941/*
4942 * Figure out where a task on the dead CPU should go, use force if necessary.
4943 * NOTE: interrupts should be disabled by the caller
4944 */
48f24c4d 4945static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
1da177e4 4946{
efc30814 4947 unsigned long flags;
1da177e4 4948 cpumask_t mask;
70b97a7f
IM
4949 struct rq *rq;
4950 int dest_cpu;
1da177e4 4951
efc30814 4952restart:
1da177e4
LT
4953 /* On same node? */
4954 mask = node_to_cpumask(cpu_to_node(dead_cpu));
48f24c4d 4955 cpus_and(mask, mask, p->cpus_allowed);
1da177e4
LT
4956 dest_cpu = any_online_cpu(mask);
4957
4958 /* On any allowed CPU? */
4959 if (dest_cpu == NR_CPUS)
48f24c4d 4960 dest_cpu = any_online_cpu(p->cpus_allowed);
1da177e4
LT
4961
4962 /* No more Mr. Nice Guy. */
4963 if (dest_cpu == NR_CPUS) {
48f24c4d
IM
4964 rq = task_rq_lock(p, &flags);
4965 cpus_setall(p->cpus_allowed);
4966 dest_cpu = any_online_cpu(p->cpus_allowed);
efc30814 4967 task_rq_unlock(rq, &flags);
1da177e4
LT
4968
4969 /*
4970 * Don't tell them about moving exiting tasks or
4971 * kernel threads (both mm NULL), since they never
4972 * leave the kernel.
4973 */
48f24c4d 4974 if (p->mm && printk_ratelimit())
1da177e4
LT
4975 printk(KERN_INFO "process %d (%s) no "
4976 "longer affine to cpu%d\n",
48f24c4d 4977 p->pid, p->comm, dead_cpu);
1da177e4 4978 }
48f24c4d 4979 if (!__migrate_task(p, dead_cpu, dest_cpu))
efc30814 4980 goto restart;
1da177e4
LT
4981}
4982
4983/*
4984 * While a dead CPU has no uninterruptible tasks queued at this point,
4985 * it might still have a nonzero ->nr_uninterruptible counter, because
4986 * for performance reasons the counter is not strictly tracking tasks to
4987 * their home CPUs. So we just add the counter to another CPU's counter,
4988 * to keep the global sum constant after CPU-down:
4989 */
70b97a7f 4990static void migrate_nr_uninterruptible(struct rq *rq_src)
1da177e4 4991{
70b97a7f 4992 struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
1da177e4
LT
4993 unsigned long flags;
4994
4995 local_irq_save(flags);
4996 double_rq_lock(rq_src, rq_dest);
4997 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
4998 rq_src->nr_uninterruptible = 0;
4999 double_rq_unlock(rq_src, rq_dest);
5000 local_irq_restore(flags);
5001}
5002
5003/* Run through task list and migrate tasks from the dead cpu. */
5004static void migrate_live_tasks(int src_cpu)
5005{
48f24c4d 5006 struct task_struct *p, *t;
1da177e4
LT
5007
5008 write_lock_irq(&tasklist_lock);
5009
48f24c4d
IM
5010 do_each_thread(t, p) {
5011 if (p == current)
1da177e4
LT
5012 continue;
5013
48f24c4d
IM
5014 if (task_cpu(p) == src_cpu)
5015 move_task_off_dead_cpu(src_cpu, p);
5016 } while_each_thread(t, p);
1da177e4
LT
5017
5018 write_unlock_irq(&tasklist_lock);
5019}
5020
dd41f596
IM
5021/*
5022 * Schedules idle task to be the next runnable task on current CPU.
1da177e4 5023 * It does so by boosting its priority to highest possible and adding it to
48f24c4d 5024 * the _front_ of the runqueue. Used by CPU offline code.
1da177e4
LT
5025 */
5026void sched_idle_next(void)
5027{
48f24c4d 5028 int this_cpu = smp_processor_id();
70b97a7f 5029 struct rq *rq = cpu_rq(this_cpu);
1da177e4
LT
5030 struct task_struct *p = rq->idle;
5031 unsigned long flags;
5032
5033 /* cpu has to be offline */
48f24c4d 5034 BUG_ON(cpu_online(this_cpu));
1da177e4 5035
48f24c4d
IM
5036 /*
5037 * Strictly not necessary since the rest of the CPUs are stopped by now
5038 * and interrupts disabled on the current cpu.
1da177e4
LT
5039 */
5040 spin_lock_irqsave(&rq->lock, flags);
5041
dd41f596 5042 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
48f24c4d
IM
5043
5044 /* Add idle task to the _front_ of its priority queue: */
dd41f596 5045 activate_idle_task(p, rq);
1da177e4
LT
5046
5047 spin_unlock_irqrestore(&rq->lock, flags);
5048}
5049
48f24c4d
IM
5050/*
5051 * Ensures that the idle task is using init_mm right before its cpu goes
1da177e4
LT
5052 * offline.
5053 */
5054void idle_task_exit(void)
5055{
5056 struct mm_struct *mm = current->active_mm;
5057
5058 BUG_ON(cpu_online(smp_processor_id()));
5059
5060 if (mm != &init_mm)
5061 switch_mm(mm, &init_mm, current);
5062 mmdrop(mm);
5063}
5064
054b9108 5065/* called under rq->lock with disabled interrupts */
36c8b586 5066static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
1da177e4 5067{
70b97a7f 5068 struct rq *rq = cpu_rq(dead_cpu);
1da177e4
LT
5069
5070 /* Must be exiting, otherwise would be on tasklist. */
48f24c4d 5071 BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
1da177e4
LT
5072
5073 /* Cannot have done final schedule yet: would have vanished. */
c394cc9f 5074 BUG_ON(p->state == TASK_DEAD);
1da177e4 5075
48f24c4d 5076 get_task_struct(p);
1da177e4
LT
5077
5078 /*
5079 * Drop lock around migration; if someone else moves it,
5080 * that's OK. No task can be added to this CPU, so iteration is
5081 * fine.
054b9108 5082 * NOTE: interrupts should be left disabled --dev@
1da177e4 5083 */
054b9108 5084 spin_unlock(&rq->lock);
48f24c4d 5085 move_task_off_dead_cpu(dead_cpu, p);
054b9108 5086 spin_lock(&rq->lock);
1da177e4 5087
48f24c4d 5088 put_task_struct(p);
1da177e4
LT
5089}
5090
5091/* release_task() removes task from tasklist, so we won't find dead tasks. */
5092static void migrate_dead_tasks(unsigned int dead_cpu)
5093{
70b97a7f 5094 struct rq *rq = cpu_rq(dead_cpu);
dd41f596 5095 struct task_struct *next;
48f24c4d 5096
dd41f596
IM
5097 for ( ; ; ) {
5098 if (!rq->nr_running)
5099 break;
5100 next = pick_next_task(rq, rq->curr, rq_clock(rq));
5101 if (!next)
5102 break;
5103 migrate_dead(dead_cpu, next);
1da177e4
LT
5104 }
5105}
5106#endif /* CONFIG_HOTPLUG_CPU */
5107
5108/*
5109 * migration_call - callback that gets triggered when a CPU is added.
5110 * Here we can start up the necessary migration thread for the new CPU.
5111 */
48f24c4d
IM
5112static int __cpuinit
5113migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
1da177e4 5114{
1da177e4 5115 struct task_struct *p;
48f24c4d 5116 int cpu = (long)hcpu;
1da177e4 5117 unsigned long flags;
70b97a7f 5118 struct rq *rq;
1da177e4
LT
5119
5120 switch (action) {
5be9361c
GS
5121 case CPU_LOCK_ACQUIRE:
5122 mutex_lock(&sched_hotcpu_mutex);
5123 break;
5124
1da177e4 5125 case CPU_UP_PREPARE:
8bb78442 5126 case CPU_UP_PREPARE_FROZEN:
dd41f596 5127 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
1da177e4
LT
5128 if (IS_ERR(p))
5129 return NOTIFY_BAD;
5130 p->flags |= PF_NOFREEZE;
5131 kthread_bind(p, cpu);
5132 /* Must be high prio: stop_machine expects to yield to it. */
5133 rq = task_rq_lock(p, &flags);
dd41f596 5134 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
1da177e4
LT
5135 task_rq_unlock(rq, &flags);
5136 cpu_rq(cpu)->migration_thread = p;
5137 break;
48f24c4d 5138
1da177e4 5139 case CPU_ONLINE:
8bb78442 5140 case CPU_ONLINE_FROZEN:
1da177e4
LT
5141 /* Strictly unnecessary, as first user will wake it. */
5142 wake_up_process(cpu_rq(cpu)->migration_thread);
5143 break;
48f24c4d 5144
1da177e4
LT
5145#ifdef CONFIG_HOTPLUG_CPU
5146 case CPU_UP_CANCELED:
8bb78442 5147 case CPU_UP_CANCELED_FROZEN:
fc75cdfa
HC
5148 if (!cpu_rq(cpu)->migration_thread)
5149 break;
1da177e4 5150 /* Unbind it from offline cpu so it can run. Fall thru. */
a4c4af7c
HC
5151 kthread_bind(cpu_rq(cpu)->migration_thread,
5152 any_online_cpu(cpu_online_map));
1da177e4
LT
5153 kthread_stop(cpu_rq(cpu)->migration_thread);
5154 cpu_rq(cpu)->migration_thread = NULL;
5155 break;
48f24c4d 5156
1da177e4 5157 case CPU_DEAD:
8bb78442 5158 case CPU_DEAD_FROZEN:
1da177e4
LT
5159 migrate_live_tasks(cpu);
5160 rq = cpu_rq(cpu);
5161 kthread_stop(rq->migration_thread);
5162 rq->migration_thread = NULL;
5163 /* Idle task back to normal (off runqueue, low prio) */
5164 rq = task_rq_lock(rq->idle, &flags);
dd41f596 5165 deactivate_task(rq, rq->idle, 0);
1da177e4 5166 rq->idle->static_prio = MAX_PRIO;
dd41f596
IM
5167 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5168 rq->idle->sched_class = &idle_sched_class;
1da177e4
LT
5169 migrate_dead_tasks(cpu);
5170 task_rq_unlock(rq, &flags);
5171 migrate_nr_uninterruptible(rq);
5172 BUG_ON(rq->nr_running != 0);
5173
5174 /* No need to migrate the tasks: it was best-effort if
5be9361c 5175 * they didn't take sched_hotcpu_mutex. Just wake up
1da177e4
LT
5176 * the requestors. */
5177 spin_lock_irq(&rq->lock);
5178 while (!list_empty(&rq->migration_queue)) {
70b97a7f
IM
5179 struct migration_req *req;
5180
1da177e4 5181 req = list_entry(rq->migration_queue.next,
70b97a7f 5182 struct migration_req, list);
1da177e4
LT
5183 list_del_init(&req->list);
5184 complete(&req->done);
5185 }
5186 spin_unlock_irq(&rq->lock);
5187 break;
5188#endif
5be9361c
GS
5189 case CPU_LOCK_RELEASE:
5190 mutex_unlock(&sched_hotcpu_mutex);
5191 break;
1da177e4
LT
5192 }
5193 return NOTIFY_OK;
5194}
5195
5196/* Register at highest priority so that task migration (migrate_all_tasks)
5197 * happens before everything else.
5198 */
26c2143b 5199static struct notifier_block __cpuinitdata migration_notifier = {
1da177e4
LT
5200 .notifier_call = migration_call,
5201 .priority = 10
5202};
5203
5204int __init migration_init(void)
5205{
5206 void *cpu = (void *)(long)smp_processor_id();
07dccf33 5207 int err;
48f24c4d
IM
5208
5209 /* Start one for the boot CPU: */
07dccf33
AM
5210 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5211 BUG_ON(err == NOTIFY_BAD);
1da177e4
LT
5212 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5213 register_cpu_notifier(&migration_notifier);
48f24c4d 5214
1da177e4
LT
5215 return 0;
5216}
5217#endif
5218
5219#ifdef CONFIG_SMP
476f3534
CL
5220
5221/* Number of possible processor ids */
5222int nr_cpu_ids __read_mostly = NR_CPUS;
5223EXPORT_SYMBOL(nr_cpu_ids);
5224
1a20ff27 5225#undef SCHED_DOMAIN_DEBUG
1da177e4
LT
5226#ifdef SCHED_DOMAIN_DEBUG
5227static void sched_domain_debug(struct sched_domain *sd, int cpu)
5228{
5229 int level = 0;
5230
41c7ce9a
NP
5231 if (!sd) {
5232 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5233 return;
5234 }
5235
1da177e4
LT
5236 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5237
5238 do {
5239 int i;
5240 char str[NR_CPUS];
5241 struct sched_group *group = sd->groups;
5242 cpumask_t groupmask;
5243
5244 cpumask_scnprintf(str, NR_CPUS, sd->span);
5245 cpus_clear(groupmask);
5246
5247 printk(KERN_DEBUG);
5248 for (i = 0; i < level + 1; i++)
5249 printk(" ");
5250 printk("domain %d: ", level);
5251
5252 if (!(sd->flags & SD_LOAD_BALANCE)) {
5253 printk("does not load-balance\n");
5254 if (sd->parent)
33859f7f
MOS
5255 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5256 " has parent");
1da177e4
LT
5257 break;
5258 }
5259
5260 printk("span %s\n", str);
5261
5262 if (!cpu_isset(cpu, sd->span))
33859f7f
MOS
5263 printk(KERN_ERR "ERROR: domain->span does not contain "
5264 "CPU%d\n", cpu);
1da177e4 5265 if (!cpu_isset(cpu, group->cpumask))
33859f7f
MOS
5266 printk(KERN_ERR "ERROR: domain->groups does not contain"
5267 " CPU%d\n", cpu);
1da177e4
LT
5268
5269 printk(KERN_DEBUG);
5270 for (i = 0; i < level + 2; i++)
5271 printk(" ");
5272 printk("groups:");
5273 do {
5274 if (!group) {
5275 printk("\n");
5276 printk(KERN_ERR "ERROR: group is NULL\n");
5277 break;
5278 }
5279
5517d86b 5280 if (!group->__cpu_power) {
1da177e4 5281 printk("\n");
33859f7f
MOS
5282 printk(KERN_ERR "ERROR: domain->cpu_power not "
5283 "set\n");
1da177e4
LT
5284 }
5285
5286 if (!cpus_weight(group->cpumask)) {
5287 printk("\n");
5288 printk(KERN_ERR "ERROR: empty group\n");
5289 }
5290
5291 if (cpus_intersects(groupmask, group->cpumask)) {
5292 printk("\n");
5293 printk(KERN_ERR "ERROR: repeated CPUs\n");
5294 }
5295
5296 cpus_or(groupmask, groupmask, group->cpumask);
5297
5298 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
5299 printk(" %s", str);
5300
5301 group = group->next;
5302 } while (group != sd->groups);
5303 printk("\n");
5304
5305 if (!cpus_equal(sd->span, groupmask))
33859f7f
MOS
5306 printk(KERN_ERR "ERROR: groups don't span "
5307 "domain->span\n");
1da177e4
LT
5308
5309 level++;
5310 sd = sd->parent;
33859f7f
MOS
5311 if (!sd)
5312 continue;
1da177e4 5313
33859f7f
MOS
5314 if (!cpus_subset(groupmask, sd->span))
5315 printk(KERN_ERR "ERROR: parent span is not a superset "
5316 "of domain->span\n");
1da177e4
LT
5317
5318 } while (sd);
5319}
5320#else
48f24c4d 5321# define sched_domain_debug(sd, cpu) do { } while (0)
1da177e4
LT
5322#endif
5323
1a20ff27 5324static int sd_degenerate(struct sched_domain *sd)
245af2c7
SS
5325{
5326 if (cpus_weight(sd->span) == 1)
5327 return 1;
5328
5329 /* Following flags need at least 2 groups */
5330 if (sd->flags & (SD_LOAD_BALANCE |
5331 SD_BALANCE_NEWIDLE |
5332 SD_BALANCE_FORK |
89c4710e
SS
5333 SD_BALANCE_EXEC |
5334 SD_SHARE_CPUPOWER |
5335 SD_SHARE_PKG_RESOURCES)) {
245af2c7
SS
5336 if (sd->groups != sd->groups->next)
5337 return 0;
5338 }
5339
5340 /* Following flags don't use groups */
5341 if (sd->flags & (SD_WAKE_IDLE |
5342 SD_WAKE_AFFINE |
5343 SD_WAKE_BALANCE))
5344 return 0;
5345
5346 return 1;
5347}
5348
48f24c4d
IM
5349static int
5350sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
245af2c7
SS
5351{
5352 unsigned long cflags = sd->flags, pflags = parent->flags;
5353
5354 if (sd_degenerate(parent))
5355 return 1;
5356
5357 if (!cpus_equal(sd->span, parent->span))
5358 return 0;
5359
5360 /* Does parent contain flags not in child? */
5361 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
5362 if (cflags & SD_WAKE_AFFINE)
5363 pflags &= ~SD_WAKE_BALANCE;
5364 /* Flags needing groups don't count if only 1 group in parent */
5365 if (parent->groups == parent->groups->next) {
5366 pflags &= ~(SD_LOAD_BALANCE |
5367 SD_BALANCE_NEWIDLE |
5368 SD_BALANCE_FORK |
89c4710e
SS
5369 SD_BALANCE_EXEC |
5370 SD_SHARE_CPUPOWER |
5371 SD_SHARE_PKG_RESOURCES);
245af2c7
SS
5372 }
5373 if (~cflags & pflags)
5374 return 0;
5375
5376 return 1;
5377}
5378
1da177e4
LT
5379/*
5380 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5381 * hold the hotplug lock.
5382 */
9c1cfda2 5383static void cpu_attach_domain(struct sched_domain *sd, int cpu)
1da177e4 5384{
70b97a7f 5385 struct rq *rq = cpu_rq(cpu);
245af2c7
SS
5386 struct sched_domain *tmp;
5387
5388 /* Remove the sched domains which do not contribute to scheduling. */
5389 for (tmp = sd; tmp; tmp = tmp->parent) {
5390 struct sched_domain *parent = tmp->parent;
5391 if (!parent)
5392 break;
1a848870 5393 if (sd_parent_degenerate(tmp, parent)) {
245af2c7 5394 tmp->parent = parent->parent;
1a848870
SS
5395 if (parent->parent)
5396 parent->parent->child = tmp;
5397 }
245af2c7
SS
5398 }
5399
1a848870 5400 if (sd && sd_degenerate(sd)) {
245af2c7 5401 sd = sd->parent;
1a848870
SS
5402 if (sd)
5403 sd->child = NULL;
5404 }
1da177e4
LT
5405
5406 sched_domain_debug(sd, cpu);
5407
674311d5 5408 rcu_assign_pointer(rq->sd, sd);
1da177e4
LT
5409}
5410
5411/* cpus with isolated domains */
67af63a6 5412static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
1da177e4
LT
5413
5414/* Setup the mask of cpus configured for isolated domains */
5415static int __init isolated_cpu_setup(char *str)
5416{
5417 int ints[NR_CPUS], i;
5418
5419 str = get_options(str, ARRAY_SIZE(ints), ints);
5420 cpus_clear(cpu_isolated_map);
5421 for (i = 1; i <= ints[0]; i++)
5422 if (ints[i] < NR_CPUS)
5423 cpu_set(ints[i], cpu_isolated_map);
5424 return 1;
5425}
5426
5427__setup ("isolcpus=", isolated_cpu_setup);
5428
5429/*
6711cab4
SS
5430 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
5431 * to a function which identifies what group (along with its sched group) a CPU
5432 * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS
5433 * (due to the fact that we keep track of groups covered with a cpumask_t).
1da177e4
LT
5434 *
5435 * init_sched_build_groups will build a circular linked list of the groups
5436 * covered by the given span, and will set each group's ->cpumask correctly,
5437 * and ->cpu_power to 0.
5438 */
a616058b 5439static void
6711cab4
SS
5440init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5441 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
5442 struct sched_group **sg))
1da177e4
LT
5443{
5444 struct sched_group *first = NULL, *last = NULL;
5445 cpumask_t covered = CPU_MASK_NONE;
5446 int i;
5447
5448 for_each_cpu_mask(i, span) {
6711cab4
SS
5449 struct sched_group *sg;
5450 int group = group_fn(i, cpu_map, &sg);
1da177e4
LT
5451 int j;
5452
5453 if (cpu_isset(i, covered))
5454 continue;
5455
5456 sg->cpumask = CPU_MASK_NONE;
5517d86b 5457 sg->__cpu_power = 0;
1da177e4
LT
5458
5459 for_each_cpu_mask(j, span) {
6711cab4 5460 if (group_fn(j, cpu_map, NULL) != group)
1da177e4
LT
5461 continue;
5462
5463 cpu_set(j, covered);
5464 cpu_set(j, sg->cpumask);
5465 }
5466 if (!first)
5467 first = sg;
5468 if (last)
5469 last->next = sg;
5470 last = sg;
5471 }
5472 last->next = first;
5473}
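The groups built here form the circular list that later code walks with a do/while; a minimal sketch of that traversal, where first_group stands in for the domain's ->groups pointer after setup:

	struct sched_group *sg = first_group;

	do {
		/* each sg->cpumask is a disjoint subset of the span passed in */
		sg = sg->next;
	} while (sg != first_group);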
5474
9c1cfda2 5475#define SD_NODES_PER_DOMAIN 16
1da177e4 5476
9c1cfda2 5477#ifdef CONFIG_NUMA
198e2f18 5478
9c1cfda2
JH
5479/**
5480 * find_next_best_node - find the next node to include in a sched_domain
5481 * @node: node whose sched_domain we're building
5482 * @used_nodes: nodes already in the sched_domain
5483 *
5484 * Find the next node to include in a given scheduling domain. Simply
5485 * finds the closest node not already in the @used_nodes map.
5486 *
5487 * Should use nodemask_t.
5488 */
5489static int find_next_best_node(int node, unsigned long *used_nodes)
5490{
5491 int i, n, val, min_val, best_node = 0;
5492
5493 min_val = INT_MAX;
5494
5495 for (i = 0; i < MAX_NUMNODES; i++) {
5496 /* Start at @node */
5497 n = (node + i) % MAX_NUMNODES;
5498
5499 if (!nr_cpus_node(n))
5500 continue;
5501
5502 /* Skip already used nodes */
5503 if (test_bit(n, used_nodes))
5504 continue;
5505
5506 /* Simple min distance search */
5507 val = node_distance(node, n);
5508
5509 if (val < min_val) {
5510 min_val = val;
5511 best_node = n;
5512 }
5513 }
5514
5515 set_bit(best_node, used_nodes);
5516 return best_node;
5517}
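For instance, on a hypothetical four-node machine where node_distance(0, n) is 20, 10 and 30 for nodes 1, 2 and 3, successive calls for node 0 return node 2, then node 1, then node 3, setting each node's bit in @used_nodes as it is picked.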
5518
5519/**
5520 * sched_domain_node_span - get a cpumask for a node's sched_domain
5521 * @node: node whose cpumask we're constructing
5522 * @size: number of nodes to include in this span
5523 *
5524 * Given a node, construct a good cpumask for its sched_domain to span. It
5525 * should be one that prevents unnecessary balancing, but also spreads tasks
5526 * out optimally.
5527 */
5528static cpumask_t sched_domain_node_span(int node)
5529{
9c1cfda2 5530 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
48f24c4d
IM
5531 cpumask_t span, nodemask;
5532 int i;
9c1cfda2
JH
5533
5534 cpus_clear(span);
5535 bitmap_zero(used_nodes, MAX_NUMNODES);
5536
5537 nodemask = node_to_cpumask(node);
5538 cpus_or(span, span, nodemask);
5539 set_bit(node, used_nodes);
5540
5541 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5542 int next_node = find_next_best_node(node, used_nodes);
48f24c4d 5543
9c1cfda2
JH
5544 nodemask = node_to_cpumask(next_node);
5545 cpus_or(span, span, nodemask);
5546 }
5547
5548 return span;
5549}
5550#endif
5551
5c45bf27 5552int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
48f24c4d 5553
9c1cfda2 5554/*
48f24c4d 5555 * SMT sched-domains:
9c1cfda2 5556 */
1da177e4
LT
5557#ifdef CONFIG_SCHED_SMT
5558static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
6711cab4 5559static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
48f24c4d 5560
6711cab4
SS
5561static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
5562 struct sched_group **sg)
1da177e4 5563{
6711cab4
SS
5564 if (sg)
5565 *sg = &per_cpu(sched_group_cpus, cpu);
1da177e4
LT
5566 return cpu;
5567}
5568#endif
5569
48f24c4d
IM
5570/*
5571 * multi-core sched-domains:
5572 */
1e9f28fa
SS
5573#ifdef CONFIG_SCHED_MC
5574static DEFINE_PER_CPU(struct sched_domain, core_domains);
6711cab4 5575static DEFINE_PER_CPU(struct sched_group, sched_group_core);
1e9f28fa
SS
5576#endif
5577
5578#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6711cab4
SS
5579static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5580 struct sched_group **sg)
1e9f28fa 5581{
6711cab4 5582 int group;
a616058b
SS
5583 cpumask_t mask = cpu_sibling_map[cpu];
5584 cpus_and(mask, mask, *cpu_map);
6711cab4
SS
5585 group = first_cpu(mask);
5586 if (sg)
5587 *sg = &per_cpu(sched_group_core, group);
5588 return group;
1e9f28fa
SS
5589}
5590#elif defined(CONFIG_SCHED_MC)
6711cab4
SS
5591static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5592 struct sched_group **sg)
1e9f28fa 5593{
6711cab4
SS
5594 if (sg)
5595 *sg = &per_cpu(sched_group_core, cpu);
1e9f28fa
SS
5596 return cpu;
5597}
5598#endif
5599
1da177e4 5600static DEFINE_PER_CPU(struct sched_domain, phys_domains);
6711cab4 5601static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
48f24c4d 5602
6711cab4
SS
5603static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
5604 struct sched_group **sg)
1da177e4 5605{
6711cab4 5606 int group;
48f24c4d 5607#ifdef CONFIG_SCHED_MC
1e9f28fa 5608 cpumask_t mask = cpu_coregroup_map(cpu);
a616058b 5609 cpus_and(mask, mask, *cpu_map);
6711cab4 5610 group = first_cpu(mask);
1e9f28fa 5611#elif defined(CONFIG_SCHED_SMT)
a616058b
SS
5612 cpumask_t mask = cpu_sibling_map[cpu];
5613 cpus_and(mask, mask, *cpu_map);
6711cab4 5614 group = first_cpu(mask);
1da177e4 5615#else
6711cab4 5616 group = cpu;
1da177e4 5617#endif
6711cab4
SS
5618 if (sg)
5619 *sg = &per_cpu(sched_group_phys, group);
5620 return group;
1da177e4
LT
5621}
5622
5623#ifdef CONFIG_NUMA
1da177e4 5624/*
9c1cfda2
JH
5625 * init_sched_build_groups() can't handle what we want to do with node
5626 * groups, so we roll our own. Each node gets its own list of groups, which
5627 * is allocated dynamically.
1da177e4 5628 */
9c1cfda2 5629static DEFINE_PER_CPU(struct sched_domain, node_domains);
d1b55138 5630static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
1da177e4 5631
9c1cfda2 5632static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
6711cab4 5633static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
9c1cfda2 5634
6711cab4
SS
5635static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
5636 struct sched_group **sg)
9c1cfda2 5637{
6711cab4
SS
5638 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
5639 int group;
5640
5641 cpus_and(nodemask, nodemask, *cpu_map);
5642 group = first_cpu(nodemask);
5643
5644 if (sg)
5645 *sg = &per_cpu(sched_group_allnodes, group);
5646 return group;
1da177e4 5647}
6711cab4 5648
08069033
SS
5649static void init_numa_sched_groups_power(struct sched_group *group_head)
5650{
5651 struct sched_group *sg = group_head;
5652 int j;
5653
5654 if (!sg)
5655 return;
5656next_sg:
5657 for_each_cpu_mask(j, sg->cpumask) {
5658 struct sched_domain *sd;
5659
5660 sd = &per_cpu(phys_domains, j);
5661 if (j != first_cpu(sd->groups->cpumask)) {
5662 /*
5663 * Only add "power" once for each
5664 * physical package.
5665 */
5666 continue;
5667 }
5668
5517d86b 5669 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
08069033
SS
5670 }
5671 sg = sg->next;
5672 if (sg != group_head)
5673 goto next_sg;
5674}
1da177e4
LT
5675#endif
5676
a616058b 5677#ifdef CONFIG_NUMA
51888ca2
SV
5678/* Free memory allocated for various sched_group structures */
5679static void free_sched_groups(const cpumask_t *cpu_map)
5680{
a616058b 5681 int cpu, i;
51888ca2
SV
5682
5683 for_each_cpu_mask(cpu, *cpu_map) {
51888ca2
SV
5684 struct sched_group **sched_group_nodes
5685 = sched_group_nodes_bycpu[cpu];
5686
51888ca2
SV
5687 if (!sched_group_nodes)
5688 continue;
5689
5690 for (i = 0; i < MAX_NUMNODES; i++) {
5691 cpumask_t nodemask = node_to_cpumask(i);
5692 struct sched_group *oldsg, *sg = sched_group_nodes[i];
5693
5694 cpus_and(nodemask, nodemask, *cpu_map);
5695 if (cpus_empty(nodemask))
5696 continue;
5697
5698 if (sg == NULL)
5699 continue;
5700 sg = sg->next;
5701next_sg:
5702 oldsg = sg;
5703 sg = sg->next;
5704 kfree(oldsg);
5705 if (oldsg != sched_group_nodes[i])
5706 goto next_sg;
5707 }
5708 kfree(sched_group_nodes);
5709 sched_group_nodes_bycpu[cpu] = NULL;
5710 }
51888ca2 5711}
a616058b
SS
5712#else
5713static void free_sched_groups(const cpumask_t *cpu_map)
5714{
5715}
5716#endif
51888ca2 5717
89c4710e
SS
5718/*
5719 * Initialize sched groups cpu_power.
5720 *
5721 * cpu_power indicates the capacity of a sched group, which is used while
5722 * distributing the load between different sched groups in a sched domain.
5723 * Typically cpu_power is the same for all groups in a sched domain unless
5724 * there are asymmetries in the topology. If there are asymmetries, a group
5725 * with more cpu_power will pick up more load than a group with less
5726 * cpu_power.
5727 *
5728 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
5729 * the maximum number of tasks a group can handle in the presence of other idle
5730 * or lightly loaded groups in the same sched domain.
5731 */
5732static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5733{
5734 struct sched_domain *child;
5735 struct sched_group *group;
5736
5737 WARN_ON(!sd || !sd->groups);
5738
5739 if (cpu != first_cpu(sd->groups->cpumask))
5740 return;
5741
5742 child = sd->child;
5743
5517d86b
ED
5744 sd->groups->__cpu_power = 0;
5745
89c4710e
SS
5746 /*
5747 * For the perf policy, if the groups in the child domain share resources
5748 * (for example cores sharing some portions of the cache hierarchy,
5749 * or SMT siblings), then set this domain's group cpu_power such that each
5750 * group can handle only one task when there are other idle groups in the
5751 * same sched domain.
5752 */
5753 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
5754 (child->flags &
5755 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
5517d86b 5756 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
89c4710e
SS
5757 return;
5758 }
5759
89c4710e
SS
5760 /*
5761 * Add the cpu_power of each child group to this group's cpu_power.
5762 */
5763 group = child->groups;
5764 do {
5517d86b 5765 sg_inc_cpu_power(sd->groups, group->__cpu_power);
89c4710e
SS
5766 group = group->next;
5767 } while (group != child->groups);
5768}
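Concretely, under the perf policy (no SD_POWERSAVINGS_BALANCE): a core-level domain whose child is an SMT domain sees SD_SHARE_CPUPOWER in the child and pins its group to a single SCHED_LOAD_SCALE, i.e. one task per core while idle groups exist elsewhere. With the power-savings flag set, the same group would instead sum its sibling groups (assuming two-way SMT, 2*SCHED_LOAD_SCALE), advertising room for two tasks per core.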
5769
1da177e4 5770/*
1a20ff27
DG
5771 * Build sched domains for a given set of cpus and attach the sched domains
5772 * to the individual cpus
1da177e4 5773 */
51888ca2 5774static int build_sched_domains(const cpumask_t *cpu_map)
1da177e4
LT
5775{
5776 int i;
d1b55138
JH
5777#ifdef CONFIG_NUMA
5778 struct sched_group **sched_group_nodes = NULL;
6711cab4 5779 int sd_allnodes = 0;
d1b55138
JH
5780
5781 /*
5782 * Allocate the per-node list of sched groups
5783 */
dd41f596 5784 sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES,
d3a5aa98 5785 GFP_KERNEL);
d1b55138
JH
5786 if (!sched_group_nodes) {
5787 printk(KERN_WARNING "Can not alloc sched group node list\n");
51888ca2 5788 return -ENOMEM;
d1b55138
JH
5789 }
5790 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
5791#endif
1da177e4
LT
5792
5793 /*
1a20ff27 5794 * Set up domains for cpus specified by the cpu_map.
1da177e4 5795 */
1a20ff27 5796 for_each_cpu_mask(i, *cpu_map) {
1da177e4
LT
5797 struct sched_domain *sd = NULL, *p;
5798 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
5799
1a20ff27 5800 cpus_and(nodemask, nodemask, *cpu_map);
1da177e4
LT
5801
5802#ifdef CONFIG_NUMA
dd41f596
IM
5803 if (cpus_weight(*cpu_map) >
5804 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
9c1cfda2
JH
5805 sd = &per_cpu(allnodes_domains, i);
5806 *sd = SD_ALLNODES_INIT;
5807 sd->span = *cpu_map;
6711cab4 5808 cpu_to_allnodes_group(i, cpu_map, &sd->groups);
9c1cfda2 5809 p = sd;
6711cab4 5810 sd_allnodes = 1;
9c1cfda2
JH
5811 } else
5812 p = NULL;
5813
1da177e4 5814 sd = &per_cpu(node_domains, i);
1da177e4 5815 *sd = SD_NODE_INIT;
9c1cfda2
JH
5816 sd->span = sched_domain_node_span(cpu_to_node(i));
5817 sd->parent = p;
1a848870
SS
5818 if (p)
5819 p->child = sd;
9c1cfda2 5820 cpus_and(sd->span, sd->span, *cpu_map);
1da177e4
LT
5821#endif
5822
5823 p = sd;
5824 sd = &per_cpu(phys_domains, i);
1da177e4
LT
5825 *sd = SD_CPU_INIT;
5826 sd->span = nodemask;
5827 sd->parent = p;
1a848870
SS
5828 if (p)
5829 p->child = sd;
6711cab4 5830 cpu_to_phys_group(i, cpu_map, &sd->groups);
1da177e4 5831
1e9f28fa
SS
5832#ifdef CONFIG_SCHED_MC
5833 p = sd;
5834 sd = &per_cpu(core_domains, i);
1e9f28fa
SS
5835 *sd = SD_MC_INIT;
5836 sd->span = cpu_coregroup_map(i);
5837 cpus_and(sd->span, sd->span, *cpu_map);
5838 sd->parent = p;
1a848870 5839 p->child = sd;
6711cab4 5840 cpu_to_core_group(i, cpu_map, &sd->groups);
1e9f28fa
SS
5841#endif
5842
1da177e4
LT
5843#ifdef CONFIG_SCHED_SMT
5844 p = sd;
5845 sd = &per_cpu(cpu_domains, i);
1da177e4
LT
5846 *sd = SD_SIBLING_INIT;
5847 sd->span = cpu_sibling_map[i];
1a20ff27 5848 cpus_and(sd->span, sd->span, *cpu_map);
1da177e4 5849 sd->parent = p;
1a848870 5850 p->child = sd;
6711cab4 5851 cpu_to_cpu_group(i, cpu_map, &sd->groups);
1da177e4
LT
5852#endif
5853 }
5854
5855#ifdef CONFIG_SCHED_SMT
5856 /* Set up CPU (sibling) groups */
9c1cfda2 5857 for_each_cpu_mask(i, *cpu_map) {
1da177e4 5858 cpumask_t this_sibling_map = cpu_sibling_map[i];
1a20ff27 5859 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
1da177e4
LT
5860 if (i != first_cpu(this_sibling_map))
5861 continue;
5862
dd41f596
IM
5863 init_sched_build_groups(this_sibling_map, cpu_map,
5864 &cpu_to_cpu_group);
1da177e4
LT
5865 }
5866#endif
5867
1e9f28fa
SS
5868#ifdef CONFIG_SCHED_MC
5869 /* Set up multi-core groups */
5870 for_each_cpu_mask(i, *cpu_map) {
5871 cpumask_t this_core_map = cpu_coregroup_map(i);
5872 cpus_and(this_core_map, this_core_map, *cpu_map);
5873 if (i != first_cpu(this_core_map))
5874 continue;
dd41f596
IM
5875 init_sched_build_groups(this_core_map, cpu_map,
5876 &cpu_to_core_group);
1e9f28fa
SS
5877 }
5878#endif
5879
1da177e4
LT
5880 /* Set up physical groups */
5881 for (i = 0; i < MAX_NUMNODES; i++) {
5882 cpumask_t nodemask = node_to_cpumask(i);
5883
1a20ff27 5884 cpus_and(nodemask, nodemask, *cpu_map);
1da177e4
LT
5885 if (cpus_empty(nodemask))
5886 continue;
5887
6711cab4 5888 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
1da177e4
LT
5889 }
5890
5891#ifdef CONFIG_NUMA
5892 /* Set up node groups */
6711cab4 5893 if (sd_allnodes)
dd41f596
IM
5894 init_sched_build_groups(*cpu_map, cpu_map,
5895 &cpu_to_allnodes_group);
9c1cfda2
JH
5896
5897 for (i = 0; i < MAX_NUMNODES; i++) {
5898 /* Set up node groups */
5899 struct sched_group *sg, *prev;
5900 cpumask_t nodemask = node_to_cpumask(i);
5901 cpumask_t domainspan;
5902 cpumask_t covered = CPU_MASK_NONE;
5903 int j;
5904
5905 cpus_and(nodemask, nodemask, *cpu_map);
d1b55138
JH
5906 if (cpus_empty(nodemask)) {
5907 sched_group_nodes[i] = NULL;
9c1cfda2 5908 continue;
d1b55138 5909 }
9c1cfda2
JH
5910
5911 domainspan = sched_domain_node_span(i);
5912 cpus_and(domainspan, domainspan, *cpu_map);
5913
15f0b676 5914 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
51888ca2
SV
5915 if (!sg) {
5916 printk(KERN_WARNING "Can not alloc domain group for "
5917 "node %d\n", i);
5918 goto error;
5919 }
9c1cfda2
JH
5920 sched_group_nodes[i] = sg;
5921 for_each_cpu_mask(j, nodemask) {
5922 struct sched_domain *sd;
5923 sd = &per_cpu(node_domains, j);
5924 sd->groups = sg;
9c1cfda2 5925 }
5517d86b 5926 sg->__cpu_power = 0;
9c1cfda2 5927 sg->cpumask = nodemask;
51888ca2 5928 sg->next = sg;
9c1cfda2
JH
5929 cpus_or(covered, covered, nodemask);
5930 prev = sg;
5931
5932 for (j = 0; j < MAX_NUMNODES; j++) {
5933 cpumask_t tmp, notcovered;
5934 int n = (i + j) % MAX_NUMNODES;
5935
5936 cpus_complement(notcovered, covered);
5937 cpus_and(tmp, notcovered, *cpu_map);
5938 cpus_and(tmp, tmp, domainspan);
5939 if (cpus_empty(tmp))
5940 break;
5941
5942 nodemask = node_to_cpumask(n);
5943 cpus_and(tmp, tmp, nodemask);
5944 if (cpus_empty(tmp))
5945 continue;
5946
15f0b676
SV
5947 sg = kmalloc_node(sizeof(struct sched_group),
5948 GFP_KERNEL, i);
9c1cfda2
JH
5949 if (!sg) {
5950 printk(KERN_WARNING
5951 "Can not alloc domain group for node %d\n", j);
51888ca2 5952 goto error;
9c1cfda2 5953 }
5517d86b 5954 sg->__cpu_power = 0;
9c1cfda2 5955 sg->cpumask = tmp;
51888ca2 5956 sg->next = prev->next;
9c1cfda2
JH
5957 cpus_or(covered, covered, tmp);
5958 prev->next = sg;
5959 prev = sg;
5960 }
9c1cfda2 5961 }
1da177e4
LT
5962#endif
5963
5964 /* Calculate CPU power for physical packages and nodes */
5c45bf27 5965#ifdef CONFIG_SCHED_SMT
1a20ff27 5966 for_each_cpu_mask(i, *cpu_map) {
dd41f596
IM
5967 struct sched_domain *sd = &per_cpu(cpu_domains, i);
5968
89c4710e 5969 init_sched_groups_power(i, sd);
5c45bf27 5970 }
1da177e4 5971#endif
1e9f28fa 5972#ifdef CONFIG_SCHED_MC
5c45bf27 5973 for_each_cpu_mask(i, *cpu_map) {
dd41f596
IM
5974 struct sched_domain *sd = &per_cpu(core_domains, i);
5975
89c4710e 5976 init_sched_groups_power(i, sd);
5c45bf27
SS
5977 }
5978#endif
1e9f28fa 5979
5c45bf27 5980 for_each_cpu_mask(i, *cpu_map) {
dd41f596
IM
5981 struct sched_domain *sd = &per_cpu(phys_domains, i);
5982
89c4710e 5983 init_sched_groups_power(i, sd);
1da177e4
LT
5984 }
5985
9c1cfda2 5986#ifdef CONFIG_NUMA
08069033
SS
5987 for (i = 0; i < MAX_NUMNODES; i++)
5988 init_numa_sched_groups_power(sched_group_nodes[i]);
9c1cfda2 5989
6711cab4
SS
5990 if (sd_allnodes) {
5991 struct sched_group *sg;
f712c0c7 5992
6711cab4 5993 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
f712c0c7
SS
5994 init_numa_sched_groups_power(sg);
5995 }
9c1cfda2
JH
5996#endif
5997
1da177e4 5998 /* Attach the domains */
1a20ff27 5999 for_each_cpu_mask(i, *cpu_map) {
1da177e4
LT
6000 struct sched_domain *sd;
6001#ifdef CONFIG_SCHED_SMT
6002 sd = &per_cpu(cpu_domains, i);
1e9f28fa
SS
6003#elif defined(CONFIG_SCHED_MC)
6004 sd = &per_cpu(core_domains, i);
1da177e4
LT
6005#else
6006 sd = &per_cpu(phys_domains, i);
6007#endif
6008 cpu_attach_domain(sd, i);
6009 }
51888ca2
SV
6010
6011 return 0;
6012
a616058b 6013#ifdef CONFIG_NUMA
51888ca2
SV
6014error:
6015 free_sched_groups(cpu_map);
6016 return -ENOMEM;
a616058b 6017#endif
1da177e4 6018}
1a20ff27
DG
6019/*
6020 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6021 */
51888ca2 6022static int arch_init_sched_domains(const cpumask_t *cpu_map)
1a20ff27
DG
6023{
6024 cpumask_t cpu_default_map;
51888ca2 6025 int err;
1da177e4 6026
1a20ff27
DG
6027 /*
6028 * Set up the mask of cpus without special-case scheduling requirements.
6029 * For now this just excludes isolated cpus, but could be used to
6030 * exclude other special cases in the future.
6031 */
6032 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
6033
51888ca2
SV
6034 err = build_sched_domains(&cpu_default_map);
6035
6036 return err;
1a20ff27
DG
6037}
6038
6039static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
1da177e4 6040{
51888ca2 6041 free_sched_groups(cpu_map);
9c1cfda2 6042}
1da177e4 6043
1a20ff27
DG
6044/*
6045 * Detach sched domains from a group of cpus specified in cpu_map
6046 * These cpus will now be attached to the NULL domain
6047 */
858119e1 6048static void detach_destroy_domains(const cpumask_t *cpu_map)
1a20ff27
DG
6049{
6050 int i;
6051
6052 for_each_cpu_mask(i, *cpu_map)
6053 cpu_attach_domain(NULL, i);
6054 synchronize_sched();
6055 arch_destroy_sched_domains(cpu_map);
6056}
6057
6058/*
6059 * Partition sched domains as specified by the cpumasks below.
6060 * This attaches all cpus from the cpumasks to the NULL domain,
6061 * waits for an RCU quiescent period, recalculates the sched
6062 * domain information and then attaches the cpus back to the
6063 * correct sched domains.
6064 * Call with the hotplug lock held.
6065 */
51888ca2 6066int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
1a20ff27
DG
6067{
6068 cpumask_t change_map;
51888ca2 6069 int err = 0;
1a20ff27
DG
6070
6071 cpus_and(*partition1, *partition1, cpu_online_map);
6072 cpus_and(*partition2, *partition2, cpu_online_map);
6073 cpus_or(change_map, *partition1, *partition2);
6074
6075 /* Detach sched domains from all of the affected cpus */
6076 detach_destroy_domains(&change_map);
6077 if (!cpus_empty(*partition1))
51888ca2
SV
6078 err = build_sched_domains(partition1);
6079 if (!err && !cpus_empty(*partition2))
6080 err = build_sched_domains(partition2);
6081
6082 return err;
1a20ff27
DG
6083}
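A sketch of a caller (the cpuset code is the in-tree user); the CPU numbers are purely illustrative and the hotplug lock is assumed to be held, as required above:

	cpumask_t part1 = CPU_MASK_NONE, part2 = CPU_MASK_NONE;

	cpu_set(0, part1);
	cpu_set(1, part1);
	cpu_set(2, part2);
	cpu_set(3, part2);

	if (partition_sched_domains(&part1, &part2))
		printk(KERN_WARNING "sched: domain rebuild failed\n");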
6084
5c45bf27
SS
6085#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6086int arch_reinit_sched_domains(void)
6087{
6088 int err;
6089
5be9361c 6090 mutex_lock(&sched_hotcpu_mutex);
5c45bf27
SS
6091 detach_destroy_domains(&cpu_online_map);
6092 err = arch_init_sched_domains(&cpu_online_map);
5be9361c 6093 mutex_unlock(&sched_hotcpu_mutex);
5c45bf27
SS
6094
6095 return err;
6096}
6097
6098static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6099{
6100 int ret;
6101
6102 if (buf[0] != '0' && buf[0] != '1')
6103 return -EINVAL;
6104
6105 if (smt)
6106 sched_smt_power_savings = (buf[0] == '1');
6107 else
6108 sched_mc_power_savings = (buf[0] == '1');
6109
6110 ret = arch_reinit_sched_domains();
6111
6112 return ret ? ret : count;
6113}
6114
6115int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
6116{
6117 int err = 0;
48f24c4d 6118
5c45bf27
SS
6119#ifdef CONFIG_SCHED_SMT
6120 if (smt_capable())
6121 err = sysfs_create_file(&cls->kset.kobj,
6122 &attr_sched_smt_power_savings.attr);
6123#endif
6124#ifdef CONFIG_SCHED_MC
6125 if (!err && mc_capable())
6126 err = sysfs_create_file(&cls->kset.kobj,
6127 &attr_sched_mc_power_savings.attr);
6128#endif
6129 return err;
6130}
6131#endif
6132
6133#ifdef CONFIG_SCHED_MC
6134static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
6135{
6136 return sprintf(page, "%u\n", sched_mc_power_savings);
6137}
48f24c4d
IM
6138static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
6139 const char *buf, size_t count)
5c45bf27
SS
6140{
6141 return sched_power_savings_store(buf, count, 0);
6142}
6143SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
6144 sched_mc_power_savings_store);
6145#endif
6146
6147#ifdef CONFIG_SCHED_SMT
6148static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
6149{
6150 return sprintf(page, "%u\n", sched_smt_power_savings);
6151}
48f24c4d
IM
6152static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
6153 const char *buf, size_t count)
5c45bf27
SS
6154{
6155 return sched_power_savings_store(buf, count, 1);
6156}
6157SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6158 sched_smt_power_savings_store);
6159#endif
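Once sched_create_sysfs_power_savings_entries() above has registered these attributes on the cpu sysdev class, they appear (with the usual sysfs layout) as /sys/devices/system/cpu/sched_mc_power_savings and .../sched_smt_power_savings; writing '1' or '0' flips the corresponding policy and rebuilds the domains via arch_reinit_sched_domains().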
6160
1da177e4
LT
6161/*
6162 * Force a reinitialization of the sched domains hierarchy. The domains
6163 * and groups cannot be updated in place without racing with the balancing
41c7ce9a 6164 * code, so we temporarily attach all running cpus to the NULL domain
1da177e4
LT
6165 * which will prevent rebalancing while the sched domains are recalculated.
6166 */
6167static int update_sched_domains(struct notifier_block *nfb,
6168 unsigned long action, void *hcpu)
6169{
1da177e4
LT
6170 switch (action) {
6171 case CPU_UP_PREPARE:
8bb78442 6172 case CPU_UP_PREPARE_FROZEN:
1da177e4 6173 case CPU_DOWN_PREPARE:
8bb78442 6174 case CPU_DOWN_PREPARE_FROZEN:
1a20ff27 6175 detach_destroy_domains(&cpu_online_map);
1da177e4
LT
6176 return NOTIFY_OK;
6177
6178 case CPU_UP_CANCELED:
8bb78442 6179 case CPU_UP_CANCELED_FROZEN:
1da177e4 6180 case CPU_DOWN_FAILED:
8bb78442 6181 case CPU_DOWN_FAILED_FROZEN:
1da177e4 6182 case CPU_ONLINE:
8bb78442 6183 case CPU_ONLINE_FROZEN:
1da177e4 6184 case CPU_DEAD:
8bb78442 6185 case CPU_DEAD_FROZEN:
1da177e4
LT
6186 /*
6187 * Fall through and re-initialise the domains.
6188 */
6189 break;
6190 default:
6191 return NOTIFY_DONE;
6192 }
6193
6194 /* The hotplug lock is already held by cpu_up/cpu_down */
1a20ff27 6195 arch_init_sched_domains(&cpu_online_map);
1da177e4
LT
6196
6197 return NOTIFY_OK;
6198}
1da177e4
LT
6199
6200void __init sched_init_smp(void)
6201{
5c1e1767
NP
6202 cpumask_t non_isolated_cpus;
6203
5be9361c 6204 mutex_lock(&sched_hotcpu_mutex);
1a20ff27 6205 arch_init_sched_domains(&cpu_online_map);
e5e5673f 6206 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
5c1e1767
NP
6207 if (cpus_empty(non_isolated_cpus))
6208 cpu_set(smp_processor_id(), non_isolated_cpus);
5be9361c 6209 mutex_unlock(&sched_hotcpu_mutex);
1da177e4
LT
6210 /* XXX: Theoretical race here - CPU may be hotplugged now */
6211 hotcpu_notifier(update_sched_domains, 0);
5c1e1767
NP
6212
6213 /* Move init over to a non-isolated CPU */
6214 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6215 BUG();
dd41f596 6216 sched_init_granularity();
1da177e4
LT
6217}
6218#else
6219void __init sched_init_smp(void)
6220{
dd41f596 6221 sched_init_granularity();
1da177e4
LT
6222}
6223#endif /* CONFIG_SMP */
6224
6225int in_sched_functions(unsigned long addr)
6226{
6227 /* Linker adds these: start and end of __sched functions */
6228 extern char __sched_text_start[], __sched_text_end[];
48f24c4d 6229
1da177e4
LT
6230 return in_lock_functions(addr) ||
6231 (addr >= (unsigned long)__sched_text_start
6232 && addr < (unsigned long)__sched_text_end);
6233}
6234
dd41f596
IM
6235static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6236{
6237 cfs_rq->tasks_timeline = RB_ROOT;
6238 cfs_rq->fair_clock = 1;
6239#ifdef CONFIG_FAIR_GROUP_SCHED
6240 cfs_rq->rq = rq;
6241#endif
6242}
6243
1da177e4
LT
6244void __init sched_init(void)
6245{
dd41f596 6246 u64 now = sched_clock();
476f3534 6247 int highest_cpu = 0;
dd41f596
IM
6248 int i, j;
6249
6250 /*
6251 * Link up the scheduling class hierarchy:
6252 */
6253 rt_sched_class.next = &fair_sched_class;
6254 fair_sched_class.next = &idle_sched_class;
6255 idle_sched_class.next = NULL;
1da177e4 6256
0a945022 6257 for_each_possible_cpu(i) {
dd41f596 6258 struct rt_prio_array *array;
70b97a7f 6259 struct rq *rq;
1da177e4
LT
6260
6261 rq = cpu_rq(i);
6262 spin_lock_init(&rq->lock);
fcb99371 6263 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
7897986b 6264 rq->nr_running = 0;
dd41f596
IM
6265 rq->clock = 1;
6266 init_cfs_rq(&rq->cfs, rq);
6267#ifdef CONFIG_FAIR_GROUP_SCHED
6268 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6269 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
6270#endif
6271 rq->ls.load_update_last = now;
6272 rq->ls.load_update_start = now;
1da177e4 6273
dd41f596
IM
6274 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6275 rq->cpu_load[j] = 0;
1da177e4 6276#ifdef CONFIG_SMP
41c7ce9a 6277 rq->sd = NULL;
1da177e4 6278 rq->active_balance = 0;
dd41f596 6279 rq->next_balance = jiffies;
1da177e4 6280 rq->push_cpu = 0;
0a2966b4 6281 rq->cpu = i;
1da177e4
LT
6282 rq->migration_thread = NULL;
6283 INIT_LIST_HEAD(&rq->migration_queue);
6284#endif
6285 atomic_set(&rq->nr_iowait, 0);
6286
dd41f596
IM
6287 array = &rq->rt.active;
6288 for (j = 0; j < MAX_RT_PRIO; j++) {
6289 INIT_LIST_HEAD(array->queue + j);
6290 __clear_bit(j, array->bitmap);
1da177e4 6291 }
476f3534 6292 highest_cpu = i;
dd41f596
IM
6293 /* delimiter for bitsearch: */
6294 __set_bit(MAX_RT_PRIO, array->bitmap);
1da177e4
LT
6295 }
6296
2dd73a4f 6297 set_load_weight(&init_task);
b50f60ce 6298
c9819f45 6299#ifdef CONFIG_SMP
476f3534 6300 nr_cpu_ids = highest_cpu + 1;
c9819f45
CL
6301 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
6302#endif
6303
b50f60ce
HC
6304#ifdef CONFIG_RT_MUTEXES
6305 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6306#endif
6307
1da177e4
LT
6308 /*
6309 * The boot idle thread does lazy MMU switching as well:
6310 */
6311 atomic_inc(&init_mm.mm_count);
6312 enter_lazy_tlb(&init_mm, current);
6313
6314 /*
6315 * Make us the idle thread. Technically, schedule() should not be
6316 * called from this thread; however, somewhere below it might be,
6317 * and because we are the idle thread, we just pick up running again
6318 * when this runqueue becomes "idle".
6319 */
6320 init_idle(current, smp_processor_id());
dd41f596
IM
6321 /*
6322 * During early bootup we pretend to be a normal task:
6323 */
6324 current->sched_class = &fair_sched_class;
1da177e4
LT
6325}
6326
6327#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
6328void __might_sleep(char *file, int line)
6329{
48f24c4d 6330#ifdef in_atomic
1da177e4
LT
6331 static unsigned long prev_jiffy; /* ratelimiting */
6332
6333 if ((in_atomic() || irqs_disabled()) &&
6334 system_state == SYSTEM_RUNNING && !oops_in_progress) {
6335 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6336 return;
6337 prev_jiffy = jiffies;
91368d73 6338 printk(KERN_ERR "BUG: sleeping function called from invalid"
1da177e4
LT
6339 " context at %s:%d\n", file, line);
6340 printk("in_atomic():%d, irqs_disabled():%d\n",
6341 in_atomic(), irqs_disabled());
a4c410f0 6342 debug_show_held_locks(current);
3117df04
IM
6343 if (irqs_disabled())
6344 print_irqtrace_events(current);
1da177e4
LT
6345 dump_stack();
6346 }
6347#endif
6348}
6349EXPORT_SYMBOL(__might_sleep);
6350#endif
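An illustrative way to trigger the check above on a kernel that tracks preempt_count (lock names are hypothetical): mutex_lock() calls might_sleep(), which lands in __might_sleep() while the held spinlock has made the context atomic:

	static DEFINE_SPINLOCK(my_lock);
	static DEFINE_MUTEX(my_mutex);

	static void buggy(void)
	{
		spin_lock(&my_lock);
		/* BUG: sleeping function called from invalid context */
		mutex_lock(&my_mutex);
		mutex_unlock(&my_mutex);
		spin_unlock(&my_lock);
	}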
6351
6352#ifdef CONFIG_MAGIC_SYSRQ
6353void normalize_rt_tasks(void)
6354{
a0f98a1c 6355 struct task_struct *g, *p;
1da177e4 6356 unsigned long flags;
70b97a7f 6357 struct rq *rq;
dd41f596 6358 int on_rq;
1da177e4
LT
6359
6360 read_lock_irq(&tasklist_lock);
a0f98a1c 6361 do_each_thread(g, p) {
dd41f596
IM
6362 p->se.fair_key = 0;
6363 p->se.wait_runtime = 0;
6364 p->se.wait_start_fair = 0;
6365 p->se.wait_start = 0;
6366 p->se.exec_start = 0;
6367 p->se.sleep_start = 0;
6368 p->se.sleep_start_fair = 0;
6369 p->se.block_start = 0;
6370 task_rq(p)->cfs.fair_clock = 0;
6371 task_rq(p)->clock = 0;
6372
6373 if (!rt_task(p)) {
6374 /*
6375 * Renice negative nice level userspace
6376 * tasks back to 0:
6377 */
6378 if (TASK_NICE(p) < 0 && p->mm)
6379 set_user_nice(p, 0);
1da177e4 6380 continue;
dd41f596 6381 }
1da177e4 6382
b29739f9
IM
6383 spin_lock_irqsave(&p->pi_lock, flags);
6384 rq = __task_rq_lock(p);
dd41f596
IM
6385#ifdef CONFIG_SMP
6386 /*
6387 * Do not touch the migration thread:
6388 */
6389 if (p == rq->migration_thread)
6390 goto out_unlock;
6391#endif
1da177e4 6392
dd41f596
IM
6393 on_rq = p->se.on_rq;
6394 if (on_rq)
6395 deactivate_task(task_rq(p), p, 0);
6396 __setscheduler(rq, p, SCHED_NORMAL, 0);
6397 if (on_rq) {
6398 activate_task(task_rq(p), p, 0);
1da177e4
LT
6399 resched_task(rq->curr);
6400 }
dd41f596
IM
6401#ifdef CONFIG_SMP
6402 out_unlock:
6403#endif
b29739f9
IM
6404 __task_rq_unlock(rq);
6405 spin_unlock_irqrestore(&p->pi_lock, flags);
a0f98a1c
IM
6406 } while_each_thread(g, p);
6407
1da177e4
LT
6408 read_unlock_irq(&tasklist_lock);
6409}
6410
6411#endif /* CONFIG_MAGIC_SYSRQ */
1df5c10a
LT
6412
6413#ifdef CONFIG_IA64
6414/*
6415 * These functions are only useful for the IA64 MCA handling.
6416 *
6417 * They can only be called when the whole system has been
6418 * stopped - every CPU needs to be quiescent, and no scheduling
6419 * activity can take place. Using them for anything else would
6420 * be a serious bug, and as a result, they aren't even visible
6421 * under any other configuration.
6422 */
6423
6424/**
6425 * curr_task - return the current task for a given cpu.
6426 * @cpu: the processor in question.
6427 *
6428 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6429 */
36c8b586 6430struct task_struct *curr_task(int cpu)
1df5c10a
LT
6431{
6432 return cpu_curr(cpu);
6433}
6434
6435/**
6436 * set_curr_task - set the current task for a given cpu.
6437 * @cpu: the processor in question.
6438 * @p: the task pointer to set.
6439 *
6440 * Description: This function must only be used when non-maskable interrupts
6441 * are serviced on a separate stack. It allows the architecture to switch the
6442 * notion of the current task on a cpu in a non-blocking manner. This function
6443 * must be called with all CPUs synchronized and interrupts disabled; the
6444 * caller must save the original value of the current task (see
6445 * curr_task() above) and restore that value before reenabling interrupts and
6446 * restarting the system.
6447 *
6448 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6449 */
36c8b586 6450void set_curr_task(int cpu, struct task_struct *p)
1df5c10a
LT
6451{
6452 cpu_curr(cpu) = p;
6453}
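A sketch of the save/switch/restore pattern the comment describes, with the whole system stopped; cpu and recovery_task are hypothetical values supplied by the MCA handler:

	struct task_struct *orig = curr_task(cpu);

	set_curr_task(cpu, recovery_task);
	/* ... run the recovery code on its separate stack ... */
	set_curr_task(cpu, orig);	/* restore before re-enabling interrupts */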
6454
6455#endif