[PATCH] sched: introduce child field in sched_domain
[deliverable/linux.git] / kernel / sched.c
index f9b3c6a414f189bf443044b9947715cb94825fa5..0feeacb9149749a04219eba1f6f36c4aed30065c 100644 (file)
@@ -49,7 +49,7 @@
 #include <linux/seq_file.h>
 #include <linux/syscalls.h>
 #include <linux/times.h>
-#include <linux/acct.h>
+#include <linux/tsacct_kern.h>
 #include <linux/kprobes.h>
 #include <linux/delayacct.h>
 #include <asm/tlb.h>
@@ -1286,21 +1286,29 @@ static int sched_balance_self(int cpu, int flag)
        while (sd) {
                cpumask_t span;
                struct sched_group *group;
-               int new_cpu;
-               int weight;
+               int new_cpu, weight;
+
+               if (!(sd->flags & flag)) {
+                       sd = sd->child;
+                       continue;
+               }
 
                span = sd->span;
                group = find_idlest_group(sd, t, cpu);
-               if (!group)
-                       goto nextlevel;
+               if (!group) {
+                       sd = sd->child;
+                       continue;
+               }
 
                new_cpu = find_idlest_cpu(group, t, cpu);
-               if (new_cpu == -1 || new_cpu == cpu)
-                       goto nextlevel;
+               if (new_cpu == -1 || new_cpu == cpu) {
+                       /* Now try balancing at a lower domain level of cpu */
+                       sd = sd->child;
+                       continue;
+               }
 
-               /* Now try balancing at a lower domain level */
+               /* Now try balancing at a lower domain level of new_cpu */
                cpu = new_cpu;
-nextlevel:
                sd = NULL;
                weight = cpus_weight(span);
                for_each_domain(cpu, tmp) {
@@ -1755,27 +1763,27 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
        __releases(rq->lock)
 {
        struct mm_struct *mm = rq->prev_mm;
-       unsigned long prev_task_flags;
+       long prev_state;
 
        rq->prev_mm = NULL;
 
        /*
         * A task struct has one reference for the use as "current".
-        * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and
-        * calls schedule one last time. The schedule call will never return,
-        * and the scheduled task must drop that reference.
-        * The test for EXIT_ZOMBIE must occur while the runqueue locks are
+        * If a task dies, then it sets TASK_DEAD in tsk->state and calls
+        * schedule one last time. The schedule call will never return, and
+        * the scheduled task must drop that reference.
+        * The test for TASK_DEAD must occur while the runqueue locks are
         * still held, otherwise prev could be scheduled on another cpu, die
         * there before we look at prev->state, and then the reference would
         * be dropped twice.
         *              Manfred Spraul <manfred@colorfullife.com>
         */
-       prev_task_flags = prev->flags;
+       prev_state = prev->state;
        finish_arch_switch(prev);
        finish_lock_switch(rq, prev);
        if (mm)
                mmdrop(mm);
-       if (unlikely(prev_task_flags & PF_DEAD)) {
+       if (unlikely(prev_state == TASK_DEAD)) {
                /*
                 * Remove function-return probe instances associated with this
                 * task and put them back on the free list.
@@ -3348,9 +3356,6 @@ need_resched_nonpreemptible:
 
        spin_lock_irq(&rq->lock);
 
-       if (unlikely(prev->flags & PF_DEAD))
-               prev->state = EXIT_DEAD;
-
        switch_count = &prev->nivcsw;
        if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
                switch_count = &prev->nvcsw;
@@ -4109,35 +4114,32 @@ recheck:
            (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
            (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
                return -EINVAL;
-       if ((policy == SCHED_NORMAL || policy == SCHED_BATCH)
-                                       != (param->sched_priority == 0))
+       if (is_rt_policy(policy) != (param->sched_priority != 0))
                return -EINVAL;
 
        /*
         * Allow unprivileged RT tasks to decrease priority:
         */
        if (!capable(CAP_SYS_NICE)) {
-               unsigned long rlim_rtprio;
-               unsigned long flags;
-
-               if (!lock_task_sighand(p, &flags))
-                       return -ESRCH;
-               rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
-               unlock_task_sighand(p, &flags);
+               if (is_rt_policy(policy)) {
+                       unsigned long rlim_rtprio;
+                       unsigned long flags;
+
+                       if (!lock_task_sighand(p, &flags))
+                               return -ESRCH;
+                       rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
+                       unlock_task_sighand(p, &flags);
+
+                       /* can't set/change the rt policy */
+                       if (policy != p->policy && !rlim_rtprio)
+                               return -EPERM;
+
+                       /* can't increase priority */
+                       if (param->sched_priority > p->rt_priority &&
+                           param->sched_priority > rlim_rtprio)
+                               return -EPERM;
+               }
 
-               /*
-                * can't change policy, except between SCHED_NORMAL
-                * and SCHED_BATCH:
-                */
-               if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) &&
-                       (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) &&
-                               !rlim_rtprio)
-                       return -EPERM;
-               /* can't increase priority */
-               if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) &&
-                   param->sched_priority > p->rt_priority &&
-                   param->sched_priority > rlim_rtprio)
-                       return -EPERM;
                /* can't change other user's priorities */
                if ((current->euid != p->euid) &&
                    (current->euid != p->uid))
@@ -4390,7 +4392,10 @@ EXPORT_SYMBOL(cpu_present_map);
 
 #ifndef CONFIG_SMP
 cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
+EXPORT_SYMBOL(cpu_online_map);
+
 cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
+EXPORT_SYMBOL(cpu_possible_map);
 #endif
 
 long sched_getaffinity(pid_t pid, cpumask_t *mask)
@@ -4820,7 +4825,7 @@ void show_state(void)
  * NOTE: this function does not set the idle thread's NEED_RESCHED
  * flag, to make booting more robust.
  */
-void __devinit init_idle(struct task_struct *idle, int cpu)
+void __cpuinit init_idle(struct task_struct *idle, int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
        unsigned long flags;
@@ -5159,7 +5164,7 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
        BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
 
        /* Cannot have done final schedule yet: would have vanished. */
-       BUG_ON(p->flags & PF_DEAD);
+       BUG_ON(p->state == TASK_DEAD);
 
        get_task_struct(p);
 
@@ -5451,12 +5456,18 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu)
                struct sched_domain *parent = tmp->parent;
                if (!parent)
                        break;
-               if (sd_parent_degenerate(tmp, parent))
+               if (sd_parent_degenerate(tmp, parent)) {
                        tmp->parent = parent->parent;
+                       if (parent->parent)
+                               parent->parent->child = tmp;
+               }
        }
 
-       if (sd && sd_degenerate(sd))
+       if (sd && sd_degenerate(sd)) {
                sd = sd->parent;
+               if (sd)
+                       sd->child = NULL;
+       }
 
        sched_domain_debug(sd, cpu);
 
@@ -5464,7 +5475,7 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu)
 }
 
 /* cpus with isolated domains */
-static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
+static cpumask_t __cpuinitdata cpu_isolated_map = CPU_MASK_NONE;
 
 /* Setup the mask of cpus configured for isolated domains */
 static int __init isolated_cpu_setup(char *str)
@@ -5492,15 +5503,17 @@ __setup ("isolcpus=", isolated_cpu_setup);
  * covered by the given span, and will set each group's ->cpumask correctly,
  * and ->cpu_power to 0.
  */
-static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
-                                   int (*group_fn)(int cpu))
+static void
+init_sched_build_groups(struct sched_group groups[], cpumask_t span,
+                       const cpumask_t *cpu_map,
+                       int (*group_fn)(int cpu, const cpumask_t *cpu_map))
 {
        struct sched_group *first = NULL, *last = NULL;
        cpumask_t covered = CPU_MASK_NONE;
        int i;
 
        for_each_cpu_mask(i, span) {
-               int group = group_fn(i);
+               int group = group_fn(i, cpu_map);
                struct sched_group *sg = &groups[group];
                int j;
 
@@ -5511,7 +5524,7 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
                sg->cpu_power = 0;
 
                for_each_cpu_mask(j, span) {
-                       if (group_fn(j) != group)
+                       if (group_fn(j, cpu_map) != group)
                                continue;
 
                        cpu_set(j, covered);
@@ -5978,13 +5991,15 @@ static void calibrate_migration_costs(const cpumask_t *cpu_map)
 #endif
                );
        if (system_state == SYSTEM_BOOTING) {
-               printk("migration_cost=");
-               for (distance = 0; distance <= max_distance; distance++) {
-                       if (distance)
-                               printk(",");
-                       printk("%ld", (long)migration_cost[distance] / 1000);
+               if (num_online_cpus() > 1) {
+                       printk("migration_cost=");
+                       for (distance = 0; distance <= max_distance; distance++) {
+                               if (distance)
+                                       printk(",");
+                               printk("%ld", (long)migration_cost[distance] / 1000);
+                       }
+                       printk("\n");
                }
-               printk("\n");
        }
        j1 = jiffies;
        if (migration_debug)
@@ -6087,7 +6102,7 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
 static struct sched_group sched_group_cpus[NR_CPUS];
 
-static int cpu_to_cpu_group(int cpu)
+static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map)
 {
        return cpu;
 }
@@ -6098,31 +6113,36 @@ static int cpu_to_cpu_group(int cpu)
  */
 #ifdef CONFIG_SCHED_MC
 static DEFINE_PER_CPU(struct sched_domain, core_domains);
-static struct sched_group *sched_group_core_bycpu[NR_CPUS];
+static struct sched_group sched_group_core[NR_CPUS];
 #endif
 
 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
-static int cpu_to_core_group(int cpu)
+static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map)
 {
-       return first_cpu(cpu_sibling_map[cpu]);
+       cpumask_t mask = cpu_sibling_map[cpu];
+       cpus_and(mask, mask, *cpu_map);
+       return first_cpu(mask);
 }
 #elif defined(CONFIG_SCHED_MC)
-static int cpu_to_core_group(int cpu)
+static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map)
 {
        return cpu;
 }
 #endif
 
 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-static struct sched_group *sched_group_phys_bycpu[NR_CPUS];
+static struct sched_group sched_group_phys[NR_CPUS];
 
-static int cpu_to_phys_group(int cpu)
+static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map)
 {
 #ifdef CONFIG_SCHED_MC
        cpumask_t mask = cpu_coregroup_map(cpu);
+       cpus_and(mask, mask, *cpu_map);
        return first_cpu(mask);
 #elif defined(CONFIG_SCHED_SMT)
-       return first_cpu(cpu_sibling_map[cpu]);
+       cpumask_t mask = cpu_sibling_map[cpu];
+       cpus_and(mask, mask, *cpu_map);
+       return first_cpu(mask);
 #else
        return cpu;
 #endif
@@ -6140,7 +6160,7 @@ static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
 static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
 
-static int cpu_to_allnodes_group(int cpu)
+static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map)
 {
        return cpu_to_node(cpu);
 }
@@ -6172,12 +6192,11 @@ next_sg:
 }
 #endif
 
+#ifdef CONFIG_NUMA
 /* Free memory allocated for various sched_group structures */
 static void free_sched_groups(const cpumask_t *cpu_map)
 {
-       int cpu;
-#ifdef CONFIG_NUMA
-       int i;
+       int cpu, i;
 
        for_each_cpu_mask(cpu, *cpu_map) {
                struct sched_group *sched_group_allnodes
@@ -6214,20 +6233,12 @@ next_sg:
                kfree(sched_group_nodes);
                sched_group_nodes_bycpu[cpu] = NULL;
        }
-#endif
-       for_each_cpu_mask(cpu, *cpu_map) {
-               if (sched_group_phys_bycpu[cpu]) {
-                       kfree(sched_group_phys_bycpu[cpu]);
-                       sched_group_phys_bycpu[cpu] = NULL;
-               }
-#ifdef CONFIG_SCHED_MC
-               if (sched_group_core_bycpu[cpu]) {
-                       kfree(sched_group_core_bycpu[cpu]);
-                       sched_group_core_bycpu[cpu] = NULL;
-               }
-#endif
-       }
 }
+#else
+static void free_sched_groups(const cpumask_t *cpu_map)
+{
+}
+#endif
 
 /*
  * Build sched domains for a given set of cpus and attach the sched domains
@@ -6236,10 +6247,6 @@ next_sg:
 static int build_sched_domains(const cpumask_t *cpu_map)
 {
        int i;
-       struct sched_group *sched_group_phys = NULL;
-#ifdef CONFIG_SCHED_MC
-       struct sched_group *sched_group_core = NULL;
-#endif
 #ifdef CONFIG_NUMA
        struct sched_group **sched_group_nodes = NULL;
        struct sched_group *sched_group_allnodes = NULL;
@@ -6285,7 +6292,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
                        sd = &per_cpu(allnodes_domains, i);
                        *sd = SD_ALLNODES_INIT;
                        sd->span = *cpu_map;
-                       group = cpu_to_allnodes_group(i);
+                       group = cpu_to_allnodes_group(i, cpu_map);
                        sd->groups = &sched_group_allnodes[group];
                        p = sd;
                } else
@@ -6295,60 +6302,42 @@ static int build_sched_domains(const cpumask_t *cpu_map)
                *sd = SD_NODE_INIT;
                sd->span = sched_domain_node_span(cpu_to_node(i));
                sd->parent = p;
+               if (p)
+                       p->child = sd;
                cpus_and(sd->span, sd->span, *cpu_map);
 #endif
 
-               if (!sched_group_phys) {
-                       sched_group_phys
-                               = kmalloc(sizeof(struct sched_group) * NR_CPUS,
-                                         GFP_KERNEL);
-                       if (!sched_group_phys) {
-                               printk (KERN_WARNING "Can not alloc phys sched"
-                                                    "group\n");
-                               goto error;
-                       }
-                       sched_group_phys_bycpu[i] = sched_group_phys;
-               }
-
                p = sd;
                sd = &per_cpu(phys_domains, i);
-               group = cpu_to_phys_group(i);
+               group = cpu_to_phys_group(i, cpu_map);
                *sd = SD_CPU_INIT;
                sd->span = nodemask;
                sd->parent = p;
+               if (p)
+                       p->child = sd;
                sd->groups = &sched_group_phys[group];
 
 #ifdef CONFIG_SCHED_MC
-               if (!sched_group_core) {
-                       sched_group_core
-                               = kmalloc(sizeof(struct sched_group) * NR_CPUS,
-                                         GFP_KERNEL);
-                       if (!sched_group_core) {
-                               printk (KERN_WARNING "Can not alloc core sched"
-                                                    "group\n");
-                               goto error;
-                       }
-                       sched_group_core_bycpu[i] = sched_group_core;
-               }
-
                p = sd;
                sd = &per_cpu(core_domains, i);
-               group = cpu_to_core_group(i);
+               group = cpu_to_core_group(i, cpu_map);
                *sd = SD_MC_INIT;
                sd->span = cpu_coregroup_map(i);
                cpus_and(sd->span, sd->span, *cpu_map);
                sd->parent = p;
+               p->child = sd;
                sd->groups = &sched_group_core[group];
 #endif
 
 #ifdef CONFIG_SCHED_SMT
                p = sd;
                sd = &per_cpu(cpu_domains, i);
-               group = cpu_to_cpu_group(i);
+               group = cpu_to_cpu_group(i, cpu_map);
                *sd = SD_SIBLING_INIT;
                sd->span = cpu_sibling_map[i];
                cpus_and(sd->span, sd->span, *cpu_map);
                sd->parent = p;
+               p->child = sd;
                sd->groups = &sched_group_cpus[group];
 #endif
        }
@@ -6362,7 +6351,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
                        continue;
 
                init_sched_build_groups(sched_group_cpus, this_sibling_map,
-                                               &cpu_to_cpu_group);
+                                       cpu_map, &cpu_to_cpu_group);
        }
 #endif
 
@@ -6374,7 +6363,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
                if (i != first_cpu(this_core_map))
                        continue;
                init_sched_build_groups(sched_group_core, this_core_map,
-                                       &cpu_to_core_group);
+                                       cpu_map, &cpu_to_core_group);
        }
 #endif
 
@@ -6388,14 +6377,14 @@ static int build_sched_domains(const cpumask_t *cpu_map)
                        continue;
 
                init_sched_build_groups(sched_group_phys, nodemask,
-                                               &cpu_to_phys_group);
+                                       cpu_map, &cpu_to_phys_group);
        }
 
 #ifdef CONFIG_NUMA
        /* Set up node groups */
        if (sched_group_allnodes)
                init_sched_build_groups(sched_group_allnodes, *cpu_map,
-                                       &cpu_to_allnodes_group);
+                                       cpu_map, &cpu_to_allnodes_group);
 
        for (i = 0; i < MAX_NUMNODES; i++) {
                /* Set up node groups */
@@ -6540,7 +6529,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
                init_numa_sched_groups_power(sched_group_nodes[i]);
 
        if (sched_group_allnodes) {
-               int group = cpu_to_allnodes_group(first_cpu(*cpu_map));
+               int group = cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map);
                struct sched_group *sg = &sched_group_allnodes[group];
 
                init_numa_sched_groups_power(sg);
@@ -6566,9 +6555,11 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 
        return 0;
 
+#ifdef CONFIG_NUMA
 error:
        free_sched_groups(cpu_map);
        return -ENOMEM;
+#endif
 }
 /*
  * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
@@ -6750,11 +6741,20 @@ static int update_sched_domains(struct notifier_block *nfb,
 
 void __init sched_init_smp(void)
 {
+       cpumask_t non_isolated_cpus;
+
        lock_cpu_hotplug();
        arch_init_sched_domains(&cpu_online_map);
+       cpus_andnot(non_isolated_cpus, cpu_online_map, cpu_isolated_map);
+       if (cpus_empty(non_isolated_cpus))
+               cpu_set(smp_processor_id(), non_isolated_cpus);
        unlock_cpu_hotplug();
        /* XXX: Theoretical race here - CPU may be hotplugged now */
        hotcpu_notifier(update_sched_domains, 0);
+
+       /* Move init over to a non-isolated CPU */
+       if (set_cpus_allowed(current, non_isolated_cpus) < 0)
+               BUG();
 }
 #else
 void __init sched_init_smp(void)
This page took 0.062741 seconds and 5 git commands to generate.