Merge remote-tracking branch 'rcu/rcu/next'

[deliverable/linux.git] / kernel / sched / core.c
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 2a18856f00ab906b6b553ed58c37c8e4f04397bb..f5f7b3cdf0be0577d69fc44e4390e7d9a1e75c88 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -74,6 +74,7 @@
  #include <linux/context_tracking.h>
  #include <linux/compiler.h>
  #include <linux/frame.h>
+#include <linux/prefetch.h>
  
  #include <asm/switch_to.h>
  #include <asm/tlb.h>
@@ -1271,7 +1272,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
                 /*
                  * Task isn't running anymore; make it appear like we migrated
                  * it before it went to sleep. This means on wakeup we make the
-                * previous cpu our targer instead of where it really is.
+                * previous cpu our target instead of where it really is.
                  */
                 p->wake_cpu = cpu;
         }
@@ -1635,23 +1636,25 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p,
  static void
  ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
  {
-#ifdef CONFIG_SCHEDSTATS
-       struct rq *rq = this_rq();
+       struct rq *rq;
  
-#ifdef CONFIG_SMP
-       int this_cpu = smp_processor_id();
+       if (!schedstat_enabled())
+               return;
+
+       rq = this_rq();
  
-       if (cpu == this_cpu) {
-               schedstat_inc(rq, ttwu_local);
-               schedstat_inc(p, se.statistics.nr_wakeups_local);
+#ifdef CONFIG_SMP
+       if (cpu == rq->cpu) {
+               schedstat_inc(rq->ttwu_local);
+               schedstat_inc(p->se.statistics.nr_wakeups_local);
         } else {
                 struct sched_domain *sd;
  
-               schedstat_inc(p, se.statistics.nr_wakeups_remote);
+               schedstat_inc(p->se.statistics.nr_wakeups_remote);
                 rcu_read_lock();
-               for_each_domain(this_cpu, sd) {
+               for_each_domain(rq->cpu, sd) {
                         if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
-                               schedstat_inc(sd, ttwu_wake_remote);
+                               schedstat_inc(sd->ttwu_wake_remote);
                                 break;
                         }
                 }
@@ -1659,17 +1662,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
         }
  
         if (wake_flags & WF_MIGRATED)
-               schedstat_inc(p, se.statistics.nr_wakeups_migrate);
-
+               schedstat_inc(p->se.statistics.nr_wakeups_migrate);
  #endif /* CONFIG_SMP */
  
-       schedstat_inc(rq, ttwu_count);
-       schedstat_inc(p, se.statistics.nr_wakeups);
+       schedstat_inc(rq->ttwu_count);
+       schedstat_inc(p->se.statistics.nr_wakeups);
  
         if (wake_flags & WF_SYNC)
-               schedstat_inc(p, se.statistics.nr_wakeups_sync);
-
-#endif /* CONFIG_SCHEDSTATS */
+               schedstat_inc(p->se.statistics.nr_wakeups_sync);
  }
  
  static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
@@ -2022,6 +2022,28 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
         success = 1; /* we're going to change ->state */
         cpu = task_cpu(p);
  
+       /*
+        * Ensure we load p->on_rq _after_ p->state, otherwise it would
+        * be possible to, falsely, observe p->on_rq == 0 and get stuck
+        * in smp_cond_load_acquire() below.
+        *
+        * sched_ttwu_pending()                 try_to_wake_up()
+        *   [S] p->on_rq = 1;                  [L] p->state
+        *       UNLOCK rq->lock  -----.
+        *                              \
+        *                               +---   RMB
+        * schedule()                   /
+        *       LOCK rq->lock    -----'
+        *       UNLOCK rq->lock
+        *
+        * [task p]
+        *   [S] p->state = UNINTERRUPTIBLE     [L] p->on_rq
+        *
+        * Pairs with the UNLOCK+LOCK on rq->lock from the
+        * last wakeup of our task and the schedule that got our task
+        * current.
+        */
+       smp_rmb();
         if (p->on_rq && ttwu_remote(p, wake_flags))
                 goto stat;
  
@@ -2068,8 +2090,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
  
         ttwu_queue(p, cpu, wake_flags);
  stat:
-       if (schedstat_enabled())
-               ttwu_stat(p, cpu, wake_flags);
+       ttwu_stat(p, cpu, wake_flags);
  out:
         raw_spin_unlock_irqrestore(&p->pi_lock, flags);
  
@@ -2079,6 +2100,7 @@ out:
  /**
   * try_to_wake_up_local - try to wake up a local task with rq lock held
   * @p: the thread to be awakened
+ * @cookie: context's cookie for pinning
   *
   * Put @p on the run-queue if it's not already there. The caller must
   * ensure that this_rq() is locked, @p is bound to this_rq() and not
@@ -2117,8 +2139,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie
                 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
  
         ttwu_do_wakeup(rq, p, 0, cookie);
-       if (schedstat_enabled())
-               ttwu_stat(p, smp_processor_id(), 0);
+       ttwu_stat(p, smp_processor_id(), 0);
  out:
         raw_spin_unlock(&p->pi_lock);
  }
@@ -2978,6 +2999,23 @@ DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
  EXPORT_PER_CPU_SYMBOL(kstat);
  EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
  
+/*
+ * fair_sched_class.update_curr() accesses the current sched_entity
+ * (cfs_rq->curr) and its ->exec_start field; when it is called from
+ * task_sched_runtime(), we observe a high rate of cache misses in practice.
+ * Prefetching this data results in improved performance.
+ */
+static inline void prefetch_curr_exec_start(struct task_struct *p)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+       struct sched_entity *curr = (&p->se)->cfs_rq->curr;
+#else
+       struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
+#endif
+       prefetch(curr);
+       prefetch(&curr->exec_start);
+}
+
  /*
   * Return accounted runtime for the task.
   * In case the task is currently running, return the runtime plus current's
@@ -3012,6 +3050,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
          * thread, breaking clock_gettime().
          */
         if (task_current(rq, p) && task_on_rq_queued(p)) {
+               prefetch_curr_exec_start(p);
                 update_rq_clock(rq);
                 p->sched_class->update_curr(rq);
         }
@@ -3158,6 +3197,9 @@ static inline void preempt_latency_stop(int val) { }
   */
  static noinline void __schedule_bug(struct task_struct *prev)
  {
+       /* Save this before calling printk(), since that will clobber it */
+       unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
+
         if (oops_in_progress)
                 return;
  
@@ -3168,13 +3210,12 @@ static noinline void __schedule_bug(struct task_struct *prev)
         print_modules();
         if (irqs_disabled())
                 print_irqtrace_events(prev);
-#ifdef CONFIG_DEBUG_PREEMPT
-       if (in_atomic_preempt_off()) {
+       if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
+           && in_atomic_preempt_off()) {
                 pr_err("Preemption disabled at:");
-               print_ip_sym(current->preempt_disable_ip);
+               print_ip_sym(preempt_disable_ip);
                 pr_cont("\n");
         }
-#endif
         if (panic_on_warn)
                 panic("scheduling while atomic\n");
  
@@ -3200,7 +3241,7 @@ static inline void schedule_debug(struct task_struct *prev)
  
         profile_hit(SCHED_PROFILING, __builtin_return_address(0));
  
-       schedstat_inc(this_rq(), sched_count);
+       schedstat_inc(this_rq()->sched_count);
  }
  
  /*
@@ -3369,7 +3410,6 @@ static void __sched notrace __schedule(bool preempt)
  
         balance_callback(rq);
  }
-STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */
  
  static inline void sched_submit_work(struct task_struct *tsk)
  {
@@ -4812,7 +4852,7 @@ SYSCALL_DEFINE0(sched_yield)
  {
         struct rq *rq = this_rq_lock();
  
-       schedstat_inc(rq, yld_count);
+       schedstat_inc(rq->yld_count);
         current->sched_class->yield_task(rq);
  
         /*
@@ -4963,7 +5003,7 @@ again:
  
         yielded = curr->sched_class->yield_to_task(rq, p, preempt);
         if (yielded) {
-               schedstat_inc(rq, yld_count);
+               schedstat_inc(rq->yld_count);
                 /*
                  * Make p's CPU reschedule; pick_next_entity takes care of
                  * fairness.
@@ -5701,6 +5741,7 @@ static int sd_degenerate(struct sched_domain *sd)
                          SD_BALANCE_FORK |
                          SD_BALANCE_EXEC |
                          SD_SHARE_CPUCAPACITY |
+                        SD_ASYM_CPUCAPACITY |
                          SD_SHARE_PKG_RESOURCES |
                          SD_SHARE_POWERDOMAIN)) {
                 if (sd->groups != sd->groups->next)
@@ -5731,6 +5772,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
                                 SD_BALANCE_NEWIDLE |
                                 SD_BALANCE_FORK |
                                 SD_BALANCE_EXEC |
+                               SD_ASYM_CPUCAPACITY |
                                 SD_SHARE_CPUCAPACITY |
                                 SD_SHARE_PKG_RESOURCES |
                                 SD_PREFER_SIBLING |
@@ -6340,23 +6382,32 @@ static int sched_domains_curr_level;
  /*
   * SD_flags allowed in topology descriptions.
   *
- * SD_SHARE_CPUCAPACITY      - describes SMT topologies
- * SD_SHARE_PKG_RESOURCES - describes shared caches
- * SD_NUMA                - describes NUMA topologies
- * SD_SHARE_POWERDOMAIN   - describes shared power domain
+ * These flags are purely descriptive of the topology and do not prescribe
+ * behaviour. Behaviour is artificial and is derived from these flags in the
+ * sd_init() function below:
+ *
+ *   SD_SHARE_CPUCAPACITY   - describes SMT topologies
+ *   SD_SHARE_PKG_RESOURCES - describes shared caches
+ *   SD_NUMA                - describes NUMA topologies
+ *   SD_SHARE_POWERDOMAIN   - describes shared power domain
+ *   SD_ASYM_CPUCAPACITY    - describes mixed capacity topologies
+ *
+ * Odd one out: besides describing a quirk of the topology, this flag also
+ * prescribes the desired behaviour that goes along with it:
   *
- * Odd one out:
- * SD_ASYM_PACKING        - describes SMT quirks
+ *   SD_ASYM_PACKING        - describes SMT quirks
   */
  #define TOPOLOGY_SD_FLAGS              \
         (SD_SHARE_CPUCAPACITY |         \
          SD_SHARE_PKG_RESOURCES |       \
          SD_NUMA |                      \
          SD_ASYM_PACKING |              \
+        SD_ASYM_CPUCAPACITY |          \
          SD_SHARE_POWERDOMAIN)
  
  static struct sched_domain *
-sd_init(struct sched_domain_topology_level *tl, int cpu)
+sd_init(struct sched_domain_topology_level *tl,
+       struct sched_domain *child, int cpu)
  {
         struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
         int sd_weight, sd_flags = 0;
@@ -6408,6 +6459,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
                 .smt_gain               = 0,
                 .max_newidle_lb_cost    = 0,
                 .next_decay_max_lb_cost = jiffies,
+               .child                  = child,
  #ifdef CONFIG_SCHED_DEBUG
                 .name                   = tl->name,
  #endif
@@ -6417,6 +6469,13 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
          * Convert topological properties into behaviour.
          */
  
+       if (sd->flags & SD_ASYM_CPUCAPACITY) {
+               struct sched_domain *t = sd;
+
+               for_each_lower_domain(t)
+                       t->flags |= SD_BALANCE_WAKE;
+       }
+
         if (sd->flags & SD_SHARE_CPUCAPACITY) {
                 sd->flags |= SD_PREFER_SIBLING;
                 sd->imbalance_pct = 110;
@@ -6832,16 +6891,13 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
                 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
                 struct sched_domain *child, int cpu)
  {
-       struct sched_domain *sd = sd_init(tl, cpu);
-       if (!sd)
-               return child;
+       struct sched_domain *sd = sd_init(tl, child, cpu);
  
         cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
         if (child) {
                 sd->level = child->level + 1;
                 sched_domain_level_max = max(sched_domain_level_max, sd->level);
                 child->parent = sd;
-               sd->child = child;
  
                 if (!cpumask_subset(sched_domain_span(child),
                                     sched_domain_span(sd))) {
@@ -6872,6 +6928,7 @@ static int build_sched_domains(const struct cpumask *cpu_map,
         enum s_alloc alloc_state;
         struct sched_domain *sd;
         struct s_data d;
+       struct rq *rq = NULL;
         int i, ret = -ENOMEM;
  
         alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
@@ -6922,11 +6979,22 @@ static int build_sched_domains(const struct cpumask *cpu_map,
         /* Attach the domains */
         rcu_read_lock();
         for_each_cpu(i, cpu_map) {
+               rq = cpu_rq(i);
                 sd = *per_cpu_ptr(d.sd, i);
+
+               /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
+               if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
+                       WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
+
                 cpu_attach_domain(sd, d.rd, i);
         }
         rcu_read_unlock();
  
+       if (rq) {
+               pr_info("span: %*pbl (max cpu_capacity = %lu)\n",
+                       cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
+       }
+
         ret = 0;
  error:
         __free_domain_allocs(&d, alloc_state, cpu_map);
@@ -7489,10 +7557,6 @@ void __init sched_init(void)
  
         set_load_weight(&init_task);
  
-#ifdef CONFIG_PREEMPT_NOTIFIERS
-       INIT_HLIST_HEAD(&init_task.preempt_notifiers);
-#endif
-
         /*
          * The boot idle thread does lazy MMU switching as well:
          */
@@ -7558,6 +7622,7 @@ EXPORT_SYMBOL(__might_sleep);
  void ___might_sleep(const char *file, int line, int preempt_offset)
  {
         static unsigned long prev_jiffy;        /* ratelimiting */
+       unsigned long preempt_disable_ip;
  
         rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
         if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
@@ -7568,6 +7633,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
                 return;
         prev_jiffy = jiffies;
  
+       /* Save this before calling printk(), since that will clobber it */
+       preempt_disable_ip = get_preempt_disable_ip(current);
+
         printk(KERN_ERR
                 "BUG: sleeping function called from invalid context at %s:%d\n",
                         file, line);
@@ -7582,14 +7650,14 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
         debug_show_held_locks(current);
         if (irqs_disabled())
                 print_irqtrace_events(current);
-#ifdef CONFIG_DEBUG_PREEMPT
-       if (!preempt_count_equals(preempt_offset)) {
+       if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
+           && !preempt_count_equals(preempt_offset)) {
                 pr_err("Preemption disabled at:");
-               print_ip_sym(current->preempt_disable_ip);
+               print_ip_sym(preempt_disable_ip);
                 pr_cont("\n");
         }
-#endif
         dump_stack();
+       add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
  }
  EXPORT_SYMBOL(___might_sleep);
  #endif
@@ -7610,12 +7678,10 @@ void normalize_rt_tasks(void)
                 if (p->flags & PF_KTHREAD)
                         continue;
  
-               p->se.exec_start                = 0;
-#ifdef CONFIG_SCHEDSTATS
-               p->se.statistics.wait_start     = 0;
-               p->se.statistics.sleep_start    = 0;
-               p->se.statistics.block_start    = 0;
-#endif
+               p->se.exec_start = 0;
+               schedstat_set(p->se.statistics.wait_start,  0);
+               schedstat_set(p->se.statistics.sleep_start, 0);
+               schedstat_set(p->se.statistics.block_start, 0);
  
                 if (!dl_task(p) && !rt_task(p)) {
                         /*