drivers/cpufreq/cpufreq_governor.c

   1 /*
   2  * drivers/cpufreq/cpufreq_governor.c
   3  *
   4  * CPUFREQ governors common code
   5  *
   6  * Copyright    (C) 2001 Russell King
   7  *              (C) 2003 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>.
   8  *              (C) 2003 Jun Nakajima <jun.nakajima@intel.com>
   9  *              (C) 2009 Alexander Clouter <alex@digriz.org.uk>
  10  *              (c) 2012 Viresh Kumar <viresh.kumar@linaro.org>
  11  *
  12  * This program is free software; you can redistribute it and/or modify
  13  * it under the terms of the GNU General Public License version 2 as
  14  * published by the Free Software Foundation.
  15  */
  16
  17 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  18
  19 #include <linux/export.h>
  20 #include <linux/kernel_stat.h>
  21 #include <linux/slab.h>
  22
  23 #include "cpufreq_governor.h"
  24
/*
 * Global lock taken by cpufreq_governor_dbs() around the handling of every
 * governor event, so that policy init/exit/start/stop/limits processing in
 * this file never runs concurrently.  Exported for code outside this file.
 */
DEFINE_MUTEX(dbs_data_mutex);
EXPORT_SYMBOL_GPL(dbs_data_mutex);
  27
  28 /* Common sysfs tunables */
  29 /**
  30  * store_sampling_rate - update sampling rate effective immediately if needed.
  31  *
  32  * If new rate is smaller than the old, simply updating
  33  * dbs.sampling_rate might not be appropriate. For example, if the
  34  * original sampling_rate was 1 second and the requested new sampling rate is 10
  35  * ms because the user needs immediate reaction from ondemand governor, but not
  36  * sure if higher frequency will be required or not, then, the governor may
  37  * change the sampling rate too late; up to 1 second later. Thus, if we are
  38  * reducing the sampling rate, we need to make the new value effective
  39  * immediately.
  40  *
  41  * This must be called with dbs_data->mutex held, otherwise traversing
  42  * policy_dbs_list isn't safe.
  43  */
  44 ssize_t store_sampling_rate(struct dbs_data *dbs_data, const char *buf,
  45                             size_t count)
  46 {
  47         struct policy_dbs_info *policy_dbs;
  48         unsigned int rate;
  49         int ret;
  50         ret = sscanf(buf, "%u", &rate);
  51         if (ret != 1)
  52                 return -EINVAL;
  53
  54         dbs_data->sampling_rate = max(rate, dbs_data->min_sampling_rate);
  55
  56         /*
  57          * We are operating under dbs_data->mutex and so the list and its
  58          * entries can't be freed concurrently.
  59          */
  60         list_for_each_entry(policy_dbs, &dbs_data->policy_dbs_list, list) {
  61                 mutex_lock(&policy_dbs->timer_mutex);
  62                 /*
  63                  * On 32-bit architectures this may race with the
  64                  * sample_delay_ns read in dbs_update_util_handler(), but that
  65                  * really doesn't matter.  If the read returns a value that's
  66                  * too big, the sample will be skipped, but the next invocation
  67                  * of dbs_update_util_handler() (when the update has been
  68                  * completed) will take a sample.
  69                  *
  70                  * If this runs in parallel with dbs_work_handler(), we may end
  71                  * up overwriting the sample_delay_ns value that it has just
  72                  * written, but it will be corrected next time a sample is
  73                  * taken, so it shouldn't be significant.
  74                  */
  75                 gov_update_sample_delay(policy_dbs, 0);
  76                 mutex_unlock(&policy_dbs->timer_mutex);
  77         }
  78
  79         return count;
  80 }
  81 EXPORT_SYMBOL_GPL(store_sampling_rate);
  82
  83 static inline struct dbs_data *to_dbs_data(struct kobject *kobj)
  84 {
  85         return container_of(kobj, struct dbs_data, kobj);
  86 }
  87
  88 static inline struct governor_attr *to_gov_attr(struct attribute *attr)
  89 {
  90         return container_of(attr, struct governor_attr, attr);
  91 }
  92
  93 static ssize_t governor_show(struct kobject *kobj, struct attribute *attr,
  94                              char *buf)
  95 {
  96         struct dbs_data *dbs_data = to_dbs_data(kobj);
  97         struct governor_attr *gattr = to_gov_attr(attr);
  98         int ret = -EIO;
  99
 100         if (gattr->show)
 101                 ret = gattr->show(dbs_data, buf);
 102
 103         return ret;
 104 }
 105
 106 static ssize_t governor_store(struct kobject *kobj, struct attribute *attr,
 107                               const char *buf, size_t count)
 108 {
 109         struct dbs_data *dbs_data = to_dbs_data(kobj);
 110         struct governor_attr *gattr = to_gov_attr(attr);
 111         int ret = -EIO;
 112
 113         mutex_lock(&dbs_data->mutex);
 114
 115         if (dbs_data->usage_count && gattr->store)
 116                 ret = gattr->store(dbs_data, buf, count);
 117
 118         mutex_unlock(&dbs_data->mutex);
 119
 120         return ret;
 121 }
 122
/*
 * Sysfs ops for accessing governor attributes.
 *
 * Every show/store invocation for a governor-specific sysfs attribute first
 * enters governor_show()/governor_store() below, which then call the
 * attribute-specific callback.
 */
static const struct sysfs_ops governor_sysfs_ops = {
	.show   = governor_show,
	.store  = governor_store,
};
 134
 135 unsigned int dbs_update(struct cpufreq_policy *policy)
 136 {
 137         struct dbs_governor *gov = dbs_governor_of(policy);
 138         struct policy_dbs_info *policy_dbs = policy->governor_data;
 139         struct dbs_data *dbs_data = policy_dbs->dbs_data;
 140         unsigned int ignore_nice = dbs_data->ignore_nice_load;
 141         unsigned int max_load = 0;
 142         unsigned int sampling_rate, io_busy, j;
 143
 144         /*
 145          * Sometimes governors may use an additional multiplier to increase
 146          * sample delays temporarily.  Apply that multiplier to sampling_rate
 147          * so as to keep the wake-up-from-idle detection logic a bit
 148          * conservative.
 149          */
 150         sampling_rate = dbs_data->sampling_rate * policy_dbs->rate_mult;
 151         /*
 152          * For the purpose of ondemand, waiting for disk IO is an indication
 153          * that you're performance critical, and not that the system is actually
 154          * idle, so do not add the iowait time to the CPU idle time then.
 155          */
 156         io_busy = dbs_data->io_is_busy;
 157
 158         /* Get Absolute Load */
 159         for_each_cpu(j, policy->cpus) {
 160                 struct cpu_dbs_info *j_cdbs;
 161                 u64 cur_wall_time, cur_idle_time;
 162                 unsigned int idle_time, wall_time;
 163                 unsigned int load;
 164
 165                 j_cdbs = gov->get_cpu_cdbs(j);
 166
 167                 cur_idle_time = get_cpu_idle_time(j, &cur_wall_time, io_busy);
 168
 169                 wall_time = cur_wall_time - j_cdbs->prev_cpu_wall;
 170                 j_cdbs->prev_cpu_wall = cur_wall_time;
 171
 172                 if (cur_idle_time <= j_cdbs->prev_cpu_idle) {
 173                         idle_time = 0;
 174                 } else {
 175                         idle_time = cur_idle_time - j_cdbs->prev_cpu_idle;
 176                         j_cdbs->prev_cpu_idle = cur_idle_time;
 177                 }
 178
 179                 if (ignore_nice) {
 180                         u64 cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
 181
 182                         idle_time += cputime_to_usecs(cur_nice - j_cdbs->prev_cpu_nice);
 183                         j_cdbs->prev_cpu_nice = cur_nice;
 184                 }
 185
 186                 if (unlikely(!wall_time || wall_time < idle_time))
 187                         continue;
 188
 189                 /*
 190                  * If the CPU had gone completely idle, and a task just woke up
 191                  * on this CPU now, it would be unfair to calculate 'load' the
 192                  * usual way for this elapsed time-window, because it will show
 193                  * near-zero load, irrespective of how CPU intensive that task
 194                  * actually is. This is undesirable for latency-sensitive bursty
 195                  * workloads.
 196                  *
 197                  * To avoid this, we reuse the 'load' from the previous
 198                  * time-window and give this task a chance to start with a
 199                  * reasonably high CPU frequency. (However, we shouldn't over-do
 200                  * this copy, lest we get stuck at a high load (high frequency)
 201                  * for too long, even when the current system load has actually
 202                  * dropped down. So we perform the copy only once, upon the
 203                  * first wake-up from idle.)
 204                  *
 205                  * Detecting this situation is easy: the governor's utilization
 206                  * update handler would not have run during CPU-idle periods.
 207                  * Hence, an unusually large 'wall_time' (as compared to the
 208                  * sampling rate) indicates this scenario.
 209                  *
 210                  * prev_load can be zero in two cases and we must recalculate it
 211                  * for both cases:
 212                  * - during long idle intervals
 213                  * - explicitly set to zero
 214                  */
 215                 if (unlikely(wall_time > (2 * sampling_rate) &&
 216                              j_cdbs->prev_load)) {
 217                         load = j_cdbs->prev_load;
 218
 219                         /*
 220                          * Perform a destructive copy, to ensure that we copy
 221                          * the previous load only once, upon the first wake-up
 222                          * from idle.
 223                          */
 224                         j_cdbs->prev_load = 0;
 225                 } else {
 226                         load = 100 * (wall_time - idle_time) / wall_time;
 227                         j_cdbs->prev_load = load;
 228                 }
 229
 230                 if (load > max_load)
 231                         max_load = load;
 232         }
 233         return max_load;
 234 }
 235 EXPORT_SYMBOL_GPL(dbs_update);
 236
 237 void gov_set_update_util(struct policy_dbs_info *policy_dbs,
 238                          unsigned int delay_us)
 239 {
 240         struct cpufreq_policy *policy = policy_dbs->policy;
 241         struct dbs_governor *gov = dbs_governor_of(policy);
 242         int cpu;
 243
 244         gov_update_sample_delay(policy_dbs, delay_us);
 245         policy_dbs->last_sample_time = 0;
 246
 247         for_each_cpu(cpu, policy->cpus) {
 248                 struct cpu_dbs_info *cdbs = gov->get_cpu_cdbs(cpu);
 249
 250                 cpufreq_set_update_util_data(cpu, &cdbs->update_util);
 251         }
 252 }
 253 EXPORT_SYMBOL_GPL(gov_set_update_util);
 254
 255 static inline void gov_clear_update_util(struct cpufreq_policy *policy)
 256 {
 257         int i;
 258
 259         for_each_cpu(i, policy->cpus)
 260                 cpufreq_set_update_util_data(i, NULL);
 261
 262         synchronize_rcu();
 263 }
 264
 265 static void gov_cancel_work(struct cpufreq_policy *policy)
 266 {
 267         struct policy_dbs_info *policy_dbs = policy->governor_data;
 268
 269         gov_clear_update_util(policy_dbs->policy);
 270         irq_work_sync(&policy_dbs->irq_work);
 271         cancel_work_sync(&policy_dbs->work);
 272         atomic_set(&policy_dbs->work_count, 0);
 273         policy_dbs->work_in_progress = false;
 274 }
 275
/*
 * Work item handler queued (via dbs_irq_work()) by the utilization update
 * handler when a new sample is due.
 *
 * It invokes the governor's gov_dbs_timer() callback for the policy and uses
 * the value it returns as the new sample delay, then re-enables the
 * utilization update handler by clearing work_count and work_in_progress.
 */
static void dbs_work_handler(struct work_struct *work)
{
	struct policy_dbs_info *policy_dbs;
	struct cpufreq_policy *policy;
	struct dbs_governor *gov;

	policy_dbs = container_of(work, struct policy_dbs_info, work);
	policy = policy_dbs->policy;
	gov = dbs_governor_of(policy);

	/*
	 * Make sure cpufreq_governor_limits() isn't evaluating load or the
	 * ondemand governor isn't updating the sampling rate in parallel.
	 */
	mutex_lock(&policy_dbs->timer_mutex);
	gov_update_sample_delay(policy_dbs, gov->gov_dbs_timer(policy));
	mutex_unlock(&policy_dbs->timer_mutex);

	/* Allow the utilization update handler to queue up more work. */
	atomic_set(&policy_dbs->work_count, 0);
	/*
	 * If the update below is reordered with respect to the sample delay
	 * modification, the utilization update handler may end up using a stale
	 * sample delay value.
	 *
	 * This write barrier pairs with the smp_rmb() in
	 * dbs_update_util_handler().
	 */
	smp_wmb();
	policy_dbs->work_in_progress = false;
}
 304
 305 static void dbs_irq_work(struct irq_work *irq_work)
 306 {
 307         struct policy_dbs_info *policy_dbs;
 308
 309         policy_dbs = container_of(irq_work, struct policy_dbs_info, irq_work);
 310         schedule_work(&policy_dbs->work);
 311 }
 312
/*
 * Utilization update callback installed in each CPU's update_util data by
 * alloc_policy_dbs_info().
 *
 * @data: the update_util member embedded in this CPU's cpu_dbs_info.
 * @time: current time, compared against last_sample_time and sample_delay_ns.
 * @util: unused here.
 * @max:  unused here.
 *
 * Decides, without taking any lock, whether a new sample is due for the
 * policy this CPU belongs to and, if so, queues the irq_work that will
 * schedule dbs_work_handler().
 */
static void dbs_update_util_handler(struct update_util_data *data, u64 time,
				    unsigned long util, unsigned long max)
{
	struct cpu_dbs_info *cdbs = container_of(data, struct cpu_dbs_info, update_util);
	struct policy_dbs_info *policy_dbs = cdbs->policy_dbs;
	u64 delta_ns;

	/*
	 * The work may not be allowed to be queued up right now.
	 * Possible reasons:
	 * - Work has already been queued up or is in progress.
	 * - It is too early (too little time from the previous sample).
	 */
	if (policy_dbs->work_in_progress)
		return;

	/*
	 * If the reads below are reordered before the check above, the value
	 * of sample_delay_ns used in the computation may be stale.
	 *
	 * This read barrier pairs with the smp_wmb() in dbs_work_handler().
	 */
	smp_rmb();
	delta_ns = time - policy_dbs->last_sample_time;
	/* Signed comparison: a "negative" delta counts as too early. */
	if ((s64)delta_ns < policy_dbs->sample_delay_ns)
		return;

	/*
	 * If the policy is not shared, the irq_work may be queued up right away
	 * at this point.  Otherwise, we need to ensure that only one of the
	 * CPUs sharing the policy will do that.
	 */
	if (policy_dbs->is_shared &&
	    !atomic_add_unless(&policy_dbs->work_count, 1, 1))
		return;

	policy_dbs->last_sample_time = time;
	policy_dbs->work_in_progress = true;
	irq_work_queue(&policy_dbs->irq_work);
}
 351
 352 static struct policy_dbs_info *alloc_policy_dbs_info(struct cpufreq_policy *policy,
 353                                                      struct dbs_governor *gov)
 354 {
 355         struct policy_dbs_info *policy_dbs;
 356         int j;
 357
 358         /* Allocate memory for the common information for policy->cpus */
 359         policy_dbs = kzalloc(sizeof(*policy_dbs), GFP_KERNEL);
 360         if (!policy_dbs)
 361                 return NULL;
 362
 363         policy_dbs->policy = policy;
 364         mutex_init(&policy_dbs->timer_mutex);
 365         atomic_set(&policy_dbs->work_count, 0);
 366         init_irq_work(&policy_dbs->irq_work, dbs_irq_work);
 367         INIT_WORK(&policy_dbs->work, dbs_work_handler);
 368
 369         /* Set policy_dbs for all CPUs, online+offline */
 370         for_each_cpu(j, policy->related_cpus) {
 371                 struct cpu_dbs_info *j_cdbs = gov->get_cpu_cdbs(j);
 372
 373                 j_cdbs->policy_dbs = policy_dbs;
 374                 j_cdbs->update_util.func = dbs_update_util_handler;
 375         }
 376         return policy_dbs;
 377 }
 378
 379 static void free_policy_dbs_info(struct cpufreq_policy *policy,
 380                                  struct dbs_governor *gov)
 381 {
 382         struct cpu_dbs_info *cdbs = gov->get_cpu_cdbs(policy->cpu);
 383         struct policy_dbs_info *policy_dbs = cdbs->policy_dbs;
 384         int j;
 385
 386         mutex_destroy(&policy_dbs->timer_mutex);
 387
 388         for_each_cpu(j, policy->related_cpus) {
 389                 struct cpu_dbs_info *j_cdbs = gov->get_cpu_cdbs(j);
 390
 391                 j_cdbs->policy_dbs = NULL;
 392                 j_cdbs->update_util.func = NULL;
 393         }
 394         kfree(policy_dbs);
 395 }
 396
 397 static int cpufreq_governor_init(struct cpufreq_policy *policy)
 398 {
 399         struct dbs_governor *gov = dbs_governor_of(policy);
 400         struct dbs_data *dbs_data = gov->gdbs_data;
 401         struct policy_dbs_info *policy_dbs;
 402         unsigned int latency;
 403         int ret;
 404
 405         /* State should be equivalent to EXIT */
 406         if (policy->governor_data)
 407                 return -EBUSY;
 408
 409         policy_dbs = alloc_policy_dbs_info(policy, gov);
 410         if (!policy_dbs)
 411                 return -ENOMEM;
 412
 413         if (dbs_data) {
 414                 if (WARN_ON(have_governor_per_policy())) {
 415                         ret = -EINVAL;
 416                         goto free_policy_dbs_info;
 417                 }
 418                 policy_dbs->dbs_data = dbs_data;
 419                 policy->governor_data = policy_dbs;
 420
 421                 mutex_lock(&dbs_data->mutex);
 422                 dbs_data->usage_count++;
 423                 list_add(&policy_dbs->list, &dbs_data->policy_dbs_list);
 424                 mutex_unlock(&dbs_data->mutex);
 425
 426                 return 0;
 427         }
 428
 429         dbs_data = kzalloc(sizeof(*dbs_data), GFP_KERNEL);
 430         if (!dbs_data) {
 431                 ret = -ENOMEM;
 432                 goto free_policy_dbs_info;
 433         }
 434
 435         INIT_LIST_HEAD(&dbs_data->policy_dbs_list);
 436         mutex_init(&dbs_data->mutex);
 437
 438         ret = gov->init(dbs_data, !policy->governor->initialized);
 439         if (ret)
 440                 goto free_policy_dbs_info;
 441
 442         /* policy latency is in ns. Convert it to us first */
 443         latency = policy->cpuinfo.transition_latency / 1000;
 444         if (latency == 0)
 445                 latency = 1;
 446
 447         /* Bring kernel and HW constraints together */
 448         dbs_data->min_sampling_rate = max(dbs_data->min_sampling_rate,
 449                                           MIN_LATENCY_MULTIPLIER * latency);
 450         dbs_data->sampling_rate = max(dbs_data->min_sampling_rate,
 451                                       LATENCY_MULTIPLIER * latency);
 452
 453         if (!have_governor_per_policy())
 454                 gov->gdbs_data = dbs_data;
 455
 456         policy->governor_data = policy_dbs;
 457
 458         policy_dbs->dbs_data = dbs_data;
 459         dbs_data->usage_count = 1;
 460         list_add(&policy_dbs->list, &dbs_data->policy_dbs_list);
 461
 462         gov->kobj_type.sysfs_ops = &governor_sysfs_ops;
 463         ret = kobject_init_and_add(&dbs_data->kobj, &gov->kobj_type,
 464                                    get_governor_parent_kobj(policy),
 465                                    "%s", gov->gov.name);
 466         if (!ret)
 467                 return 0;
 468
 469         /* Failure, so roll back. */
 470         pr_err("cpufreq: Governor initialization failed (dbs_data kobject init error %d)\n", ret);
 471
 472         policy->governor_data = NULL;
 473
 474         if (!have_governor_per_policy())
 475                 gov->gdbs_data = NULL;
 476         gov->exit(dbs_data, !policy->governor->initialized);
 477         kfree(dbs_data);
 478
 479 free_policy_dbs_info:
 480         free_policy_dbs_info(policy, gov);
 481         return ret;
 482 }
 483
 484 static int cpufreq_governor_exit(struct cpufreq_policy *policy)
 485 {
 486         struct dbs_governor *gov = dbs_governor_of(policy);
 487         struct policy_dbs_info *policy_dbs = policy->governor_data;
 488         struct dbs_data *dbs_data = policy_dbs->dbs_data;
 489         int count;
 490
 491         mutex_lock(&dbs_data->mutex);
 492         list_del(&policy_dbs->list);
 493         count = --dbs_data->usage_count;
 494         mutex_unlock(&dbs_data->mutex);
 495
 496         if (!count) {
 497                 kobject_put(&dbs_data->kobj);
 498
 499                 policy->governor_data = NULL;
 500
 501                 if (!have_governor_per_policy())
 502                         gov->gdbs_data = NULL;
 503
 504                 gov->exit(dbs_data, policy->governor->initialized == 1);
 505                 mutex_destroy(&dbs_data->mutex);
 506                 kfree(dbs_data);
 507         } else {
 508                 policy->governor_data = NULL;
 509         }
 510
 511         free_policy_dbs_info(policy, gov);
 512         return 0;
 513 }
 514
 515 static int cpufreq_governor_start(struct cpufreq_policy *policy)
 516 {
 517         struct dbs_governor *gov = dbs_governor_of(policy);
 518         struct policy_dbs_info *policy_dbs = policy->governor_data;
 519         struct dbs_data *dbs_data = policy_dbs->dbs_data;
 520         unsigned int sampling_rate, ignore_nice, j;
 521         unsigned int io_busy;
 522
 523         if (!policy->cur)
 524                 return -EINVAL;
 525
 526         policy_dbs->is_shared = policy_is_shared(policy);
 527         policy_dbs->rate_mult = 1;
 528
 529         sampling_rate = dbs_data->sampling_rate;
 530         ignore_nice = dbs_data->ignore_nice_load;
 531         io_busy = dbs_data->io_is_busy;
 532
 533         for_each_cpu(j, policy->cpus) {
 534                 struct cpu_dbs_info *j_cdbs = gov->get_cpu_cdbs(j);
 535                 unsigned int prev_load;
 536
 537                 j_cdbs->prev_cpu_idle = get_cpu_idle_time(j, &j_cdbs->prev_cpu_wall, io_busy);
 538
 539                 prev_load = j_cdbs->prev_cpu_wall - j_cdbs->prev_cpu_idle;
 540                 j_cdbs->prev_load = 100 * prev_load / (unsigned int)j_cdbs->prev_cpu_wall;
 541
 542                 if (ignore_nice)
 543                         j_cdbs->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
 544         }
 545
 546         gov->start(policy);
 547
 548         gov_set_update_util(policy_dbs, sampling_rate);
 549         return 0;
 550 }
 551
/*
 * cpufreq_governor_stop - handle CPUFREQ_GOV_STOP for @policy.
 *
 * Stops all sampling activity for the policy via gov_cancel_work().
 * Always returns 0.
 */
static int cpufreq_governor_stop(struct cpufreq_policy *policy)
{
	gov_cancel_work(policy);

	return 0;
}
 558
 559 static int cpufreq_governor_limits(struct cpufreq_policy *policy)
 560 {
 561         struct policy_dbs_info *policy_dbs = policy->governor_data;
 562
 563         mutex_lock(&policy_dbs->timer_mutex);
 564
 565         if (policy->max < policy->cur)
 566                 __cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H);
 567         else if (policy->min > policy->cur)
 568                 __cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L);
 569
 570         gov_update_sample_delay(policy_dbs, 0);
 571
 572         mutex_unlock(&policy_dbs->timer_mutex);
 573
 574         return 0;
 575 }
 576
 577 int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int event)
 578 {
 579         int ret = -EINVAL;
 580
 581         /* Lock governor to block concurrent initialization of governor */
 582         mutex_lock(&dbs_data_mutex);
 583
 584         if (event == CPUFREQ_GOV_POLICY_INIT) {
 585                 ret = cpufreq_governor_init(policy);
 586         } else if (policy->governor_data) {
 587                 switch (event) {
 588                 case CPUFREQ_GOV_POLICY_EXIT:
 589                         ret = cpufreq_governor_exit(policy);
 590                         break;
 591                 case CPUFREQ_GOV_START:
 592                         ret = cpufreq_governor_start(policy);
 593                         break;
 594                 case CPUFREQ_GOV_STOP:
 595                         ret = cpufreq_governor_stop(policy);
 596                         break;
 597                 case CPUFREQ_GOV_LIMITS:
 598                         ret = cpufreq_governor_limits(policy);
 599                         break;
 600                 }
 601         }
 602
 603         mutex_unlock(&dbs_data_mutex);
 604         return ret;
 605 }
 606 EXPORT_SYMBOL_GPL(cpufreq_governor_dbs);