/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *		      Nauman Rafique <nauman@google.com>
 */
#include <linux/ioprio.h>
#include <linux/seq_file.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include "blk-cgroup.h"
25 #define MAX_KEY_LEN 100
27 static DEFINE_SPINLOCK(blkio_list_lock
);
28 static LIST_HEAD(blkio_list
);
30 static DEFINE_MUTEX(all_q_mutex
);
31 static LIST_HEAD(all_q_list
);
33 struct blkio_cgroup blkio_root_cgroup
= { .weight
= 2*BLKIO_WEIGHT_DEFAULT
};
34 EXPORT_SYMBOL_GPL(blkio_root_cgroup
);
36 static struct blkio_policy_type
*blkio_policy
[BLKIO_NR_POLICIES
];
/*
 * Forward declarations of the cgroup_subsys callbacks wired into
 * blkio_subsys below.
 *
 * NOTE(review): the second parameter line of blkiocg_create() was missing
 * from the damaged listing and is restored to match its siblings; confirm
 * against upstream.
 */
static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
						  struct cgroup *);
static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
			      struct cgroup_taskset *);
static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
			   struct cgroup_taskset *);
static int blkiocg_pre_destroy(struct cgroup_subsys *, struct cgroup *);
static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
/*
 * cft->private packs two 16-bit fields: the owning policy id in the upper
 * half and the per-policy file attribute id in the lower half.
 */

/* for encoding cft->private value on file */
#define BLKIOFILE_PRIVATE(x, val)	(((x) << 16) | (val))
/* What policy owns the file, proportional or throttle */
#define BLKIOFILE_POLICY(val)		(((val) >> 16) & 0xffff)
/* Which attribute of that policy the file represents */
#define BLKIOFILE_ATTR(val)		((val) & 0xffff)
54 struct cgroup_subsys blkio_subsys
= {
56 .create
= blkiocg_create
,
57 .can_attach
= blkiocg_can_attach
,
58 .attach
= blkiocg_attach
,
59 .pre_destroy
= blkiocg_pre_destroy
,
60 .destroy
= blkiocg_destroy
,
61 .populate
= blkiocg_populate
,
62 .subsys_id
= blkio_subsys_id
,
63 .module
= THIS_MODULE
,
65 EXPORT_SYMBOL_GPL(blkio_subsys
);
67 struct blkio_cgroup
*cgroup_to_blkio_cgroup(struct cgroup
*cgroup
)
69 return container_of(cgroup_subsys_state(cgroup
, blkio_subsys_id
),
70 struct blkio_cgroup
, css
);
72 EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup
);
74 struct blkio_cgroup
*task_blkio_cgroup(struct task_struct
*tsk
)
76 return container_of(task_subsys_state(tsk
, blkio_subsys_id
),
77 struct blkio_cgroup
, css
);
79 EXPORT_SYMBOL_GPL(task_blkio_cgroup
);
82 blkio_update_group_weight(struct blkio_group
*blkg
, unsigned int weight
)
84 struct blkio_policy_type
*blkiop
;
86 list_for_each_entry(blkiop
, &blkio_list
, list
) {
87 /* If this policy does not own the blkg, do not send updates */
88 if (blkiop
->plid
!= blkg
->plid
)
90 if (blkiop
->ops
.blkio_update_group_weight_fn
)
91 blkiop
->ops
.blkio_update_group_weight_fn(blkg
->q
,
96 static inline void blkio_update_group_bps(struct blkio_group
*blkg
, u64 bps
,
99 struct blkio_policy_type
*blkiop
;
101 list_for_each_entry(blkiop
, &blkio_list
, list
) {
103 /* If this policy does not own the blkg, do not send updates */
104 if (blkiop
->plid
!= blkg
->plid
)
107 if (fileid
== BLKIO_THROTL_read_bps_device
108 && blkiop
->ops
.blkio_update_group_read_bps_fn
)
109 blkiop
->ops
.blkio_update_group_read_bps_fn(blkg
->q
,
112 if (fileid
== BLKIO_THROTL_write_bps_device
113 && blkiop
->ops
.blkio_update_group_write_bps_fn
)
114 blkiop
->ops
.blkio_update_group_write_bps_fn(blkg
->q
,
119 static inline void blkio_update_group_iops(struct blkio_group
*blkg
,
120 unsigned int iops
, int fileid
)
122 struct blkio_policy_type
*blkiop
;
124 list_for_each_entry(blkiop
, &blkio_list
, list
) {
126 /* If this policy does not own the blkg, do not send updates */
127 if (blkiop
->plid
!= blkg
->plid
)
130 if (fileid
== BLKIO_THROTL_read_iops_device
131 && blkiop
->ops
.blkio_update_group_read_iops_fn
)
132 blkiop
->ops
.blkio_update_group_read_iops_fn(blkg
->q
,
135 if (fileid
== BLKIO_THROTL_write_iops_device
136 && blkiop
->ops
.blkio_update_group_write_iops_fn
)
137 blkiop
->ops
.blkio_update_group_write_iops_fn(blkg
->q
,
143 * Add to the appropriate stat variable depending on the request type.
144 * This should be called with the blkg->stats_lock held.
146 static void blkio_add_stat(uint64_t *stat
, uint64_t add
, bool direction
,
150 stat
[BLKIO_STAT_WRITE
] += add
;
152 stat
[BLKIO_STAT_READ
] += add
;
154 stat
[BLKIO_STAT_SYNC
] += add
;
156 stat
[BLKIO_STAT_ASYNC
] += add
;
160 * Decrements the appropriate stat variable if non-zero depending on the
161 * request type. Panics on value being zero.
162 * This should be called with the blkg->stats_lock held.
164 static void blkio_check_and_dec_stat(uint64_t *stat
, bool direction
, bool sync
)
167 BUG_ON(stat
[BLKIO_STAT_WRITE
] == 0);
168 stat
[BLKIO_STAT_WRITE
]--;
170 BUG_ON(stat
[BLKIO_STAT_READ
] == 0);
171 stat
[BLKIO_STAT_READ
]--;
174 BUG_ON(stat
[BLKIO_STAT_SYNC
] == 0);
175 stat
[BLKIO_STAT_SYNC
]--;
177 BUG_ON(stat
[BLKIO_STAT_ASYNC
] == 0);
178 stat
[BLKIO_STAT_ASYNC
]--;
182 #ifdef CONFIG_DEBUG_BLK_CGROUP
183 /* This should be called with the blkg->stats_lock held. */
184 static void blkio_set_start_group_wait_time(struct blkio_group
*blkg
,
185 struct blkio_group
*curr_blkg
)
187 if (blkio_blkg_waiting(&blkg
->stats
))
189 if (blkg
== curr_blkg
)
191 blkg
->stats
.start_group_wait_time
= sched_clock();
192 blkio_mark_blkg_waiting(&blkg
->stats
);
195 /* This should be called with the blkg->stats_lock held. */
196 static void blkio_update_group_wait_time(struct blkio_group_stats
*stats
)
198 unsigned long long now
;
200 if (!blkio_blkg_waiting(stats
))
204 if (time_after64(now
, stats
->start_group_wait_time
))
205 stats
->group_wait_time
+= now
- stats
->start_group_wait_time
;
206 blkio_clear_blkg_waiting(stats
);
209 /* This should be called with the blkg->stats_lock held. */
210 static void blkio_end_empty_time(struct blkio_group_stats
*stats
)
212 unsigned long long now
;
214 if (!blkio_blkg_empty(stats
))
218 if (time_after64(now
, stats
->start_empty_time
))
219 stats
->empty_time
+= now
- stats
->start_empty_time
;
220 blkio_clear_blkg_empty(stats
);
223 void blkiocg_update_set_idle_time_stats(struct blkio_group
*blkg
)
227 spin_lock_irqsave(&blkg
->stats_lock
, flags
);
228 BUG_ON(blkio_blkg_idling(&blkg
->stats
));
229 blkg
->stats
.start_idle_time
= sched_clock();
230 blkio_mark_blkg_idling(&blkg
->stats
);
231 spin_unlock_irqrestore(&blkg
->stats_lock
, flags
);
233 EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats
);
235 void blkiocg_update_idle_time_stats(struct blkio_group
*blkg
)
238 unsigned long long now
;
239 struct blkio_group_stats
*stats
;
241 spin_lock_irqsave(&blkg
->stats_lock
, flags
);
242 stats
= &blkg
->stats
;
243 if (blkio_blkg_idling(stats
)) {
245 if (time_after64(now
, stats
->start_idle_time
))
246 stats
->idle_time
+= now
- stats
->start_idle_time
;
247 blkio_clear_blkg_idling(stats
);
249 spin_unlock_irqrestore(&blkg
->stats_lock
, flags
);
251 EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats
);
253 void blkiocg_update_avg_queue_size_stats(struct blkio_group
*blkg
)
256 struct blkio_group_stats
*stats
;
258 spin_lock_irqsave(&blkg
->stats_lock
, flags
);
259 stats
= &blkg
->stats
;
260 stats
->avg_queue_size_sum
+=
261 stats
->stat_arr
[BLKIO_STAT_QUEUED
][BLKIO_STAT_READ
] +
262 stats
->stat_arr
[BLKIO_STAT_QUEUED
][BLKIO_STAT_WRITE
];
263 stats
->avg_queue_size_samples
++;
264 blkio_update_group_wait_time(stats
);
265 spin_unlock_irqrestore(&blkg
->stats_lock
, flags
);
267 EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats
);
269 void blkiocg_set_start_empty_time(struct blkio_group
*blkg
)
272 struct blkio_group_stats
*stats
;
274 spin_lock_irqsave(&blkg
->stats_lock
, flags
);
275 stats
= &blkg
->stats
;
277 if (stats
->stat_arr
[BLKIO_STAT_QUEUED
][BLKIO_STAT_READ
] ||
278 stats
->stat_arr
[BLKIO_STAT_QUEUED
][BLKIO_STAT_WRITE
]) {
279 spin_unlock_irqrestore(&blkg
->stats_lock
, flags
);
284 * group is already marked empty. This can happen if cfqq got new
285 * request in parent group and moved to this group while being added
286 * to service tree. Just ignore the event and move on.
288 if(blkio_blkg_empty(stats
)) {
289 spin_unlock_irqrestore(&blkg
->stats_lock
, flags
);
293 stats
->start_empty_time
= sched_clock();
294 blkio_mark_blkg_empty(stats
);
295 spin_unlock_irqrestore(&blkg
->stats_lock
, flags
);
297 EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time
);
299 void blkiocg_update_dequeue_stats(struct blkio_group
*blkg
,
300 unsigned long dequeue
)
302 blkg
->stats
.dequeue
+= dequeue
;
304 EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats
);
306 static inline void blkio_set_start_group_wait_time(struct blkio_group
*blkg
,
307 struct blkio_group
*curr_blkg
) {}
308 static inline void blkio_end_empty_time(struct blkio_group_stats
*stats
) {}
311 void blkiocg_update_io_add_stats(struct blkio_group
*blkg
,
312 struct blkio_group
*curr_blkg
, bool direction
,
317 spin_lock_irqsave(&blkg
->stats_lock
, flags
);
318 blkio_add_stat(blkg
->stats
.stat_arr
[BLKIO_STAT_QUEUED
], 1, direction
,
320 blkio_end_empty_time(&blkg
->stats
);
321 blkio_set_start_group_wait_time(blkg
, curr_blkg
);
322 spin_unlock_irqrestore(&blkg
->stats_lock
, flags
);
324 EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats
);
326 void blkiocg_update_io_remove_stats(struct blkio_group
*blkg
,
327 bool direction
, bool sync
)
331 spin_lock_irqsave(&blkg
->stats_lock
, flags
);
332 blkio_check_and_dec_stat(blkg
->stats
.stat_arr
[BLKIO_STAT_QUEUED
],
334 spin_unlock_irqrestore(&blkg
->stats_lock
, flags
);
336 EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats
);
338 void blkiocg_update_timeslice_used(struct blkio_group
*blkg
, unsigned long time
,
339 unsigned long unaccounted_time
)
343 spin_lock_irqsave(&blkg
->stats_lock
, flags
);
344 blkg
->stats
.time
+= time
;
345 #ifdef CONFIG_DEBUG_BLK_CGROUP
346 blkg
->stats
.unaccounted_time
+= unaccounted_time
;
348 spin_unlock_irqrestore(&blkg
->stats_lock
, flags
);
350 EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used
);
353 * should be called under rcu read lock or queue lock to make sure blkg pointer
356 void blkiocg_update_dispatch_stats(struct blkio_group
*blkg
,
357 uint64_t bytes
, bool direction
, bool sync
)
359 struct blkio_group_stats_cpu
*stats_cpu
;
363 * Disabling interrupts to provide mutual exclusion between two
364 * writes on same cpu. It probably is not needed for 64bit. Not
365 * optimizing that case yet.
367 local_irq_save(flags
);
369 stats_cpu
= this_cpu_ptr(blkg
->stats_cpu
);
371 u64_stats_update_begin(&stats_cpu
->syncp
);
372 stats_cpu
->sectors
+= bytes
>> 9;
373 blkio_add_stat(stats_cpu
->stat_arr_cpu
[BLKIO_STAT_CPU_SERVICED
],
375 blkio_add_stat(stats_cpu
->stat_arr_cpu
[BLKIO_STAT_CPU_SERVICE_BYTES
],
376 bytes
, direction
, sync
);
377 u64_stats_update_end(&stats_cpu
->syncp
);
378 local_irq_restore(flags
);
380 EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats
);
382 void blkiocg_update_completion_stats(struct blkio_group
*blkg
,
383 uint64_t start_time
, uint64_t io_start_time
, bool direction
, bool sync
)
385 struct blkio_group_stats
*stats
;
387 unsigned long long now
= sched_clock();
389 spin_lock_irqsave(&blkg
->stats_lock
, flags
);
390 stats
= &blkg
->stats
;
391 if (time_after64(now
, io_start_time
))
392 blkio_add_stat(stats
->stat_arr
[BLKIO_STAT_SERVICE_TIME
],
393 now
- io_start_time
, direction
, sync
);
394 if (time_after64(io_start_time
, start_time
))
395 blkio_add_stat(stats
->stat_arr
[BLKIO_STAT_WAIT_TIME
],
396 io_start_time
- start_time
, direction
, sync
);
397 spin_unlock_irqrestore(&blkg
->stats_lock
, flags
);
399 EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats
);
401 /* Merged stats are per cpu. */
402 void blkiocg_update_io_merged_stats(struct blkio_group
*blkg
, bool direction
,
405 struct blkio_group_stats_cpu
*stats_cpu
;
409 * Disabling interrupts to provide mutual exclusion between two
410 * writes on same cpu. It probably is not needed for 64bit. Not
411 * optimizing that case yet.
413 local_irq_save(flags
);
415 stats_cpu
= this_cpu_ptr(blkg
->stats_cpu
);
417 u64_stats_update_begin(&stats_cpu
->syncp
);
418 blkio_add_stat(stats_cpu
->stat_arr_cpu
[BLKIO_STAT_CPU_MERGED
], 1,
420 u64_stats_update_end(&stats_cpu
->syncp
);
421 local_irq_restore(flags
);
423 EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats
);
425 struct blkio_group
*blkg_lookup_create(struct blkio_cgroup
*blkcg
,
426 struct request_queue
*q
,
427 enum blkio_policy_id plid
,
429 __releases(q
->queue_lock
) __acquires(q
->queue_lock
)
431 struct blkio_policy_type
*pol
= blkio_policy
[plid
];
432 struct blkio_group
*blkg
, *new_blkg
;
434 WARN_ON_ONCE(!rcu_read_lock_held());
435 lockdep_assert_held(q
->queue_lock
);
438 * This could be the first entry point of blkcg implementation and
439 * we shouldn't allow anything to go through for a bypassing queue.
440 * The following can be removed if blkg lookup is guaranteed to
441 * fail on a bypassing queue.
443 if (unlikely(blk_queue_bypass(q
)) && !for_root
)
444 return ERR_PTR(blk_queue_dead(q
) ? -EINVAL
: -EBUSY
);
446 blkg
= blkg_lookup(blkcg
, q
, plid
);
450 /* blkg holds a reference to blkcg */
451 if (!css_tryget(&blkcg
->css
))
452 return ERR_PTR(-EINVAL
);
455 * Allocate and initialize.
457 * FIXME: The following is broken. Percpu memory allocation
458 * requires %GFP_KERNEL context and can't be performed from IO
459 * path. Allocation here should inherently be atomic and the
460 * following lock dancing can be removed once the broken percpu
461 * allocation is fixed.
463 spin_unlock_irq(q
->queue_lock
);
466 new_blkg
= pol
->ops
.blkio_alloc_group_fn(q
, blkcg
);
468 new_blkg
->stats_cpu
= alloc_percpu(struct blkio_group_stats_cpu
);
470 spin_lock_init(&new_blkg
->stats_lock
);
471 rcu_assign_pointer(new_blkg
->q
, q
);
472 new_blkg
->blkcg
= blkcg
;
473 new_blkg
->plid
= plid
;
474 cgroup_path(blkcg
->css
.cgroup
, new_blkg
->path
,
475 sizeof(new_blkg
->path
));
477 css_put(&blkcg
->css
);
481 spin_lock_irq(q
->queue_lock
);
483 /* did bypass get turned on inbetween? */
484 if (unlikely(blk_queue_bypass(q
)) && !for_root
) {
485 blkg
= ERR_PTR(blk_queue_dead(q
) ? -EINVAL
: -EBUSY
);
489 /* did someone beat us to it? */
490 blkg
= blkg_lookup(blkcg
, q
, plid
);
494 /* did alloc fail? */
495 if (unlikely(!new_blkg
|| !new_blkg
->stats_cpu
)) {
496 blkg
= ERR_PTR(-ENOMEM
);
501 spin_lock(&blkcg
->lock
);
502 swap(blkg
, new_blkg
);
503 hlist_add_head_rcu(&blkg
->blkcg_node
, &blkcg
->blkg_list
);
504 pol
->ops
.blkio_link_group_fn(q
, blkg
);
505 spin_unlock(&blkcg
->lock
);
508 free_percpu(new_blkg
->stats_cpu
);
510 css_put(&blkcg
->css
);
514 EXPORT_SYMBOL_GPL(blkg_lookup_create
);
516 static void __blkiocg_del_blkio_group(struct blkio_group
*blkg
)
518 hlist_del_init_rcu(&blkg
->blkcg_node
);
522 * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1
523 * indicating that blk_group was unhashed by the time we got to it.
525 int blkiocg_del_blkio_group(struct blkio_group
*blkg
)
527 struct blkio_cgroup
*blkcg
= blkg
->blkcg
;
531 spin_lock_irqsave(&blkcg
->lock
, flags
);
532 if (!hlist_unhashed(&blkg
->blkcg_node
)) {
533 __blkiocg_del_blkio_group(blkg
);
536 spin_unlock_irqrestore(&blkcg
->lock
, flags
);
540 EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group
);
542 /* called under rcu_read_lock(). */
543 struct blkio_group
*blkg_lookup(struct blkio_cgroup
*blkcg
,
544 struct request_queue
*q
,
545 enum blkio_policy_id plid
)
547 struct blkio_group
*blkg
;
548 struct hlist_node
*n
;
550 hlist_for_each_entry_rcu(blkg
, n
, &blkcg
->blkg_list
, blkcg_node
)
551 if (blkg
->q
== q
&& blkg
->plid
== plid
)
555 EXPORT_SYMBOL_GPL(blkg_lookup
);
557 void blkg_destroy_all(struct request_queue
*q
)
559 struct blkio_policy_type
*pol
;
564 spin_lock(&blkio_list_lock
);
565 spin_lock_irq(q
->queue_lock
);
568 * clear_queue_fn() might return with non-empty group list
569 * if it raced cgroup removal and lost. cgroup removal is
570 * guaranteed to make forward progress and retrying after a
571 * while is enough. This ugliness is scheduled to be
572 * removed after locking update.
574 list_for_each_entry(pol
, &blkio_list
, list
)
575 if (!pol
->ops
.blkio_clear_queue_fn(q
))
578 spin_unlock_irq(q
->queue_lock
);
579 spin_unlock(&blkio_list_lock
);
584 msleep(10); /* just some random duration I like */
588 static void blkio_reset_stats_cpu(struct blkio_group
*blkg
)
590 struct blkio_group_stats_cpu
*stats_cpu
;
593 * Note: On 64 bit arch this should not be an issue. This has the
594 * possibility of returning some inconsistent value on 32bit arch
595 * as 64bit update on 32bit is non atomic. Taking care of this
596 * corner case makes code very complicated, like sending IPIs to
597 * cpus, taking care of stats of offline cpus etc.
599 * reset stats is anyway more of a debug feature and this sounds a
600 * corner case. So I am not complicating the code yet until and
601 * unless this becomes a real issue.
603 for_each_possible_cpu(i
) {
604 stats_cpu
= per_cpu_ptr(blkg
->stats_cpu
, i
);
605 stats_cpu
->sectors
= 0;
606 for(j
= 0; j
< BLKIO_STAT_CPU_NR
; j
++)
607 for (k
= 0; k
< BLKIO_STAT_TOTAL
; k
++)
608 stats_cpu
->stat_arr_cpu
[j
][k
] = 0;
613 blkiocg_reset_stats(struct cgroup
*cgroup
, struct cftype
*cftype
, u64 val
)
615 struct blkio_cgroup
*blkcg
;
616 struct blkio_group
*blkg
;
617 struct blkio_group_stats
*stats
;
618 struct hlist_node
*n
;
619 uint64_t queued
[BLKIO_STAT_TOTAL
];
621 #ifdef CONFIG_DEBUG_BLK_CGROUP
622 bool idling
, waiting
, empty
;
623 unsigned long long now
= sched_clock();
626 blkcg
= cgroup_to_blkio_cgroup(cgroup
);
627 spin_lock_irq(&blkcg
->lock
);
628 hlist_for_each_entry(blkg
, n
, &blkcg
->blkg_list
, blkcg_node
) {
629 spin_lock(&blkg
->stats_lock
);
630 stats
= &blkg
->stats
;
631 #ifdef CONFIG_DEBUG_BLK_CGROUP
632 idling
= blkio_blkg_idling(stats
);
633 waiting
= blkio_blkg_waiting(stats
);
634 empty
= blkio_blkg_empty(stats
);
636 for (i
= 0; i
< BLKIO_STAT_TOTAL
; i
++)
637 queued
[i
] = stats
->stat_arr
[BLKIO_STAT_QUEUED
][i
];
638 memset(stats
, 0, sizeof(struct blkio_group_stats
));
639 for (i
= 0; i
< BLKIO_STAT_TOTAL
; i
++)
640 stats
->stat_arr
[BLKIO_STAT_QUEUED
][i
] = queued
[i
];
641 #ifdef CONFIG_DEBUG_BLK_CGROUP
643 blkio_mark_blkg_idling(stats
);
644 stats
->start_idle_time
= now
;
647 blkio_mark_blkg_waiting(stats
);
648 stats
->start_group_wait_time
= now
;
651 blkio_mark_blkg_empty(stats
);
652 stats
->start_empty_time
= now
;
655 spin_unlock(&blkg
->stats_lock
);
657 /* Reset Per cpu stats which don't take blkg->stats_lock */
658 blkio_reset_stats_cpu(blkg
);
661 spin_unlock_irq(&blkcg
->lock
);
665 static void blkio_get_key_name(enum stat_sub_type type
, const char *dname
,
666 char *str
, int chars_left
, bool diskname_only
)
668 snprintf(str
, chars_left
, "%s", dname
);
669 chars_left
-= strlen(str
);
670 if (chars_left
<= 0) {
672 "Possibly incorrect cgroup stat display format");
678 case BLKIO_STAT_READ
:
679 strlcat(str
, " Read", chars_left
);
681 case BLKIO_STAT_WRITE
:
682 strlcat(str
, " Write", chars_left
);
684 case BLKIO_STAT_SYNC
:
685 strlcat(str
, " Sync", chars_left
);
687 case BLKIO_STAT_ASYNC
:
688 strlcat(str
, " Async", chars_left
);
690 case BLKIO_STAT_TOTAL
:
691 strlcat(str
, " Total", chars_left
);
694 strlcat(str
, " Invalid", chars_left
);
698 static uint64_t blkio_fill_stat(char *str
, int chars_left
, uint64_t val
,
699 struct cgroup_map_cb
*cb
, const char *dname
)
701 blkio_get_key_name(0, dname
, str
, chars_left
, true);
702 cb
->fill(cb
, str
, val
);
707 static uint64_t blkio_read_stat_cpu(struct blkio_group
*blkg
,
708 enum stat_type_cpu type
, enum stat_sub_type sub_type
)
711 struct blkio_group_stats_cpu
*stats_cpu
;
714 for_each_possible_cpu(cpu
) {
716 stats_cpu
= per_cpu_ptr(blkg
->stats_cpu
, cpu
);
719 start
= u64_stats_fetch_begin(&stats_cpu
->syncp
);
720 if (type
== BLKIO_STAT_CPU_SECTORS
)
721 tval
= stats_cpu
->sectors
;
723 tval
= stats_cpu
->stat_arr_cpu
[type
][sub_type
];
724 } while(u64_stats_fetch_retry(&stats_cpu
->syncp
, start
));
732 static uint64_t blkio_get_stat_cpu(struct blkio_group
*blkg
,
733 struct cgroup_map_cb
*cb
, const char *dname
,
734 enum stat_type_cpu type
)
736 uint64_t disk_total
, val
;
737 char key_str
[MAX_KEY_LEN
];
738 enum stat_sub_type sub_type
;
740 if (type
== BLKIO_STAT_CPU_SECTORS
) {
741 val
= blkio_read_stat_cpu(blkg
, type
, 0);
742 return blkio_fill_stat(key_str
, MAX_KEY_LEN
- 1, val
, cb
,
746 for (sub_type
= BLKIO_STAT_READ
; sub_type
< BLKIO_STAT_TOTAL
;
748 blkio_get_key_name(sub_type
, dname
, key_str
, MAX_KEY_LEN
,
750 val
= blkio_read_stat_cpu(blkg
, type
, sub_type
);
751 cb
->fill(cb
, key_str
, val
);
754 disk_total
= blkio_read_stat_cpu(blkg
, type
, BLKIO_STAT_READ
) +
755 blkio_read_stat_cpu(blkg
, type
, BLKIO_STAT_WRITE
);
757 blkio_get_key_name(BLKIO_STAT_TOTAL
, dname
, key_str
, MAX_KEY_LEN
,
759 cb
->fill(cb
, key_str
, disk_total
);
763 /* This should be called with blkg->stats_lock held */
764 static uint64_t blkio_get_stat(struct blkio_group
*blkg
,
765 struct cgroup_map_cb
*cb
, const char *dname
,
769 char key_str
[MAX_KEY_LEN
];
770 enum stat_sub_type sub_type
;
772 if (type
== BLKIO_STAT_TIME
)
773 return blkio_fill_stat(key_str
, MAX_KEY_LEN
- 1,
774 blkg
->stats
.time
, cb
, dname
);
775 #ifdef CONFIG_DEBUG_BLK_CGROUP
776 if (type
== BLKIO_STAT_UNACCOUNTED_TIME
)
777 return blkio_fill_stat(key_str
, MAX_KEY_LEN
- 1,
778 blkg
->stats
.unaccounted_time
, cb
, dname
);
779 if (type
== BLKIO_STAT_AVG_QUEUE_SIZE
) {
780 uint64_t sum
= blkg
->stats
.avg_queue_size_sum
;
781 uint64_t samples
= blkg
->stats
.avg_queue_size_samples
;
783 do_div(sum
, samples
);
786 return blkio_fill_stat(key_str
, MAX_KEY_LEN
- 1,
789 if (type
== BLKIO_STAT_GROUP_WAIT_TIME
)
790 return blkio_fill_stat(key_str
, MAX_KEY_LEN
- 1,
791 blkg
->stats
.group_wait_time
, cb
, dname
);
792 if (type
== BLKIO_STAT_IDLE_TIME
)
793 return blkio_fill_stat(key_str
, MAX_KEY_LEN
- 1,
794 blkg
->stats
.idle_time
, cb
, dname
);
795 if (type
== BLKIO_STAT_EMPTY_TIME
)
796 return blkio_fill_stat(key_str
, MAX_KEY_LEN
- 1,
797 blkg
->stats
.empty_time
, cb
, dname
);
798 if (type
== BLKIO_STAT_DEQUEUE
)
799 return blkio_fill_stat(key_str
, MAX_KEY_LEN
- 1,
800 blkg
->stats
.dequeue
, cb
, dname
);
803 for (sub_type
= BLKIO_STAT_READ
; sub_type
< BLKIO_STAT_TOTAL
;
805 blkio_get_key_name(sub_type
, dname
, key_str
, MAX_KEY_LEN
,
807 cb
->fill(cb
, key_str
, blkg
->stats
.stat_arr
[type
][sub_type
]);
809 disk_total
= blkg
->stats
.stat_arr
[type
][BLKIO_STAT_READ
] +
810 blkg
->stats
.stat_arr
[type
][BLKIO_STAT_WRITE
];
811 blkio_get_key_name(BLKIO_STAT_TOTAL
, dname
, key_str
, MAX_KEY_LEN
,
813 cb
->fill(cb
, key_str
, disk_total
);
817 static int blkio_policy_parse_and_set(char *buf
, enum blkio_policy_id plid
,
818 int fileid
, struct blkio_cgroup
*blkcg
)
820 struct gendisk
*disk
= NULL
;
821 struct blkio_group
*blkg
= NULL
;
822 char *s
[4], *p
, *major_s
= NULL
, *minor_s
= NULL
;
823 unsigned long major
, minor
;
824 int i
= 0, ret
= -EINVAL
;
829 memset(s
, 0, sizeof(s
));
831 while ((p
= strsep(&buf
, " ")) != NULL
) {
837 /* Prevent from inputing too many things */
845 p
= strsep(&s
[0], ":");
855 if (strict_strtoul(major_s
, 10, &major
))
858 if (strict_strtoul(minor_s
, 10, &minor
))
861 dev
= MKDEV(major
, minor
);
863 if (strict_strtoull(s
[1], 10, &temp
))
866 disk
= get_gendisk(dev
, &part
);
872 spin_lock_irq(disk
->queue
->queue_lock
);
873 blkg
= blkg_lookup_create(blkcg
, disk
->queue
, plid
, false);
874 spin_unlock_irq(disk
->queue
->queue_lock
);
882 case BLKIO_POLICY_PROP
:
883 if ((temp
< BLKIO_WEIGHT_MIN
&& temp
> 0) ||
884 temp
> BLKIO_WEIGHT_MAX
)
887 blkg
->conf
.weight
= temp
;
888 blkio_update_group_weight(blkg
, temp
?: blkcg
->weight
);
890 case BLKIO_POLICY_THROTL
:
892 case BLKIO_THROTL_read_bps_device
:
893 blkg
->conf
.bps
[READ
] = temp
;
894 blkio_update_group_bps(blkg
, temp
?: -1, fileid
);
896 case BLKIO_THROTL_write_bps_device
:
897 blkg
->conf
.bps
[WRITE
] = temp
;
898 blkio_update_group_bps(blkg
, temp
?: -1, fileid
);
900 case BLKIO_THROTL_read_iops_device
:
901 if (temp
> THROTL_IOPS_MAX
)
903 blkg
->conf
.iops
[READ
] = temp
;
904 blkio_update_group_iops(blkg
, temp
?: -1, fileid
);
906 case BLKIO_THROTL_write_iops_device
:
907 if (temp
> THROTL_IOPS_MAX
)
909 blkg
->conf
.iops
[WRITE
] = temp
;
910 blkio_update_group_iops(blkg
, temp
?: -1, fileid
);
924 * If queue was bypassing, we should retry. Do so after a short
925 * msleep(). It isn't strictly necessary but queue can be
926 * bypassing for some time and it's always nice to avoid busy
931 return restart_syscall();
936 static int blkiocg_file_write(struct cgroup
*cgrp
, struct cftype
*cft
,
941 struct blkio_cgroup
*blkcg
= cgroup_to_blkio_cgroup(cgrp
);
942 enum blkio_policy_id plid
= BLKIOFILE_POLICY(cft
->private);
943 int fileid
= BLKIOFILE_ATTR(cft
->private);
945 buf
= kstrdup(buffer
, GFP_KERNEL
);
949 ret
= blkio_policy_parse_and_set(buf
, plid
, fileid
, blkcg
);
954 static const char *blkg_dev_name(struct blkio_group
*blkg
)
956 /* some drivers (floppy) instantiate a queue w/o disk registered */
957 if (blkg
->q
->backing_dev_info
.dev
)
958 return dev_name(blkg
->q
->backing_dev_info
.dev
);
962 static void blkio_print_group_conf(struct cftype
*cft
, struct blkio_group
*blkg
,
965 const char *dname
= blkg_dev_name(blkg
);
966 int fileid
= BLKIOFILE_ATTR(cft
->private);
972 switch (blkg
->plid
) {
973 case BLKIO_POLICY_PROP
:
974 if (blkg
->conf
.weight
)
975 seq_printf(m
, "%s\t%u\n",
976 dname
, blkg
->conf
.weight
);
978 case BLKIO_POLICY_THROTL
:
980 case BLKIO_THROTL_read_bps_device
:
982 case BLKIO_THROTL_write_bps_device
:
983 if (blkg
->conf
.bps
[rw
])
984 seq_printf(m
, "%s\t%llu\n",
985 dname
, blkg
->conf
.bps
[rw
]);
987 case BLKIO_THROTL_read_iops_device
:
989 case BLKIO_THROTL_write_iops_device
:
990 if (blkg
->conf
.iops
[rw
])
991 seq_printf(m
, "%s\t%u\n",
992 dname
, blkg
->conf
.iops
[rw
]);
1001 /* cgroup files which read their data from policy nodes end up here */
1002 static void blkio_read_conf(struct cftype
*cft
, struct blkio_cgroup
*blkcg
,
1005 struct blkio_group
*blkg
;
1006 struct hlist_node
*n
;
1008 spin_lock_irq(&blkcg
->lock
);
1009 hlist_for_each_entry(blkg
, n
, &blkcg
->blkg_list
, blkcg_node
)
1010 if (BLKIOFILE_POLICY(cft
->private) == blkg
->plid
)
1011 blkio_print_group_conf(cft
, blkg
, m
);
1012 spin_unlock_irq(&blkcg
->lock
);
1015 static int blkiocg_file_read(struct cgroup
*cgrp
, struct cftype
*cft
,
1018 struct blkio_cgroup
*blkcg
;
1019 enum blkio_policy_id plid
= BLKIOFILE_POLICY(cft
->private);
1020 int name
= BLKIOFILE_ATTR(cft
->private);
1022 blkcg
= cgroup_to_blkio_cgroup(cgrp
);
1025 case BLKIO_POLICY_PROP
:
1027 case BLKIO_PROP_weight_device
:
1028 blkio_read_conf(cft
, blkcg
, m
);
1034 case BLKIO_POLICY_THROTL
:
1036 case BLKIO_THROTL_read_bps_device
:
1037 case BLKIO_THROTL_write_bps_device
:
1038 case BLKIO_THROTL_read_iops_device
:
1039 case BLKIO_THROTL_write_iops_device
:
1040 blkio_read_conf(cft
, blkcg
, m
);
1053 static int blkio_read_blkg_stats(struct blkio_cgroup
*blkcg
,
1054 struct cftype
*cft
, struct cgroup_map_cb
*cb
,
1055 enum stat_type type
, bool show_total
, bool pcpu
)
1057 struct blkio_group
*blkg
;
1058 struct hlist_node
*n
;
1059 uint64_t cgroup_total
= 0;
1062 hlist_for_each_entry_rcu(blkg
, n
, &blkcg
->blkg_list
, blkcg_node
) {
1063 const char *dname
= blkg_dev_name(blkg
);
1065 if (!dname
|| BLKIOFILE_POLICY(cft
->private) != blkg
->plid
)
1068 cgroup_total
+= blkio_get_stat_cpu(blkg
, cb
, dname
,
1071 spin_lock_irq(&blkg
->stats_lock
);
1072 cgroup_total
+= blkio_get_stat(blkg
, cb
, dname
, type
);
1073 spin_unlock_irq(&blkg
->stats_lock
);
1077 cb
->fill(cb
, "Total", cgroup_total
);
1082 /* All map kind of cgroup file get serviced by this function */
1083 static int blkiocg_file_read_map(struct cgroup
*cgrp
, struct cftype
*cft
,
1084 struct cgroup_map_cb
*cb
)
1086 struct blkio_cgroup
*blkcg
;
1087 enum blkio_policy_id plid
= BLKIOFILE_POLICY(cft
->private);
1088 int name
= BLKIOFILE_ATTR(cft
->private);
1090 blkcg
= cgroup_to_blkio_cgroup(cgrp
);
1093 case BLKIO_POLICY_PROP
:
1095 case BLKIO_PROP_time
:
1096 return blkio_read_blkg_stats(blkcg
, cft
, cb
,
1097 BLKIO_STAT_TIME
, 0, 0);
1098 case BLKIO_PROP_sectors
:
1099 return blkio_read_blkg_stats(blkcg
, cft
, cb
,
1100 BLKIO_STAT_CPU_SECTORS
, 0, 1);
1101 case BLKIO_PROP_io_service_bytes
:
1102 return blkio_read_blkg_stats(blkcg
, cft
, cb
,
1103 BLKIO_STAT_CPU_SERVICE_BYTES
, 1, 1);
1104 case BLKIO_PROP_io_serviced
:
1105 return blkio_read_blkg_stats(blkcg
, cft
, cb
,
1106 BLKIO_STAT_CPU_SERVICED
, 1, 1);
1107 case BLKIO_PROP_io_service_time
:
1108 return blkio_read_blkg_stats(blkcg
, cft
, cb
,
1109 BLKIO_STAT_SERVICE_TIME
, 1, 0);
1110 case BLKIO_PROP_io_wait_time
:
1111 return blkio_read_blkg_stats(blkcg
, cft
, cb
,
1112 BLKIO_STAT_WAIT_TIME
, 1, 0);
1113 case BLKIO_PROP_io_merged
:
1114 return blkio_read_blkg_stats(blkcg
, cft
, cb
,
1115 BLKIO_STAT_CPU_MERGED
, 1, 1);
1116 case BLKIO_PROP_io_queued
:
1117 return blkio_read_blkg_stats(blkcg
, cft
, cb
,
1118 BLKIO_STAT_QUEUED
, 1, 0);
1119 #ifdef CONFIG_DEBUG_BLK_CGROUP
1120 case BLKIO_PROP_unaccounted_time
:
1121 return blkio_read_blkg_stats(blkcg
, cft
, cb
,
1122 BLKIO_STAT_UNACCOUNTED_TIME
, 0, 0);
1123 case BLKIO_PROP_dequeue
:
1124 return blkio_read_blkg_stats(blkcg
, cft
, cb
,
1125 BLKIO_STAT_DEQUEUE
, 0, 0);
1126 case BLKIO_PROP_avg_queue_size
:
1127 return blkio_read_blkg_stats(blkcg
, cft
, cb
,
1128 BLKIO_STAT_AVG_QUEUE_SIZE
, 0, 0);
1129 case BLKIO_PROP_group_wait_time
:
1130 return blkio_read_blkg_stats(blkcg
, cft
, cb
,
1131 BLKIO_STAT_GROUP_WAIT_TIME
, 0, 0);
1132 case BLKIO_PROP_idle_time
:
1133 return blkio_read_blkg_stats(blkcg
, cft
, cb
,
1134 BLKIO_STAT_IDLE_TIME
, 0, 0);
1135 case BLKIO_PROP_empty_time
:
1136 return blkio_read_blkg_stats(blkcg
, cft
, cb
,
1137 BLKIO_STAT_EMPTY_TIME
, 0, 0);
1143 case BLKIO_POLICY_THROTL
:
1145 case BLKIO_THROTL_io_service_bytes
:
1146 return blkio_read_blkg_stats(blkcg
, cft
, cb
,
1147 BLKIO_STAT_CPU_SERVICE_BYTES
, 1, 1);
1148 case BLKIO_THROTL_io_serviced
:
1149 return blkio_read_blkg_stats(blkcg
, cft
, cb
,
1150 BLKIO_STAT_CPU_SERVICED
, 1, 1);
1162 static int blkio_weight_write(struct blkio_cgroup
*blkcg
, int plid
, u64 val
)
1164 struct blkio_group
*blkg
;
1165 struct hlist_node
*n
;
1167 if (val
< BLKIO_WEIGHT_MIN
|| val
> BLKIO_WEIGHT_MAX
)
1170 spin_lock(&blkio_list_lock
);
1171 spin_lock_irq(&blkcg
->lock
);
1172 blkcg
->weight
= (unsigned int)val
;
1174 hlist_for_each_entry(blkg
, n
, &blkcg
->blkg_list
, blkcg_node
)
1175 if (blkg
->plid
== plid
&& !blkg
->conf
.weight
)
1176 blkio_update_group_weight(blkg
, blkcg
->weight
);
1178 spin_unlock_irq(&blkcg
->lock
);
1179 spin_unlock(&blkio_list_lock
);
1183 static u64
blkiocg_file_read_u64 (struct cgroup
*cgrp
, struct cftype
*cft
) {
1184 struct blkio_cgroup
*blkcg
;
1185 enum blkio_policy_id plid
= BLKIOFILE_POLICY(cft
->private);
1186 int name
= BLKIOFILE_ATTR(cft
->private);
1188 blkcg
= cgroup_to_blkio_cgroup(cgrp
);
1191 case BLKIO_POLICY_PROP
:
1193 case BLKIO_PROP_weight
:
1194 return (u64
)blkcg
->weight
;
1204 blkiocg_file_write_u64(struct cgroup
*cgrp
, struct cftype
*cft
, u64 val
)
1206 struct blkio_cgroup
*blkcg
;
1207 enum blkio_policy_id plid
= BLKIOFILE_POLICY(cft
->private);
1208 int name
= BLKIOFILE_ATTR(cft
->private);
1210 blkcg
= cgroup_to_blkio_cgroup(cgrp
);
1213 case BLKIO_POLICY_PROP
:
1215 case BLKIO_PROP_weight
:
1216 return blkio_weight_write(blkcg
, plid
, val
);
/*
 * blkio_files - control files registered for every blkio cgroup
 *
 * Each entry packs the owning policy and an attribute id into .private via
 * BLKIOFILE_PRIVATE() (policy in the upper 16 bits, attribute in the lower
 * 16 bits); the shared handlers unpack them again with BLKIOFILE_POLICY()
 * and BLKIOFILE_ATTR().
 *
 * Handler usage visible below:
 *  - blkiocg_file_read / blkiocg_file_write: string based per-device
 *    settings, writes limited to 256 bytes through .max_write_len
 *  - blkiocg_file_read_u64 / blkiocg_file_write_u64: single u64 value
 *  - blkiocg_file_read_map: read-only key/value statistics
 *  - blkiocg_reset_stats: write-only reset_stats trigger
 *
 * NOTE(review): the braces delimiting the entries and several .name and
 * attribute lines are not visible in this excerpt.
 */
1226 struct cftype blkio_files
[] = {
/* proportional policy: per-device weight, string read and write */
1228 .name
= "weight_device",
1229 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP
,
1230 BLKIO_PROP_weight_device
),
1231 .read_seq_string
= blkiocg_file_read
,
1232 .write_string
= blkiocg_file_write
,
1233 .max_write_len
= 256,
/*
 * Entry served by the u64 handlers.  NOTE(review): its name and attribute
 * id are not visible here; presumably the cgroup-wide weight file, since
 * BLKIO_PROP_weight is the only attribute those handlers show, confirm.
 */
1237 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP
,
1239 .read_u64
= blkiocg_file_read_u64
,
1240 .write_u64
= blkiocg_file_write_u64
,
/* NOTE(review): name and attribute id of this map entry are not visible */
1244 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP
,
1246 .read_map
= blkiocg_file_read_map
,
/* BLKIO_PROP_sectors statistics; the .name line is not visible here */
1250 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP
,
1251 BLKIO_PROP_sectors
),
1252 .read_map
= blkiocg_file_read_map
,
/* proportional policy statistics, all read through the map handler */
1255 .name
= "io_service_bytes",
1256 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP
,
1257 BLKIO_PROP_io_service_bytes
),
1258 .read_map
= blkiocg_file_read_map
,
1261 .name
= "io_serviced",
1262 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP
,
1263 BLKIO_PROP_io_serviced
),
1264 .read_map
= blkiocg_file_read_map
,
1267 .name
= "io_service_time",
1268 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP
,
1269 BLKIO_PROP_io_service_time
),
1270 .read_map
= blkiocg_file_read_map
,
1273 .name
= "io_wait_time",
1274 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP
,
1275 BLKIO_PROP_io_wait_time
),
1276 .read_map
= blkiocg_file_read_map
,
1279 .name
= "io_merged",
1280 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP
,
1281 BLKIO_PROP_io_merged
),
1282 .read_map
= blkiocg_file_read_map
,
1285 .name
= "io_queued",
1286 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP
,
1287 BLKIO_PROP_io_queued
),
1288 .read_map
= blkiocg_file_read_map
,
/* write-only trigger; no .private value is set for this entry */
1291 .name
= "reset_stats",
1292 .write_u64
= blkiocg_reset_stats
,
/* throttling policy files, present only with CONFIG_BLK_DEV_THROTTLING */
1294 #ifdef CONFIG_BLK_DEV_THROTTLING
/* per-device limits: string read and write, writes capped at 256 bytes */
1296 .name
= "throttle.read_bps_device",
1297 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL
,
1298 BLKIO_THROTL_read_bps_device
),
1299 .read_seq_string
= blkiocg_file_read
,
1300 .write_string
= blkiocg_file_write
,
1301 .max_write_len
= 256,
1305 .name
= "throttle.write_bps_device",
1306 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL
,
1307 BLKIO_THROTL_write_bps_device
),
1308 .read_seq_string
= blkiocg_file_read
,
1309 .write_string
= blkiocg_file_write
,
1310 .max_write_len
= 256,
1314 .name
= "throttle.read_iops_device",
1315 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL
,
1316 BLKIO_THROTL_read_iops_device
),
1317 .read_seq_string
= blkiocg_file_read
,
1318 .write_string
= blkiocg_file_write
,
1319 .max_write_len
= 256,
1323 .name
= "throttle.write_iops_device",
1324 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL
,
1325 BLKIO_THROTL_write_iops_device
),
1326 .read_seq_string
= blkiocg_file_read
,
1327 .write_string
= blkiocg_file_write
,
1328 .max_write_len
= 256,
/* throttling policy statistics, read-only maps */
1331 .name
= "throttle.io_service_bytes",
1332 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL
,
1333 BLKIO_THROTL_io_service_bytes
),
1334 .read_map
= blkiocg_file_read_map
,
1337 .name
= "throttle.io_serviced",
1338 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL
,
1339 BLKIO_THROTL_io_serviced
),
1340 .read_map
= blkiocg_file_read_map
,
1342 #endif /* CONFIG_BLK_DEV_THROTTLING */
/*
 * Statistics guarded by CONFIG_DEBUG_BLK_CGROUP start here.
 * NOTE(review): the matching #endif is not visible in this excerpt, so the
 * exact extent of the guarded entries cannot be told from here.
 */
1344 #ifdef CONFIG_DEBUG_BLK_CGROUP
1346 .name
= "avg_queue_size",
1347 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP
,
1348 BLKIO_PROP_avg_queue_size
),
1349 .read_map
= blkiocg_file_read_map
,
1352 .name
= "group_wait_time",
1353 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP
,
1354 BLKIO_PROP_group_wait_time
),
1355 .read_map
= blkiocg_file_read_map
,
1358 .name
= "idle_time",
1359 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP
,
1360 BLKIO_PROP_idle_time
),
1361 .read_map
= blkiocg_file_read_map
,
1364 .name
= "empty_time",
1365 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP
,
1366 BLKIO_PROP_empty_time
),
1367 .read_map
= blkiocg_file_read_map
,
/* BLKIO_PROP_dequeue statistics; the .name line is not visible here */
1371 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP
,
1372 BLKIO_PROP_dequeue
),
1373 .read_map
= blkiocg_file_read_map
,
1376 .name
= "unaccounted_time",
1377 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP
,
1378 BLKIO_PROP_unaccounted_time
),
1379 .read_map
= blkiocg_file_read_map
,
/*
 * blkiocg_populate - add the blkio control files to a cgroup
 *
 * Registers every entry of the blkio_files table with @cgroup through
 * cgroup_add_files() and returns that call's result unchanged.
 */
1384 static int blkiocg_populate(struct cgroup_subsys
*subsys
, struct cgroup
*cgroup
)
1386 return cgroup_add_files(cgroup
, subsys
, blkio_files
,
/* entry count is derived from the table itself */
1387 ARRAY_SIZE(blkio_files
));
/*
 * blkiocg_pre_destroy - unlink the blkio_groups of a cgroup being removed
 *
 * Visible sequence: under blkcg->lock the first group on blkcg->blkg_list
 * is picked, its request queue pointer is read and the group is removed
 * with __blkiocg_del_blkio_group().  After the lock is dropped, the policy
 * whose plid matches the group is notified through its
 * blkio_unlink_group_fn() callback while blkio_list_lock is held.
 *
 * NOTE(review): the loop construct around this sequence, the statement
 * that ends it when the list is empty, the RCU read-side section implied
 * by rcu_dereference() and the return value are not visible in this
 * excerpt.
 */
1390 static int blkiocg_pre_destroy(struct cgroup_subsys
*subsys
,
1391 struct cgroup
*cgroup
)
1393 struct blkio_cgroup
*blkcg
= cgroup_to_blkio_cgroup(cgroup
);
1394 unsigned long flags
;
1395 struct blkio_group
*blkg
;
1396 struct request_queue
*q
;
1397 struct blkio_policy_type
*blkiop
;
1402 spin_lock_irqsave(&blkcg
->lock
, flags
);
/* nothing left to unlink: release the lock before leaving */
1404 if (hlist_empty(&blkcg
->blkg_list
)) {
1405 spin_unlock_irqrestore(&blkcg
->lock
, flags
);
/* take the first group still linked to this cgroup */
1409 blkg
= hlist_entry(blkcg
->blkg_list
.first
, struct blkio_group
,
/* remember the queue before the group is taken off the list */
1411 q
= rcu_dereference(blkg
->q
);
1412 __blkiocg_del_blkio_group(blkg
);
1414 spin_unlock_irqrestore(&blkcg
->lock
, flags
);
1417 * This blkio_group is being unlinked as associated cgroup is
1418 * going away. Let all the IO controlling policies know about
1421 spin_lock(&blkio_list_lock
);
1422 list_for_each_entry(blkiop
, &blkio_list
, list
) {
/* the check below singles out the policy that owns this group */
1423 if (blkiop
->plid
!= blkg
->plid
)
1425 blkiop
->ops
.blkio_unlink_group_fn(q
, blkg
);
1427 spin_unlock(&blkio_list_lock
);
/*
 * blkiocg_destroy - release the blkio state attached to @cgroup
 *
 * The blkio_cgroup is looked up from @cgroup and compared against the
 * statically defined blkio_root_cgroup, so the guarded action applies to
 * every blkcg except the root one.
 *
 * NOTE(review): the guarded statement is not visible in this excerpt;
 * presumably it frees the blkcg allocated in blkiocg_create(), confirm.
 */
1435 static void blkiocg_destroy(struct cgroup_subsys
*subsys
, struct cgroup
*cgroup
)
1437 struct blkio_cgroup
*blkcg
= cgroup_to_blkio_cgroup(cgroup
);
1439 if (blkcg
!= &blkio_root_cgroup
)
/*
 * blkiocg_create - set up the blkio state for a new cgroup
 *
 * Two sources for the blkio_cgroup are visible: the static
 * blkio_root_cgroup, or a zeroed kzalloc() allocation that gets
 * BLKIO_WEIGHT_DEFAULT as its weight.  The lock and the blkg list head are
 * then initialised.  ERR_PTR(-ENOMEM) is one of the return values.
 *
 * NOTE(review): the conditions selecting between the two sources, the
 * check guarding the -ENOMEM return and the success return value are not
 * visible in this excerpt; presumably the root object is used when
 * cgroup->parent is NULL, confirm.
 */
1443 static struct cgroup_subsys_state
*
1444 blkiocg_create(struct cgroup_subsys
*subsys
, struct cgroup
*cgroup
)
1446 struct blkio_cgroup
*blkcg
;
1447 struct cgroup
*parent
= cgroup
->parent
;
/* statically allocated root state, no allocation needed */
1450 blkcg
= &blkio_root_cgroup
;
/* zeroed allocation; GFP_KERNEL is used here */
1454 blkcg
= kzalloc(sizeof(*blkcg
), GFP_KERNEL
);
1456 return ERR_PTR(-ENOMEM
);
1458 blkcg
->weight
= BLKIO_WEIGHT_DEFAULT
;
/* common initialisation of the lock and the list of groups */
1460 spin_lock_init(&blkcg
->lock
);
1461 INIT_HLIST_HEAD(&blkcg
->blkg_list
);
1467 * blkcg_init_queue - initialize blkcg part of request queue
1468 * @q: request_queue to initialize
1470 * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
1471 * part of new request_queue @q.
1474 * 0 on success, -errno on failure.
/*
 * Initialises throttling for @q through blk_throtl_init() and then links
 * @q at the tail of all_q_list under all_q_mutex.
 *
 * NOTE(review): the declaration of ret, the handling of a non-zero ret and
 * the final return are not visible in this excerpt.
 */
1476 int blkcg_init_queue(struct request_queue
*q
)
1482 ret
= blk_throtl_init(q
);
/* all_q_list is only modified with all_q_mutex held */
1486 mutex_lock(&all_q_mutex
);
1487 INIT_LIST_HEAD(&q
->all_q_node
);
1488 list_add_tail(&q
->all_q_node
, &all_q_list
);
1489 mutex_unlock(&all_q_mutex
);
1495 * blkcg_drain_queue - drain blkcg part of request_queue
1496 * @q: request_queue to drain
1498 * Called from blk_drain_queue(). Responsible for draining blkcg part.
/*
 * The caller must hold q->queue_lock, which is asserted through lockdep.
 * The drain itself is delegated to blk_throtl_drain().
 */
1500 void blkcg_drain_queue(struct request_queue
*q
)
1502 lockdep_assert_held(q
->queue_lock
);
1504 blk_throtl_drain(q
);
1508 * blkcg_exit_queue - exit and release blkcg part of request_queue
1509 * @q: request_queue being released
1511 * Called from blk_release_queue(). Responsible for exiting blkcg part.
/*
 * Unlinks @q from all_q_list under all_q_mutex, undoing the list insertion
 * done in blkcg_init_queue().  list_del_init() is used for the removal, so
 * the node is reinitialised rather than left dangling.
 *
 * NOTE(review): any teardown following the unlock is not visible in this
 * excerpt.
 */
1513 void blkcg_exit_queue(struct request_queue
*q
)
1515 mutex_lock(&all_q_mutex
);
1516 list_del_init(&q
->all_q_node
);
1517 mutex_unlock(&all_q_mutex
);
1523 * We cannot support shared io contexts, as we have no means to support
1524 * two tasks with the same ioc in two different groups without major rework
1525 * of the main cic data structures. For now we allow a task to change
1526 * its cgroup only if it's the only owner of its ioc.
/*
 * blkiocg_can_attach - check whether the tasks in @tset may move to @cgrp
 *
 * Every task in the set is inspected: an io_context whose nr_tasks count
 * is greater than one is shared between tasks, which is the situation the
 * check below looks for.
 *
 * NOTE(review): the task_lock()/task_unlock() calls mentioned in the inline
 * comment, the statement executed when the check matches and the return
 * value are not visible in this excerpt.
 */
1528 static int blkiocg_can_attach(struct cgroup_subsys
*ss
, struct cgroup
*cgrp
,
1529 struct cgroup_taskset
*tset
)
1531 struct task_struct
*task
;
1532 struct io_context
*ioc
;
1535 /* task_lock() is needed to avoid races with exit_io_context() */
1536 cgroup_taskset_for_each(task
, cgrp
, tset
) {
1538 ioc
= task
->io_context
;
/* a task without an io_context never matches */
1539 if (ioc
&& atomic_read(&ioc
->nr_tasks
) > 1)
/*
 * blkiocg_attach - propagate a cgroup change to the moved tasks
 *
 * For each task in @tset the io_context is obtained with
 * get_task_io_context() using GFP_ATOMIC, ioc_cgroup_changed() is called on
 * it and the context is handed to put_io_context().
 *
 * NOTE(review): the line between the lookup and ioc_cgroup_changed() is
 * not visible in this excerpt; presumably a NULL check on ioc, given the
 * inline remark about allocation failure, confirm.
 */
1548 static void blkiocg_attach(struct cgroup_subsys
*ss
, struct cgroup
*cgrp
,
1549 struct cgroup_taskset
*tset
)
1551 struct task_struct
*task
;
1552 struct io_context
*ioc
;
1554 cgroup_taskset_for_each(task
, cgrp
, tset
) {
1555 /* we don't lose anything even if ioc allocation fails */
1556 ioc
= get_task_io_context(task
, GFP_ATOMIC
, NUMA_NO_NODE
);
1558 ioc_cgroup_changed(ioc
);
1559 put_io_context(ioc
);
/*
 * blkcg_bypass_start - put every registered queue into bypass mode
 *
 * Takes all_q_mutex and returns with it still held, as stated by the
 * __acquires() annotation; blkcg_bypass_end() is the function that
 * releases it.  For every queue on all_q_list, bypass is started and all
 * of its blkgs are destroyed with blkg_destroy_all().
 */
1564 static void blkcg_bypass_start(void)
1565 __acquires(&all_q_mutex
)
1567 struct request_queue
*q
;
/* no unlock in this function: the mutex stays held for the caller */
1569 mutex_lock(&all_q_mutex
);
1571 list_for_each_entry(q
, &all_q_list
, all_q_node
) {
1572 blk_queue_bypass_start(q
);
1573 blkg_destroy_all(q
);
/*
 * blkcg_bypass_end - leave bypass mode on every registered queue
 *
 * Counterpart of blkcg_bypass_start(): expects all_q_mutex to be held on
 * entry (see the __releases() annotation), ends bypass on every queue on
 * all_q_list and then drops the mutex.
 */
1577 static void blkcg_bypass_end(void)
1578 __releases(&all_q_mutex
)
1580 struct request_queue
*q
;
1582 list_for_each_entry(q
, &all_q_list
, all_q_node
)
1583 blk_queue_bypass_end(q
);
/* releases the mutex taken in blkcg_bypass_start() */
1585 mutex_unlock(&all_q_mutex
);
/*
 * blkio_policy_register - register an IO control policy
 *
 * All queues are first put into bypass mode with blkcg_bypass_start(),
 * which also leaves all_q_mutex held.  Under blkio_list_lock the policy is
 * stored in its blkio_policy[] slot, indexed by blkiop->plid, and appended
 * to blkio_list.  Registering a second policy for an occupied slot trips
 * the BUG_ON().
 *
 * NOTE(review): the matching blkcg_bypass_end() call is not visible in
 * this excerpt; confirm that it follows the unlock.
 */
1588 void blkio_policy_register(struct blkio_policy_type
*blkiop
)
1590 blkcg_bypass_start();
1591 spin_lock(&blkio_list_lock
);
/* the slot for this policy id must still be empty */
1593 BUG_ON(blkio_policy
[blkiop
->plid
]);
1594 blkio_policy
[blkiop
->plid
] = blkiop
;
1595 list_add_tail(&blkiop
->list
, &blkio_list
);
1597 spin_unlock(&blkio_list_lock
);
1600 EXPORT_SYMBOL_GPL(blkio_policy_register
);
/*
 * blkio_policy_unregister - remove a previously registered policy
 *
 * Mirrors blkio_policy_register(): queues are put into bypass mode, then
 * under blkio_list_lock the blkio_policy[] slot for blkiop->plid is
 * cleared and the policy is taken off blkio_list.  The BUG_ON() fires when
 * the slot does not hold @blkiop.
 *
 * NOTE(review): the matching blkcg_bypass_end() call is not visible in
 * this excerpt; confirm that it follows the unlock.
 */
1602 void blkio_policy_unregister(struct blkio_policy_type
*blkiop
)
1604 blkcg_bypass_start();
1605 spin_lock(&blkio_list_lock
);
/* only the policy currently occupying the slot may be removed */
1607 BUG_ON(blkio_policy
[blkiop
->plid
] != blkiop
);
1608 blkio_policy
[blkiop
->plid
] = NULL
;
1609 list_del_init(&blkiop
->list
);
1611 spin_unlock(&blkio_list_lock
);
1614 EXPORT_SYMBOL_GPL(blkio_policy_unregister
);