/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *		      Nauman Rafique <nauman@google.com>
 */
#include <linux/ioprio.h>
#include <linux/seq_file.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include "blk-cgroup.h"

#define MAX_KEY_LEN 100
static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);

static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);

/* List of groups pending per cpu stats allocation */
static DEFINE_SPINLOCK(alloc_list_lock);
static LIST_HEAD(alloc_list);

static void blkio_stat_alloc_fn(struct work_struct *);
static DECLARE_DELAYED_WORK(blkio_stat_alloc_work, blkio_stat_alloc_fn);
struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

static struct blkio_policy_type *blkio_policy[BLKIO_NR_POLICIES];
static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
						  struct cgroup *);
static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
			      struct cgroup_taskset *);
static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
			   struct cgroup_taskset *);
static int blkiocg_pre_destroy(struct cgroup_subsys *, struct cgroup *);
static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
/* for encoding cft->private value on file */
#define BLKIOFILE_PRIVATE(x, val)	(((x) << 16) | (val))
/* What policy owns the file, proportional or throttle */
#define BLKIOFILE_POLICY(val)		(((val) >> 16) & 0xffff)
#define BLKIOFILE_ATTR(val)		((val) & 0xffff)
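
/*
 * Illustrative sketch (not part of the original file): how a cftype's
 * ->private value round-trips through the helpers above.  The enum values
 * used here (BLKIO_POLICY_THROTL, BLKIO_THROTL_read_bps_device) are assumed
 * to be declared in blk-cgroup.h.
 */
#if 0
static void blkiofile_private_example(void)
{
	int priv = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				     BLKIO_THROTL_read_bps_device);

	/* the upper 16 bits carry the owning policy ... */
	BUG_ON(BLKIOFILE_POLICY(priv) != BLKIO_POLICY_THROTL);
	/* ... and the lower 16 bits carry the per-policy file attribute */
	BUG_ON(BLKIOFILE_ATTR(priv) != BLKIO_THROTL_read_bps_device);
}
#endif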
struct cgroup_subsys blkio_subsys = {
	.create = blkiocg_create,
	.can_attach = blkiocg_can_attach,
	.attach = blkiocg_attach,
	.pre_destroy = blkiocg_pre_destroy,
	.destroy = blkiocg_destroy,
	.populate = blkiocg_populate,
	.subsys_id = blkio_subsys_id,
	.module = THIS_MODULE,
};
EXPORT_SYMBOL_GPL(blkio_subsys);
struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
			    struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
static struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
{
	return container_of(task_subsys_state(tsk, blkio_subsys_id),
			    struct blkio_cgroup, css);
}
struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio)
{
	if (bio && bio->bi_css)
		return container_of(bio->bi_css, struct blkio_cgroup, css);
	return task_blkio_cgroup(current);
}
EXPORT_SYMBOL_GPL(bio_blkio_cgroup);
static inline void blkio_update_group_weight(struct blkio_group *blkg,
					     int plid, unsigned int weight)
{
	struct blkio_policy_type *blkiop;

	list_for_each_entry(blkiop, &blkio_list, list) {
		/* If this policy does not own the blkg, do not send updates */
		if (blkiop->plid != plid)
			continue;
		if (blkiop->ops.blkio_update_group_weight_fn)
			blkiop->ops.blkio_update_group_weight_fn(blkg->q,
								 blkg, weight);
	}
}
static inline void blkio_update_group_bps(struct blkio_group *blkg, int plid,
					  u64 bps, int fileid)
{
	struct blkio_policy_type *blkiop;

	list_for_each_entry(blkiop, &blkio_list, list) {

		/* If this policy does not own the blkg, do not send updates */
		if (blkiop->plid != plid)
			continue;

		if (fileid == BLKIO_THROTL_read_bps_device
		    && blkiop->ops.blkio_update_group_read_bps_fn)
			blkiop->ops.blkio_update_group_read_bps_fn(blkg->q,
								   blkg, bps);

		if (fileid == BLKIO_THROTL_write_bps_device
		    && blkiop->ops.blkio_update_group_write_bps_fn)
			blkiop->ops.blkio_update_group_write_bps_fn(blkg->q,
								    blkg, bps);
	}
}
static inline void blkio_update_group_iops(struct blkio_group *blkg,
					   int plid, unsigned int iops,
					   int fileid)
{
	struct blkio_policy_type *blkiop;

	list_for_each_entry(blkiop, &blkio_list, list) {

		/* If this policy does not own the blkg, do not send updates */
		if (blkiop->plid != plid)
			continue;

		if (fileid == BLKIO_THROTL_read_iops_device
		    && blkiop->ops.blkio_update_group_read_iops_fn)
			blkiop->ops.blkio_update_group_read_iops_fn(blkg->q,
								    blkg, iops);

		if (fileid == BLKIO_THROTL_write_iops_device
		    && blkiop->ops.blkio_update_group_write_iops_fn)
			blkiop->ops.blkio_update_group_write_iops_fn(blkg->q,
								     blkg, iops);
	}
}
/*
 * Add to the appropriate stat variable depending on the request type.
 * This should be called with the blkg->stats_lock held.
 */
static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
			   bool sync)
{
	if (direction)
		stat[BLKIO_STAT_WRITE] += add;
	else
		stat[BLKIO_STAT_READ] += add;
	if (sync)
		stat[BLKIO_STAT_SYNC] += add;
	else
		stat[BLKIO_STAT_ASYNC] += add;
}
/*
 * Decrements the appropriate stat variable if non-zero depending on the
 * request type. Panics on value being zero.
 * This should be called with the blkg->stats_lock held.
 */
static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
{
	if (direction) {
		BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
		stat[BLKIO_STAT_WRITE]--;
	} else {
		BUG_ON(stat[BLKIO_STAT_READ] == 0);
		stat[BLKIO_STAT_READ]--;
	}
	if (sync) {
		BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
		stat[BLKIO_STAT_SYNC]--;
	} else {
		BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
		stat[BLKIO_STAT_ASYNC]--;
	}
}
#ifdef CONFIG_DEBUG_BLK_CGROUP
/* This should be called with the blkg->stats_lock held. */
static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
					    struct blkio_policy_type *pol,
					    struct blkio_group *curr_blkg)
{
	struct blkg_policy_data *pd = blkg->pd[pol->plid];

	if (blkio_blkg_waiting(&pd->stats))
		return;
	if (blkg == curr_blkg)
		return;
	pd->stats.start_group_wait_time = sched_clock();
	blkio_mark_blkg_waiting(&pd->stats);
}
/* This should be called with the blkg->stats_lock held. */
static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
{
	unsigned long long now;

	if (!blkio_blkg_waiting(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_group_wait_time))
		stats->group_wait_time += now - stats->start_group_wait_time;
	blkio_clear_blkg_waiting(stats);
}
/* This should be called with the blkg->stats_lock held. */
static void blkio_end_empty_time(struct blkio_group_stats *stats)
{
	unsigned long long now;

	if (!blkio_blkg_empty(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_empty_time))
		stats->empty_time += now - stats->start_empty_time;
	blkio_clear_blkg_empty(stats);
}
void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg,
					struct blkio_policy_type *pol)
{
	struct blkg_policy_data *pd = blkg->pd[pol->plid];
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	BUG_ON(blkio_blkg_idling(&pd->stats));
	pd->stats.start_idle_time = sched_clock();
	blkio_mark_blkg_idling(&pd->stats);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);
void blkiocg_update_idle_time_stats(struct blkio_group *blkg,
				    struct blkio_policy_type *pol)
{
	struct blkg_policy_data *pd = blkg->pd[pol->plid];
	unsigned long flags;
	unsigned long long now;
	struct blkio_group_stats *stats;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &pd->stats;
	if (blkio_blkg_idling(stats)) {
		now = sched_clock();
		if (time_after64(now, stats->start_idle_time))
			stats->idle_time += now - stats->start_idle_time;
		blkio_clear_blkg_idling(stats);
	}
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);
void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg,
					 struct blkio_policy_type *pol)
{
	struct blkg_policy_data *pd = blkg->pd[pol->plid];
	unsigned long flags;
	struct blkio_group_stats *stats;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &pd->stats;
	stats->avg_queue_size_sum +=
			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
	stats->avg_queue_size_samples++;
	blkio_update_group_wait_time(stats);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
void blkiocg_set_start_empty_time(struct blkio_group *blkg,
				  struct blkio_policy_type *pol)
{
	struct blkg_policy_data *pd = blkg->pd[pol->plid];
	unsigned long flags;
	struct blkio_group_stats *stats;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &pd->stats;

	if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
	    stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
		spin_unlock_irqrestore(&blkg->stats_lock, flags);
		return;
	}

	/*
	 * group is already marked empty. This can happen if cfqq got new
	 * request in parent group and moved to this group while being added
	 * to service tree. Just ignore the event and move on.
	 */
	if (blkio_blkg_empty(stats)) {
		spin_unlock_irqrestore(&blkg->stats_lock, flags);
		return;
	}

	stats->start_empty_time = sched_clock();
	blkio_mark_blkg_empty(stats);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);
void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
				  struct blkio_policy_type *pol,
				  unsigned long dequeue)
{
	struct blkg_policy_data *pd = blkg->pd[pol->plid];

	pd->stats.dequeue += dequeue;
}
EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
#else
static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
					struct blkio_policy_type *pol,
					struct blkio_group *curr_blkg) { }
static inline void blkio_end_empty_time(struct blkio_group_stats *stats) { }
#endif
void blkiocg_update_io_add_stats(struct blkio_group *blkg,
				 struct blkio_policy_type *pol,
				 struct blkio_group *curr_blkg, bool direction,
				 bool sync)
{
	struct blkg_policy_data *pd = blkg->pd[pol->plid];
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkio_add_stat(pd->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
			sync);
	blkio_end_empty_time(&pd->stats);
	blkio_set_start_group_wait_time(blkg, pol, curr_blkg);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);
void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
				    struct blkio_policy_type *pol,
				    bool direction, bool sync)
{
	struct blkg_policy_data *pd = blkg->pd[pol->plid];
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkio_check_and_dec_stat(pd->stats.stat_arr[BLKIO_STAT_QUEUED],
					direction, sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
void blkiocg_update_timeslice_used(struct blkio_group *blkg,
				   struct blkio_policy_type *pol,
				   unsigned long time,
				   unsigned long unaccounted_time)
{
	struct blkg_policy_data *pd = blkg->pd[pol->plid];
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	pd->stats.time += time;
#ifdef CONFIG_DEBUG_BLK_CGROUP
	pd->stats.unaccounted_time += unaccounted_time;
#endif
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
/*
 * should be called under rcu read lock or queue lock to make sure blkg pointer
 * is valid.
 */
void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
				   struct blkio_policy_type *pol,
				   uint64_t bytes, bool direction, bool sync)
{
	struct blkg_policy_data *pd = blkg->pd[pol->plid];
	struct blkio_group_stats_cpu *stats_cpu;
	unsigned long flags;

	/* If per cpu stats are not allocated yet, don't do any accounting. */
	if (pd->stats_cpu == NULL)
		return;

	/*
	 * Disabling interrupts to provide mutual exclusion between two
	 * writes on same cpu. It probably is not needed for 64bit. Not
	 * optimizing that case yet.
	 */
	local_irq_save(flags);

	stats_cpu = this_cpu_ptr(pd->stats_cpu);

	u64_stats_update_begin(&stats_cpu->syncp);
	stats_cpu->sectors += bytes >> 9;
	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED],
			1, direction, sync);
	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES],
			bytes, direction, sync);
	u64_stats_update_end(&stats_cpu->syncp);
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
void blkiocg_update_completion_stats(struct blkio_group *blkg,
				     struct blkio_policy_type *pol,
				     uint64_t start_time,
				     uint64_t io_start_time, bool direction,
				     bool sync)
{
	struct blkg_policy_data *pd = blkg->pd[pol->plid];
	struct blkio_group_stats *stats;
	unsigned long flags;
	unsigned long long now = sched_clock();

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &pd->stats;
	if (time_after64(now, io_start_time))
		blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
				now - io_start_time, direction, sync);
	if (time_after64(io_start_time, start_time))
		blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
				io_start_time - start_time, direction, sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
/* Merged stats are per cpu. */
void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
				    struct blkio_policy_type *pol,
				    bool direction, bool sync)
{
	struct blkg_policy_data *pd = blkg->pd[pol->plid];
	struct blkio_group_stats *stats;
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &pd->stats;
	blkio_add_stat(stats->stat_arr[BLKIO_STAT_MERGED], 1, direction, sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
/*
 * Worker for allocating per cpu stat for blk groups. This is scheduled on
 * the system_nrt_wq once there are some groups on the alloc_list waiting
 * for allocation.
 */
static void blkio_stat_alloc_fn(struct work_struct *work)
{
	static void *pcpu_stats[BLKIO_NR_POLICIES];
	struct delayed_work *dwork = to_delayed_work(work);
	struct blkio_group *blkg;
	int i;
	bool empty = false;

alloc_stats:
	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		if (pcpu_stats[i] != NULL)
			continue;

		pcpu_stats[i] = alloc_percpu(struct blkio_group_stats_cpu);

		/* Allocation failed. Try again after some time. */
		if (pcpu_stats[i] == NULL) {
			queue_delayed_work(system_nrt_wq, dwork,
						msecs_to_jiffies(10));
			return;
		}
	}

	spin_lock_irq(&blkio_list_lock);
	spin_lock(&alloc_list_lock);

	/* cgroup got deleted or queue exited. */
	if (!list_empty(&alloc_list)) {
		blkg = list_first_entry(&alloc_list, struct blkio_group,
						alloc_node);
		for (i = 0; i < BLKIO_NR_POLICIES; i++) {
			struct blkg_policy_data *pd = blkg->pd[i];

			if (blkio_policy[i] && pd && !pd->stats_cpu)
				swap(pd->stats_cpu, pcpu_stats[i]);
		}

		list_del_init(&blkg->alloc_node);
	}

	empty = list_empty(&alloc_list);

	spin_unlock(&alloc_list_lock);
	spin_unlock_irq(&blkio_list_lock);

	if (!empty)
		goto alloc_stats;
}
/**
 * blkg_free - free a blkg
 * @blkg: blkg to free
 *
 * Free @blkg which may be partially allocated.
 */
static void blkg_free(struct blkio_group *blkg)
{
	int i;

	if (!blkg)
		return;

	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkg_policy_data *pd = blkg->pd[i];

		if (pd) {
			free_percpu(pd->stats_cpu);
			kfree(pd);
		}
	}

	kfree(blkg);
}
/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @q: request_queue the new blkg is associated with
 *
 * Allocate a new blkg associating @blkcg and @q.
 */
static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
				      struct request_queue *q)
{
	struct blkio_group *blkg;
	int i;

	/* alloc and init base part */
	blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
	if (!blkg)
		return NULL;

	spin_lock_init(&blkg->stats_lock);
	blkg->q = q;
	INIT_LIST_HEAD(&blkg->q_node);
	INIT_LIST_HEAD(&blkg->alloc_node);
	blkg->blkcg = blkcg;
	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));

	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkio_policy_type *pol = blkio_policy[i];
		struct blkg_policy_data *pd;

		if (!pol)
			continue;

		/* alloc per-policy data and attach it to blkg */
		pd = kzalloc_node(sizeof(*pd) + pol->pdata_size, GFP_ATOMIC,
				  q->node);
		if (!pd) {
			blkg_free(blkg);
			return NULL;
		}

		blkg->pd[i] = pd;
	}

	/* invoke per-policy init */
	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkio_policy_type *pol = blkio_policy[i];

		if (pol)
			pol->ops.blkio_init_group_fn(blkg);
	}

	return blkg;
}
struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
				       struct request_queue *q,
				       enum blkio_policy_id plid,
				       bool for_root)
	__releases(q->queue_lock) __acquires(q->queue_lock)
{
	struct blkio_group *blkg;

	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(q->queue_lock);

	/*
	 * This could be the first entry point of blkcg implementation and
	 * we shouldn't allow anything to go through for a bypassing queue.
	 * The following can be removed if blkg lookup is guaranteed to
	 * fail on a bypassing queue.
	 */
	if (unlikely(blk_queue_bypass(q)) && !for_root)
		return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);

	blkg = blkg_lookup(blkcg, q);
	if (blkg)
		return blkg;

	/* blkg holds a reference to blkcg */
	if (!css_tryget(&blkcg->css))
		return ERR_PTR(-EINVAL);

	/*
	 * Allocate and initialize.
	 */
	blkg = blkg_alloc(blkcg, q);

	/* did alloc fail? */
	if (unlikely(!blkg)) {
		blkg = ERR_PTR(-ENOMEM);
		goto out;
	}

	/* insert */
	spin_lock(&blkcg->lock);
	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
	list_add(&blkg->q_node, &q->blkg_list);
	spin_unlock(&blkcg->lock);

	spin_lock(&alloc_list_lock);
	list_add(&blkg->alloc_node, &alloc_list);
	/* Queue per cpu stat allocation from worker thread. */
	queue_delayed_work(system_nrt_wq, &blkio_stat_alloc_work, 0);
	spin_unlock(&alloc_list_lock);
out:
	return blkg;
}
EXPORT_SYMBOL_GPL(blkg_lookup_create);
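
/*
 * Illustrative sketch (not part of the original file): the calling convention
 * blkg_lookup_create() expects, mirroring the configuration path further down
 * in this file.  The caller must hold the RCU read lock and the queue_lock;
 * "q", "blkcg" and "plid" here are assumed to come from the caller's context.
 */
#if 0
static struct blkio_group *example_get_blkg(struct blkio_cgroup *blkcg,
					    struct request_queue *q,
					    enum blkio_policy_id plid)
{
	struct blkio_group *blkg;

	rcu_read_lock();
	spin_lock_irq(q->queue_lock);
	blkg = blkg_lookup_create(blkcg, q, plid, false);
	spin_unlock_irq(q->queue_lock);
	rcu_read_unlock();

	/* may be an ERR_PTR(), e.g. -EBUSY while the queue is bypassing */
	return blkg;
}
#endif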
/* called under rcu_read_lock(). */
struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
				struct request_queue *q)
{
	struct blkio_group *blkg;
	struct hlist_node *n;

	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node)
		if (blkg->q == q)
			return blkg;
	return NULL;
}
EXPORT_SYMBOL_GPL(blkg_lookup);
static void blkg_destroy(struct blkio_group *blkg)
{
	struct request_queue *q = blkg->q;
	struct blkio_cgroup *blkcg = blkg->blkcg;

	lockdep_assert_held(q->queue_lock);
	lockdep_assert_held(&blkcg->lock);

	/* Something wrong if we are trying to remove same group twice */
	WARN_ON_ONCE(list_empty(&blkg->q_node));
	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
	list_del_init(&blkg->q_node);
	hlist_del_init_rcu(&blkg->blkcg_node);

	spin_lock(&alloc_list_lock);
	list_del_init(&blkg->alloc_node);
	spin_unlock(&alloc_list_lock);

	/*
	 * Put the reference taken at the time of creation so that when all
	 * queues are gone, group can be destroyed.
	 */
	blkg_put(blkg);
}
/*
 * XXX: This updates blkg policy data in-place for root blkg, which is
 * necessary across elevator switch and policy registration as root blkgs
 * aren't shot down.  This broken and racy implementation is temporary.
 * Eventually, blkg shoot down will be replaced by proper in-place update.
 */
void update_root_blkg_pd(struct request_queue *q, enum blkio_policy_id plid)
{
	struct blkio_policy_type *pol = blkio_policy[plid];
	struct blkio_group *blkg = blkg_lookup(&blkio_root_cgroup, q);
	struct blkg_policy_data *pd;

	if (!blkg)
		return;

	kfree(blkg->pd[plid]);
	blkg->pd[plid] = NULL;

	if (!pol)
		return;

	pd = kzalloc(sizeof(*pd) + pol->pdata_size, GFP_KERNEL);
	WARN_ON_ONCE(!pd);

	pd->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
	WARN_ON_ONCE(!pd->stats_cpu);

	blkg->pd[plid] = pd;
	pol->ops.blkio_init_group_fn(blkg);
}
EXPORT_SYMBOL_GPL(update_root_blkg_pd);
/**
 * blkg_destroy_all - destroy all blkgs associated with a request_queue
 * @q: request_queue of interest
 * @destroy_root: whether to destroy root blkg or not
 *
 * Destroy blkgs associated with @q.  If @destroy_root is %true, all are
 * destroyed; otherwise, root blkg is left alone.
 */
void blkg_destroy_all(struct request_queue *q, bool destroy_root)
{
	struct blkio_group *blkg, *n;

	spin_lock_irq(q->queue_lock);

	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
		struct blkio_cgroup *blkcg = blkg->blkcg;

		/* skip root? */
		if (!destroy_root && blkg->blkcg == &blkio_root_cgroup)
			continue;

		spin_lock(&blkcg->lock);
		blkg_destroy(blkg);
		spin_unlock(&blkcg->lock);
	}

	spin_unlock_irq(q->queue_lock);
}
EXPORT_SYMBOL_GPL(blkg_destroy_all);
static void blkg_rcu_free(struct rcu_head *rcu_head)
{
	blkg_free(container_of(rcu_head, struct blkio_group, rcu_head));
}
void __blkg_release(struct blkio_group *blkg)
{
	/* release the extra blkcg reference this blkg has been holding */
	css_put(&blkg->blkcg->css);

	/*
	 * A group is freed in rcu manner. But having an rcu lock does not
	 * mean that one can access all the fields of blkg and assume these
	 * are valid. For example, don't try to follow throtl_data and
	 * request queue links.
	 *
	 * Having a reference to blkg under an rcu allows access to only
	 * values local to groups like group stats and group rate limits.
	 */
	call_rcu(&blkg->rcu_head, blkg_rcu_free);
}
EXPORT_SYMBOL_GPL(__blkg_release);
static void blkio_reset_stats_cpu(struct blkio_group *blkg, int plid)
{
	struct blkg_policy_data *pd = blkg->pd[plid];
	struct blkio_group_stats_cpu *stats_cpu;
	int i, j, k;

	if (pd->stats_cpu == NULL)
		return;
	/*
	 * Note: On 64 bit arch this should not be an issue. This has the
	 * possibility of returning some inconsistent value on 32bit arch
	 * as 64bit update on 32bit is non atomic. Taking care of this
	 * corner case makes code very complicated, like sending IPIs to
	 * cpus, taking care of stats of offline cpus etc.
	 *
	 * reset stats is anyway more of a debug feature and this sounds a
	 * corner case. So I am not complicating the code yet until and
	 * unless this becomes a real issue.
	 */
	for_each_possible_cpu(i) {
		stats_cpu = per_cpu_ptr(pd->stats_cpu, i);
		stats_cpu->sectors = 0;
		for (j = 0; j < BLKIO_STAT_CPU_NR; j++)
			for (k = 0; k < BLKIO_STAT_TOTAL; k++)
				stats_cpu->stat_arr_cpu[j][k] = 0;
	}
}
static int
blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
	struct blkio_cgroup *blkcg;
	struct blkio_group *blkg;
	struct blkio_group_stats *stats;
	struct hlist_node *n;
	uint64_t queued[BLKIO_STAT_TOTAL];
	int i;
#ifdef CONFIG_DEBUG_BLK_CGROUP
	bool idling, waiting, empty;
	unsigned long long now = sched_clock();
#endif

	blkcg = cgroup_to_blkio_cgroup(cgroup);
	spin_lock(&blkio_list_lock);
	spin_lock_irq(&blkcg->lock);
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		struct blkio_policy_type *pol;

		list_for_each_entry(pol, &blkio_list, list) {
			struct blkg_policy_data *pd = blkg->pd[pol->plid];

			spin_lock(&blkg->stats_lock);
			stats = &pd->stats;
#ifdef CONFIG_DEBUG_BLK_CGROUP
			idling = blkio_blkg_idling(stats);
			waiting = blkio_blkg_waiting(stats);
			empty = blkio_blkg_empty(stats);
#endif
			for (i = 0; i < BLKIO_STAT_TOTAL; i++)
				queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
			memset(stats, 0, sizeof(struct blkio_group_stats));
			for (i = 0; i < BLKIO_STAT_TOTAL; i++)
				stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
#ifdef CONFIG_DEBUG_BLK_CGROUP
			if (idling) {
				blkio_mark_blkg_idling(stats);
				stats->start_idle_time = now;
			}
			if (waiting) {
				blkio_mark_blkg_waiting(stats);
				stats->start_group_wait_time = now;
			}
			if (empty) {
				blkio_mark_blkg_empty(stats);
				stats->start_empty_time = now;
			}
#endif
			spin_unlock(&blkg->stats_lock);

			/* Reset Per cpu stats which don't take blkg->stats_lock */
			blkio_reset_stats_cpu(blkg, pol->plid);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	spin_unlock(&blkio_list_lock);
	return 0;
}
static void blkio_get_key_name(enum stat_sub_type type, const char *dname,
			       char *str, int chars_left, bool diskname_only)
{
	snprintf(str, chars_left, "%s", dname);
	chars_left -= strlen(str);
	if (chars_left <= 0) {
		printk(KERN_WARNING
			"Possibly incorrect cgroup stat display format");
		return;
	}
	if (diskname_only)
		return;
	switch (type) {
	case BLKIO_STAT_READ:
		strlcat(str, " Read", chars_left);
		break;
	case BLKIO_STAT_WRITE:
		strlcat(str, " Write", chars_left);
		break;
	case BLKIO_STAT_SYNC:
		strlcat(str, " Sync", chars_left);
		break;
	case BLKIO_STAT_ASYNC:
		strlcat(str, " Async", chars_left);
		break;
	case BLKIO_STAT_TOTAL:
		strlcat(str, " Total", chars_left);
		break;
	default:
		strlcat(str, " Invalid", chars_left);
	}
}
static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
				struct cgroup_map_cb *cb, const char *dname)
{
	blkio_get_key_name(0, dname, str, chars_left, true);
	cb->fill(cb, str, val);
	return val;
}
static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg, int plid,
			enum stat_type_cpu type, enum stat_sub_type sub_type)
{
	struct blkg_policy_data *pd = blkg->pd[plid];
	int cpu;
	struct blkio_group_stats_cpu *stats_cpu;
	u64 val = 0, tval;

	if (pd->stats_cpu == NULL)
		return val;

	for_each_possible_cpu(cpu) {
		unsigned int start;
		stats_cpu = per_cpu_ptr(pd->stats_cpu, cpu);

		do {
			start = u64_stats_fetch_begin(&stats_cpu->syncp);
			if (type == BLKIO_STAT_CPU_SECTORS)
				tval = stats_cpu->sectors;
			else
				tval = stats_cpu->stat_arr_cpu[type][sub_type];
		} while (u64_stats_fetch_retry(&stats_cpu->syncp, start));

		val += tval;
	}

	return val;
}
static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg, int plid,
				   struct cgroup_map_cb *cb, const char *dname,
				   enum stat_type_cpu type)
{
	uint64_t disk_total, val;
	char key_str[MAX_KEY_LEN];
	enum stat_sub_type sub_type;

	if (type == BLKIO_STAT_CPU_SECTORS) {
		val = blkio_read_stat_cpu(blkg, plid, type, 0);
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb,
				       dname);
	}

	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
			sub_type++) {
		blkio_get_key_name(sub_type, dname, key_str, MAX_KEY_LEN,
				   false);
		val = blkio_read_stat_cpu(blkg, plid, type, sub_type);
		cb->fill(cb, key_str, val);
	}

	disk_total = blkio_read_stat_cpu(blkg, plid, type, BLKIO_STAT_READ) +
		     blkio_read_stat_cpu(blkg, plid, type, BLKIO_STAT_WRITE);

	blkio_get_key_name(BLKIO_STAT_TOTAL, dname, key_str, MAX_KEY_LEN,
			   false);
	cb->fill(cb, key_str, disk_total);
	return disk_total;
}
/* This should be called with blkg->stats_lock held */
static uint64_t blkio_get_stat(struct blkio_group *blkg, int plid,
			       struct cgroup_map_cb *cb, const char *dname,
			       enum stat_type type)
{
	struct blkg_policy_data *pd = blkg->pd[plid];
	uint64_t disk_total;
	char key_str[MAX_KEY_LEN];
	enum stat_sub_type sub_type;

	if (type == BLKIO_STAT_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					pd->stats.time, cb, dname);
#ifdef CONFIG_DEBUG_BLK_CGROUP
	if (type == BLKIO_STAT_UNACCOUNTED_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					pd->stats.unaccounted_time, cb, dname);
	if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
		uint64_t sum = pd->stats.avg_queue_size_sum;
		uint64_t samples = pd->stats.avg_queue_size_samples;

		if (samples)
			do_div(sum, samples);
		else
			sum = 0;
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					sum, cb, dname);
	}
	if (type == BLKIO_STAT_GROUP_WAIT_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					pd->stats.group_wait_time, cb, dname);
	if (type == BLKIO_STAT_IDLE_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					pd->stats.idle_time, cb, dname);
	if (type == BLKIO_STAT_EMPTY_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					pd->stats.empty_time, cb, dname);
	if (type == BLKIO_STAT_DEQUEUE)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					pd->stats.dequeue, cb, dname);
#endif

	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
			sub_type++) {
		blkio_get_key_name(sub_type, dname, key_str, MAX_KEY_LEN,
				   false);
		cb->fill(cb, key_str, pd->stats.stat_arr[type][sub_type]);
	}
	disk_total = pd->stats.stat_arr[type][BLKIO_STAT_READ] +
			pd->stats.stat_arr[type][BLKIO_STAT_WRITE];
	blkio_get_key_name(BLKIO_STAT_TOTAL, dname, key_str, MAX_KEY_LEN,
			   false);
	cb->fill(cb, key_str, disk_total);
	return disk_total;
}
static int blkio_policy_parse_and_set(char *buf, enum blkio_policy_id plid,
				      int fileid, struct blkio_cgroup *blkcg)
{
	struct gendisk *disk = NULL;
	struct blkio_group *blkg = NULL;
	struct blkg_policy_data *pd;
	char *s[4], *p, *major_s = NULL, *minor_s = NULL;
	unsigned long major, minor;
	int i = 0, ret = -EINVAL;
	int part;
	dev_t dev;
	u64 temp;

	memset(s, 0, sizeof(s));

	while ((p = strsep(&buf, " ")) != NULL) {
		if (!*p)
			continue;

		s[i++] = p;

		/* Prevent inputting too many things */
		if (i == 3)
			break;
	}

	if (i != 2)
		goto out;

	p = strsep(&s[0], ":");
	if (p != NULL)
		major_s = p;
	else
		goto out;

	minor_s = s[0];
	if (!minor_s)
		goto out;

	if (strict_strtoul(major_s, 10, &major))
		goto out;

	if (strict_strtoul(minor_s, 10, &minor))
		goto out;

	dev = MKDEV(major, minor);

	if (strict_strtoull(s[1], 10, &temp))
		goto out;

	disk = get_gendisk(dev, &part);
	if (!disk || part)
		goto out;

	rcu_read_lock();

	spin_lock_irq(disk->queue->queue_lock);
	blkg = blkg_lookup_create(blkcg, disk->queue, plid, false);
	spin_unlock_irq(disk->queue->queue_lock);

	if (IS_ERR(blkg)) {
		ret = PTR_ERR(blkg);
		goto out_unlock;
	}

	pd = blkg->pd[plid];

	switch (plid) {
	case BLKIO_POLICY_PROP:
		if ((temp < BLKIO_WEIGHT_MIN && temp > 0) ||
		     temp > BLKIO_WEIGHT_MAX)
			goto out_unlock;

		pd->conf.weight = temp;
		blkio_update_group_weight(blkg, plid, temp ?: blkcg->weight);
		break;
	case BLKIO_POLICY_THROTL:
		switch (fileid) {
		case BLKIO_THROTL_read_bps_device:
			pd->conf.bps[READ] = temp;
			blkio_update_group_bps(blkg, plid, temp ?: -1, fileid);
			break;
		case BLKIO_THROTL_write_bps_device:
			pd->conf.bps[WRITE] = temp;
			blkio_update_group_bps(blkg, plid, temp ?: -1, fileid);
			break;
		case BLKIO_THROTL_read_iops_device:
			if (temp > THROTL_IOPS_MAX)
				goto out_unlock;
			pd->conf.iops[READ] = temp;
			blkio_update_group_iops(blkg, plid, temp ?: -1, fileid);
			break;
		case BLKIO_THROTL_write_iops_device:
			if (temp > THROTL_IOPS_MAX)
				goto out_unlock;
			pd->conf.iops[WRITE] = temp;
			blkio_update_group_iops(blkg, plid, temp ?: -1, fileid);
			break;
		}
		break;
	default:
		BUG();
	}
	ret = 0;
out_unlock:
	rcu_read_unlock();
out:
	put_disk(disk);

	/*
	 * If queue was bypassing, we should retry.  Do so after a short
	 * msleep().  It isn't strictly necessary but queue can be
	 * bypassing for some time and it's always nice to avoid busy
	 * looping.
	 */
	if (ret == -EBUSY) {
		msleep(10);
		return restart_syscall();
	}
	return ret;
}
static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
			      const char *buffer)
{
	int ret;
	char *buf;
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int fileid = BLKIOFILE_ATTR(cft->private);

	buf = kstrdup(buffer, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	ret = blkio_policy_parse_and_set(buf, plid, fileid, blkcg);
	kfree(buf);
	return ret;
}
*blkg
)
1167 /* some drivers (floppy) instantiate a queue w/o disk registered */
1168 if (blkg
->q
->backing_dev_info
.dev
)
1169 return dev_name(blkg
->q
->backing_dev_info
.dev
);
static void blkio_print_group_conf(struct cftype *cft, struct blkio_group *blkg,
				   struct seq_file *m)
{
	int plid = BLKIOFILE_POLICY(cft->private);
	int fileid = BLKIOFILE_ATTR(cft->private);
	struct blkg_policy_data *pd = blkg->pd[plid];
	const char *dname = blkg_dev_name(blkg);
	int rw = WRITE;

	if (!dname)
		return;

	switch (plid) {
	case BLKIO_POLICY_PROP:
		if (pd->conf.weight)
			seq_printf(m, "%s\t%u\n",
				   dname, pd->conf.weight);
		break;
	case BLKIO_POLICY_THROTL:
		switch (fileid) {
		case BLKIO_THROTL_read_bps_device:
			rw = READ;
		case BLKIO_THROTL_write_bps_device:
			if (pd->conf.bps[rw])
				seq_printf(m, "%s\t%llu\n",
					   dname, pd->conf.bps[rw]);
			break;
		case BLKIO_THROTL_read_iops_device:
			rw = READ;
		case BLKIO_THROTL_write_iops_device:
			if (pd->conf.iops[rw])
				seq_printf(m, "%s\t%u\n",
					   dname, pd->conf.iops[rw]);
			break;
		}
		break;
	default:
		BUG();
	}
}
/* cgroup files which read their data from policy nodes end up here */
static void blkio_read_conf(struct cftype *cft, struct blkio_cgroup *blkcg,
			    struct seq_file *m)
{
	struct blkio_group *blkg;
	struct hlist_node *n;

	spin_lock_irq(&blkcg->lock);
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
		blkio_print_group_conf(cft, blkg, m);
	spin_unlock_irq(&blkcg->lock);
}
static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
			     struct seq_file *m)
{
	struct blkio_cgroup *blkcg;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int name = BLKIOFILE_ATTR(cft->private);

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	switch (plid) {
	case BLKIO_POLICY_PROP:
		switch (name) {
		case BLKIO_PROP_weight_device:
			blkio_read_conf(cft, blkcg, m);
			return 0;
		default:
			BUG();
		}
		break;
	case BLKIO_POLICY_THROTL:
		switch (name) {
		case BLKIO_THROTL_read_bps_device:
		case BLKIO_THROTL_write_bps_device:
		case BLKIO_THROTL_read_iops_device:
		case BLKIO_THROTL_write_iops_device:
			blkio_read_conf(cft, blkcg, m);
			return 0;
		default:
			BUG();
		}
		break;
	default:
		BUG();
	}

	return 0;
}
static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
		struct cftype *cft, struct cgroup_map_cb *cb,
		enum stat_type type, bool show_total, bool pcpu)
{
	struct blkio_group *blkg;
	struct hlist_node *n;
	uint64_t cgroup_total = 0;

	spin_lock_irq(&blkcg->lock);

	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		const char *dname = blkg_dev_name(blkg);
		int plid = BLKIOFILE_POLICY(cft->private);

		if (!dname)
			continue;
		if (pcpu) {
			cgroup_total += blkio_get_stat_cpu(blkg, plid,
							   cb, dname, type);
		} else {
			spin_lock(&blkg->stats_lock);
			cgroup_total += blkio_get_stat(blkg, plid,
						       cb, dname, type);
			spin_unlock(&blkg->stats_lock);
		}
	}
	if (show_total)
		cb->fill(cb, "Total", cgroup_total);

	spin_unlock_irq(&blkcg->lock);
	return 0;
}
/* All map-type cgroup files get serviced by this function */
static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
				 struct cgroup_map_cb *cb)
{
	struct blkio_cgroup *blkcg;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int name = BLKIOFILE_ATTR(cft->private);

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	switch (plid) {
	case BLKIO_POLICY_PROP:
		switch (name) {
		case BLKIO_PROP_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_TIME, 0, 0);
		case BLKIO_PROP_sectors:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_CPU_SECTORS, 0, 1);
		case BLKIO_PROP_io_service_bytes:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
		case BLKIO_PROP_io_serviced:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_CPU_SERVICED, 1, 1);
		case BLKIO_PROP_io_service_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_SERVICE_TIME, 1, 0);
		case BLKIO_PROP_io_wait_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_WAIT_TIME, 1, 0);
		case BLKIO_PROP_io_merged:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_MERGED, 1, 0);
		case BLKIO_PROP_io_queued:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_QUEUED, 1, 0);
#ifdef CONFIG_DEBUG_BLK_CGROUP
		case BLKIO_PROP_unaccounted_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_UNACCOUNTED_TIME, 0, 0);
		case BLKIO_PROP_dequeue:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_DEQUEUE, 0, 0);
		case BLKIO_PROP_avg_queue_size:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0);
		case BLKIO_PROP_group_wait_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_GROUP_WAIT_TIME, 0, 0);
		case BLKIO_PROP_idle_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_IDLE_TIME, 0, 0);
		case BLKIO_PROP_empty_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_EMPTY_TIME, 0, 0);
#endif
		default:
			BUG();
		}
		break;
	case BLKIO_POLICY_THROTL:
		switch (name) {
		case BLKIO_THROTL_io_service_bytes:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
		case BLKIO_THROTL_io_serviced:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_CPU_SERVICED, 1, 1);
		default:
			BUG();
		}
		break;
	default:
		BUG();
	}

	return 0;
}
static int blkio_weight_write(struct blkio_cgroup *blkcg, int plid, u64 val)
{
	struct blkio_group *blkg;
	struct hlist_node *n;

	if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
		return -EINVAL;

	spin_lock(&blkio_list_lock);
	spin_lock_irq(&blkcg->lock);
	blkcg->weight = (unsigned int)val;

	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		struct blkg_policy_data *pd = blkg->pd[plid];

		if (!pd->conf.weight)
			blkio_update_group_weight(blkg, plid, blkcg->weight);
	}

	spin_unlock_irq(&blkcg->lock);
	spin_unlock(&blkio_list_lock);
	return 0;
}
static u64 blkiocg_file_read_u64(struct cgroup *cgrp, struct cftype *cft)
{
	struct blkio_cgroup *blkcg;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int name = BLKIOFILE_ATTR(cft->private);

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	switch (plid) {
	case BLKIO_POLICY_PROP:
		switch (name) {
		case BLKIO_PROP_weight:
			return (u64)blkcg->weight;
		default:
			BUG();
		}
		break;
	default:
		BUG();
	}

	return 0;
}
static int
blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
	struct blkio_cgroup *blkcg;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int name = BLKIOFILE_ATTR(cft->private);

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	switch (plid) {
	case BLKIO_POLICY_PROP:
		switch (name) {
		case BLKIO_PROP_weight:
			return blkio_weight_write(blkcg, plid, val);
		default:
			BUG();
		}
		break;
	default:
		BUG();
	}

	return 0;
}
struct cftype blkio_files[] = {
	{
		.name = "weight_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_weight_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},
	{
		.name = "weight",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_weight),
		.read_u64 = blkiocg_file_read_u64,
		.write_u64 = blkiocg_file_write_u64,
	},
	{
		.name = "time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "sectors",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_sectors),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_service_bytes",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_service_bytes),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_serviced",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_serviced),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_service_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_service_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_wait_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_wait_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_merged",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_merged),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_queued",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_queued),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "reset_stats",
		.write_u64 = blkiocg_reset_stats,
	},
#ifdef CONFIG_BLK_DEV_THROTTLING
	{
		.name = "throttle.read_bps_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_read_bps_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},
	{
		.name = "throttle.write_bps_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_write_bps_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},
	{
		.name = "throttle.read_iops_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_read_iops_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},
	{
		.name = "throttle.write_iops_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_write_iops_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},
	{
		.name = "throttle.io_service_bytes",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_io_service_bytes),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "throttle.io_serviced",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_io_serviced),
		.read_map = blkiocg_file_read_map,
	},
#endif /* CONFIG_BLK_DEV_THROTTLING */

#ifdef CONFIG_DEBUG_BLK_CGROUP
	{
		.name = "avg_queue_size",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_avg_queue_size),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "group_wait_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_group_wait_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "idle_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_idle_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "empty_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_empty_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "dequeue",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_dequeue),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "unaccounted_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_unaccounted_time),
		.read_map = blkiocg_file_read_map,
	},
#endif
};
static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	return cgroup_add_files(cgroup, subsys, blkio_files,
				ARRAY_SIZE(blkio_files));
}
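
/*
 * Illustrative usage (not part of the original file): once blkiocg_populate()
 * has added blkio_files[] to a cgroup directory, the interface is driven from
 * userspace roughly like this (mount point and device numbers are examples
 * only):
 *
 *	# echo 500 > /sys/fs/cgroup/blkio/grp/blkio.weight
 *	# echo "8:16 1048576" > /sys/fs/cgroup/blkio/grp/blkio.throttle.read_bps_device
 *	# cat /sys/fs/cgroup/blkio/grp/blkio.io_service_bytes
 *
 * Writes are routed to blkiocg_file_write()/blkiocg_file_write_u64() and
 * reads to blkiocg_file_read()/blkiocg_file_read_map() above, dispatched via
 * the BLKIOFILE_PRIVATE() value stored in each cftype's ->private field.
 */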
/**
 * blkiocg_pre_destroy - cgroup pre_destroy callback
 * @subsys: cgroup subsys
 * @cgroup: cgroup of interest
 *
 * This function is called when @cgroup is about to go away and responsible
 * for shooting down all blkgs associated with @cgroup.  blkgs should be
 * removed while holding both q and blkcg locks.  As blkcg lock is nested
 * inside q lock, this function performs reverse double lock dancing.
 *
 * This is the blkcg counterpart of ioc_release_fn().
 */
static int blkiocg_pre_destroy(struct cgroup_subsys *subsys,
			       struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);

	spin_lock_irq(&blkcg->lock);

	while (!hlist_empty(&blkcg->blkg_list)) {
		struct blkio_group *blkg = hlist_entry(blkcg->blkg_list.first,
						struct blkio_group, blkcg_node);
		struct request_queue *q = blkg->q;

		if (spin_trylock(q->queue_lock)) {
			blkg_destroy(blkg);
			spin_unlock(q->queue_lock);
		} else {
			spin_unlock_irq(&blkcg->lock);
			cpu_relax();
			spin_lock_irq(&blkcg->lock);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	return 0;
}
static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);

	if (blkcg != &blkio_root_cgroup)
		kfree(blkcg);
}
static struct cgroup_subsys_state *
blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg;
	struct cgroup *parent = cgroup->parent;

	if (!parent) {
		blkcg = &blkio_root_cgroup;
		goto done;
	}

	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
	if (!blkcg)
		return ERR_PTR(-ENOMEM);

	blkcg->weight = BLKIO_WEIGHT_DEFAULT;
done:
	spin_lock_init(&blkcg->lock);
	INIT_HLIST_HEAD(&blkcg->blkg_list);

	return &blkcg->css;
}
/**
 * blkcg_init_queue - initialize blkcg part of request queue
 * @q: request_queue to initialize
 *
 * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
 * part of new request_queue @q.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int blkcg_init_queue(struct request_queue *q)
{
	int ret;

	ret = blk_throtl_init(q);
	if (ret)
		return ret;

	mutex_lock(&all_q_mutex);
	INIT_LIST_HEAD(&q->all_q_node);
	list_add_tail(&q->all_q_node, &all_q_list);
	mutex_unlock(&all_q_mutex);

	return 0;
}
/**
 * blkcg_drain_queue - drain blkcg part of request_queue
 * @q: request_queue to drain
 *
 * Called from blk_drain_queue().  Responsible for draining blkcg part.
 */
void blkcg_drain_queue(struct request_queue *q)
{
	lockdep_assert_held(q->queue_lock);

	blk_throtl_drain(q);
}
/**
 * blkcg_exit_queue - exit and release blkcg part of request_queue
 * @q: request_queue being released
 *
 * Called from blk_release_queue().  Responsible for exiting blkcg part.
 */
void blkcg_exit_queue(struct request_queue *q)
{
	mutex_lock(&all_q_mutex);
	list_del_init(&q->all_q_node);
	mutex_unlock(&all_q_mutex);

	blkg_destroy_all(q, true);
}
/*
 * We cannot support shared io contexts, as we have no means to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures.  For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkiocg_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
			      struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct io_context *ioc;
	int ret = 0;

	/* task_lock() is needed to avoid races with exit_io_context() */
	cgroup_taskset_for_each(task, cgrp, tset) {
		task_lock(task);
		ioc = task->io_context;
		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
			ret = -EINVAL;
		task_unlock(task);
		if (ret)
			break;
	}
	return ret;
}
static void blkiocg_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
			   struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct io_context *ioc;

	cgroup_taskset_for_each(task, cgrp, tset) {
		/* we don't lose anything even if ioc allocation fails */
		ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
		if (ioc) {
			ioc_cgroup_changed(ioc);
			put_io_context(ioc);
		}
	}
}
static void blkcg_bypass_start(void)
	__acquires(&all_q_mutex)
{
	struct request_queue *q;

	mutex_lock(&all_q_mutex);

	list_for_each_entry(q, &all_q_list, all_q_node) {
		blk_queue_bypass_start(q);
		blkg_destroy_all(q, false);
	}
}
static void blkcg_bypass_end(void)
	__releases(&all_q_mutex)
{
	struct request_queue *q;

	list_for_each_entry(q, &all_q_list, all_q_node)
		blk_queue_bypass_end(q);

	mutex_unlock(&all_q_mutex);
}
void blkio_policy_register(struct blkio_policy_type *blkiop)
{
	struct request_queue *q;

	blkcg_bypass_start();
	spin_lock(&blkio_list_lock);

	BUG_ON(blkio_policy[blkiop->plid]);
	blkio_policy[blkiop->plid] = blkiop;
	list_add_tail(&blkiop->list, &blkio_list);

	spin_unlock(&blkio_list_lock);
	list_for_each_entry(q, &all_q_list, all_q_node)
		update_root_blkg_pd(q, blkiop->plid);
	blkcg_bypass_end();
}
EXPORT_SYMBOL_GPL(blkio_policy_register);
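
/*
 * Illustrative sketch (not part of the original file): the rough shape of a
 * blkio policy registering itself, as CFQ and blk-throttle do.  The fields
 * used here (plid, pdata_size, ops.blkio_init_group_fn) are inferred from how
 * this file dereferences struct blkio_policy_type; the policy data type and
 * init function are made up for the example.
 */
#if 0
struct example_policy_data {
	unsigned int weight;
};

static void example_init_group_fn(struct blkio_group *blkg)
{
	/* per-blkg, per-policy data was already allocated by blkg_alloc() */
}

static struct blkio_policy_type blkio_policy_example = {
	.ops.blkio_init_group_fn	= example_init_group_fn,
	.plid				= BLKIO_POLICY_PROP,
	.pdata_size			= sizeof(struct example_policy_data),
};

static int __init example_init(void)
{
	blkio_policy_register(&blkio_policy_example);
	return 0;
}

static void __exit example_exit(void)
{
	blkio_policy_unregister(&blkio_policy_example);
}
#endif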
void blkio_policy_unregister(struct blkio_policy_type *blkiop)
{
	struct request_queue *q;

	blkcg_bypass_start();
	spin_lock(&blkio_list_lock);

	BUG_ON(blkio_policy[blkiop->plid] != blkiop);
	blkio_policy[blkiop->plid] = NULL;
	list_del_init(&blkiop->list);

	spin_unlock(&blkio_list_lock);

	list_for_each_entry(q, &all_q_list, all_q_node)
		update_root_blkg_pd(q, blkiop->plid);

	blkcg_bypass_end();
}
EXPORT_SYMBOL_GPL(blkio_policy_unregister);