Commit | Line | Data |
---|---|---|
cafe5635 KO |
1 | /* |
2 | * background writeback - scan btree for dirty data and write it to the backing | |
3 | * device | |
4 | * | |
5 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | |
6 | * Copyright 2012 Google, Inc. | |
7 | */ | |
8 | ||
9 | #include "bcache.h" | |
10 | #include "btree.h" | |
11 | #include "debug.h" | |
279afbad | 12 | #include "writeback.h" |
cafe5635 | 13 | |
5e6926da KO |
14 | #include <linux/delay.h> |
15 | #include <linux/freezer.h> | |
16 | #include <linux/kthread.h> | |
c37511b8 KO |
17 | #include <trace/events/bcache.h> |
18 | ||
cafe5635 KO |
19 | /* Rate limiting */ |
20 | ||
21 | static void __update_writeback_rate(struct cached_dev *dc) | |
22 | { | |
23 | struct cache_set *c = dc->disk.c; | |
24 | uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size; | |
25 | uint64_t cache_dirty_target = | |
26 | div_u64(cache_sectors * dc->writeback_percent, 100); | |
27 | ||
28 | int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev), | |
29 | c->cached_dev_sectors); | |
30 | ||
31 | /* PD controller */ | |
32 | ||
279afbad | 33 | int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); |
cafe5635 | 34 | int64_t derivative = dirty - dc->disk.sectors_dirty_last; |
16749c23 KO |
35 | int64_t proportional = dirty - target; |
36 | int64_t change; | |
cafe5635 KO |
37 | |
38 | dc->disk.sectors_dirty_last = dirty; | |
39 | ||
16749c23 | 40 | /* Scale to sectors per second */ |
cafe5635 | 41 | |
16749c23 KO |
42 | proportional *= dc->writeback_rate_update_seconds; |
43 | proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse); | |
cafe5635 | 44 | |
16749c23 | 45 | derivative = div_s64(derivative, dc->writeback_rate_update_seconds); |
cafe5635 | 46 | |
16749c23 KO |
47 | derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative, |
48 | (dc->writeback_rate_d_term / | |
49 | dc->writeback_rate_update_seconds) ?: 1, 0); | |
50 | ||
51 | derivative *= dc->writeback_rate_d_term; | |
52 | derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse); | |
cafe5635 | 53 | |
16749c23 | 54 | change = proportional + derivative; |
cafe5635 KO |
55 | |
56 | /* Don't increase writeback rate if the device isn't keeping up */ | |
57 | if (change > 0 && | |
58 | time_after64(local_clock(), | |
16749c23 | 59 | dc->writeback_rate.next + NSEC_PER_MSEC)) |
cafe5635 KO |
60 | change = 0; |
61 | ||
62 | dc->writeback_rate.rate = | |
16749c23 | 63 | clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change, |
cafe5635 | 64 | 1, NSEC_PER_MSEC); |
16749c23 KO |
65 | |
66 | dc->writeback_rate_proportional = proportional; | |
cafe5635 KO |
67 | dc->writeback_rate_derivative = derivative; |
68 | dc->writeback_rate_change = change; | |
69 | dc->writeback_rate_target = target; | |
cafe5635 KO |
70 | } |
71 | ||
72 | static void update_writeback_rate(struct work_struct *work) | |
73 | { | |
74 | struct cached_dev *dc = container_of(to_delayed_work(work), | |
75 | struct cached_dev, | |
76 | writeback_rate_update); | |
77 | ||
78 | down_read(&dc->writeback_lock); | |
79 | ||
80 | if (atomic_read(&dc->has_dirty) && | |
81 | dc->writeback_percent) | |
82 | __update_writeback_rate(dc); | |
83 | ||
84 | up_read(&dc->writeback_lock); | |
5e6926da KO |
85 | |
86 | schedule_delayed_work(&dc->writeback_rate_update, | |
87 | dc->writeback_rate_update_seconds * HZ); | |
cafe5635 KO |
88 | } |
89 | ||
90 | static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) | |
91 | { | |
c4d951dd | 92 | if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || |
cafe5635 KO |
93 | !dc->writeback_percent) |
94 | return 0; | |
95 | ||
16749c23 | 96 | return bch_next_delay(&dc->writeback_rate, sectors); |
cafe5635 KO |
97 | } |
98 | ||
5e6926da KO |
/*
 * State for writing back one dirty key: the data is read from the cache and
 * then written to the backing device, reusing the same embedded bio for both
 * steps.
 */
struct dirty_io {
	/* Closure that sequences read_dirty_submit -> write_dirty -> write_dirty_finish */
	struct closure cl;
	/* Cached device whose dirty data is being written back */
	struct cached_dev *dc;
	/*
	 * Must remain the last member: read_dirty() allocates the bio_vec
	 * array directly behind this struct and dirty_init() points
	 * bi_io_vec at bio.bi_inline_vecs.
	 */
	struct bio bio;
};
72c27061 | 104 | |
cafe5635 KO |
105 | static void dirty_init(struct keybuf_key *w) |
106 | { | |
107 | struct dirty_io *io = w->private; | |
108 | struct bio *bio = &io->bio; | |
109 | ||
110 | bio_init(bio); | |
111 | if (!io->dc->writeback_percent) | |
112 | bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); | |
113 | ||
4f024f37 | 114 | bio->bi_iter.bi_size = KEY_SIZE(&w->key) << 9; |
cafe5635 KO |
115 | bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS); |
116 | bio->bi_private = w; | |
117 | bio->bi_io_vec = bio->bi_inline_vecs; | |
169ef1cf | 118 | bch_bio_map(bio, NULL); |
cafe5635 KO |
119 | } |
120 | ||
cafe5635 KO |
121 | static void dirty_io_destructor(struct closure *cl) |
122 | { | |
123 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | |
124 | kfree(io); | |
125 | } | |
126 | ||
127 | static void write_dirty_finish(struct closure *cl) | |
128 | { | |
129 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | |
130 | struct keybuf_key *w = io->bio.bi_private; | |
131 | struct cached_dev *dc = io->dc; | |
8e51e414 KO |
132 | struct bio_vec *bv; |
133 | int i; | |
cafe5635 | 134 | |
8e51e414 | 135 | bio_for_each_segment_all(bv, &io->bio, i) |
cafe5635 KO |
136 | __free_page(bv->bv_page); |
137 | ||
138 | /* This is kind of a dumb way of signalling errors. */ | |
139 | if (KEY_DIRTY(&w->key)) { | |
cc7b8819 | 140 | int ret; |
cafe5635 | 141 | unsigned i; |
0b93207a KO |
142 | struct keylist keys; |
143 | ||
0b93207a | 144 | bch_keylist_init(&keys); |
cafe5635 | 145 | |
1b207d80 KO |
146 | bkey_copy(keys.top, &w->key); |
147 | SET_KEY_DIRTY(keys.top, false); | |
148 | bch_keylist_push(&keys); | |
cafe5635 KO |
149 | |
150 | for (i = 0; i < KEY_PTRS(&w->key); i++) | |
151 | atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); | |
152 | ||
cc7b8819 | 153 | ret = bch_btree_insert(dc->disk.c, &keys, NULL, &w->key); |
cafe5635 | 154 | |
6054c6d4 | 155 | if (ret) |
c37511b8 KO |
156 | trace_bcache_writeback_collision(&w->key); |
157 | ||
6054c6d4 | 158 | atomic_long_inc(ret |
cafe5635 KO |
159 | ? &dc->disk.c->writeback_keys_failed |
160 | : &dc->disk.c->writeback_keys_done); | |
161 | } | |
162 | ||
163 | bch_keybuf_del(&dc->writeback_keys, w); | |
c2a4f318 | 164 | up(&dc->in_flight); |
cafe5635 KO |
165 | |
166 | closure_return_with_destructor(cl, dirty_io_destructor); | |
167 | } | |
168 | ||
169 | static void dirty_endio(struct bio *bio, int error) | |
170 | { | |
171 | struct keybuf_key *w = bio->bi_private; | |
172 | struct dirty_io *io = w->private; | |
173 | ||
174 | if (error) | |
175 | SET_KEY_DIRTY(&w->key, false); | |
176 | ||
177 | closure_put(&io->cl); | |
178 | } | |
179 | ||
180 | static void write_dirty(struct closure *cl) | |
181 | { | |
182 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | |
183 | struct keybuf_key *w = io->bio.bi_private; | |
184 | ||
185 | dirty_init(w); | |
186 | io->bio.bi_rw = WRITE; | |
4f024f37 | 187 | io->bio.bi_iter.bi_sector = KEY_START(&w->key); |
cafe5635 KO |
188 | io->bio.bi_bdev = io->dc->bdev; |
189 | io->bio.bi_end_io = dirty_endio; | |
190 | ||
cafe5635 KO |
191 | closure_bio_submit(&io->bio, cl, &io->dc->disk); |
192 | ||
c2a4f318 | 193 | continue_at(cl, write_dirty_finish, system_wq); |
cafe5635 KO |
194 | } |
195 | ||
196 | static void read_dirty_endio(struct bio *bio, int error) | |
197 | { | |
198 | struct keybuf_key *w = bio->bi_private; | |
199 | struct dirty_io *io = w->private; | |
200 | ||
201 | bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), | |
202 | error, "reading dirty data from cache"); | |
203 | ||
204 | dirty_endio(bio, error); | |
205 | } | |
206 | ||
207 | static void read_dirty_submit(struct closure *cl) | |
208 | { | |
209 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | |
210 | ||
cafe5635 KO |
211 | closure_bio_submit(&io->bio, cl, &io->dc->disk); |
212 | ||
c2a4f318 | 213 | continue_at(cl, write_dirty, system_wq); |
cafe5635 KO |
214 | } |
215 | ||
5e6926da | 216 | static void read_dirty(struct cached_dev *dc) |
cafe5635 | 217 | { |
5e6926da | 218 | unsigned delay = 0; |
cafe5635 KO |
219 | struct keybuf_key *w; |
220 | struct dirty_io *io; | |
5e6926da KO |
221 | struct closure cl; |
222 | ||
223 | closure_init_stack(&cl); | |
cafe5635 KO |
224 | |
225 | /* | |
226 | * XXX: if we error, background writeback just spins. Should use some | |
227 | * mempools. | |
228 | */ | |
229 | ||
5e6926da KO |
230 | while (!kthread_should_stop()) { |
231 | try_to_freeze(); | |
232 | ||
cafe5635 KO |
233 | w = bch_keybuf_next(&dc->writeback_keys); |
234 | if (!w) | |
235 | break; | |
236 | ||
237 | BUG_ON(ptr_stale(dc->disk.c, &w->key, 0)); | |
238 | ||
5e6926da KO |
239 | if (KEY_START(&w->key) != dc->last_read || |
240 | jiffies_to_msecs(delay) > 50) | |
241 | while (!kthread_should_stop() && delay) | |
9e5c3535 | 242 | delay = schedule_timeout_interruptible(delay); |
cafe5635 KO |
243 | |
244 | dc->last_read = KEY_OFFSET(&w->key); | |
245 | ||
246 | io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec) | |
247 | * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), | |
248 | GFP_KERNEL); | |
249 | if (!io) | |
250 | goto err; | |
251 | ||
252 | w->private = io; | |
253 | io->dc = dc; | |
254 | ||
255 | dirty_init(w); | |
4f024f37 | 256 | io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0); |
cafe5635 KO |
257 | io->bio.bi_bdev = PTR_CACHE(dc->disk.c, |
258 | &w->key, 0)->bdev; | |
259 | io->bio.bi_rw = READ; | |
260 | io->bio.bi_end_io = read_dirty_endio; | |
261 | ||
8e51e414 | 262 | if (bio_alloc_pages(&io->bio, GFP_KERNEL)) |
cafe5635 KO |
263 | goto err_free; |
264 | ||
c37511b8 | 265 | trace_bcache_writeback(&w->key); |
cafe5635 | 266 | |
c2a4f318 | 267 | down(&dc->in_flight); |
5e6926da | 268 | closure_call(&io->cl, read_dirty_submit, NULL, &cl); |
cafe5635 KO |
269 | |
270 | delay = writeback_delay(dc, KEY_SIZE(&w->key)); | |
cafe5635 KO |
271 | } |
272 | ||
273 | if (0) { | |
274 | err_free: | |
275 | kfree(w->private); | |
276 | err: | |
277 | bch_keybuf_del(&dc->writeback_keys, w); | |
278 | } | |
279 | ||
c2a4f318 KO |
280 | /* |
281 | * Wait for outstanding writeback IOs to finish (and keybuf slots to be | |
282 | * freed) before refilling again | |
283 | */ | |
5e6926da KO |
284 | closure_sync(&cl); |
285 | } | |
286 | ||
287 | /* Scan for dirty data */ | |
288 | ||
289 | void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, | |
290 | uint64_t offset, int nr_sectors) | |
291 | { | |
292 | struct bcache_device *d = c->devices[inode]; | |
48a915a8 | 293 | unsigned stripe_offset, stripe, sectors_dirty; |
5e6926da KO |
294 | |
295 | if (!d) | |
296 | return; | |
297 | ||
48a915a8 | 298 | stripe = offset_to_stripe(d, offset); |
5e6926da KO |
299 | stripe_offset = offset & (d->stripe_size - 1); |
300 | ||
301 | while (nr_sectors) { | |
302 | int s = min_t(unsigned, abs(nr_sectors), | |
303 | d->stripe_size - stripe_offset); | |
304 | ||
305 | if (nr_sectors < 0) | |
306 | s = -s; | |
307 | ||
48a915a8 KO |
308 | if (stripe >= d->nr_stripes) |
309 | return; | |
310 | ||
311 | sectors_dirty = atomic_add_return(s, | |
312 | d->stripe_sectors_dirty + stripe); | |
313 | if (sectors_dirty == d->stripe_size) | |
314 | set_bit(stripe, d->full_dirty_stripes); | |
315 | else | |
316 | clear_bit(stripe, d->full_dirty_stripes); | |
317 | ||
5e6926da KO |
318 | nr_sectors -= s; |
319 | stripe_offset = 0; | |
320 | stripe++; | |
321 | } | |
322 | } | |
323 | ||
324 | static bool dirty_pred(struct keybuf *buf, struct bkey *k) | |
325 | { | |
326 | return KEY_DIRTY(k); | |
327 | } | |
328 | ||
48a915a8 | 329 | static void refill_full_stripes(struct cached_dev *dc) |
5e6926da | 330 | { |
48a915a8 KO |
331 | struct keybuf *buf = &dc->writeback_keys; |
332 | unsigned start_stripe, stripe, next_stripe; | |
333 | bool wrapped = false; | |
334 | ||
335 | stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned)); | |
5e6926da | 336 | |
48a915a8 KO |
337 | if (stripe >= dc->disk.nr_stripes) |
338 | stripe = 0; | |
5e6926da | 339 | |
48a915a8 | 340 | start_stripe = stripe; |
5e6926da KO |
341 | |
342 | while (1) { | |
48a915a8 KO |
343 | stripe = find_next_bit(dc->disk.full_dirty_stripes, |
344 | dc->disk.nr_stripes, stripe); | |
5e6926da | 345 | |
48a915a8 KO |
346 | if (stripe == dc->disk.nr_stripes) |
347 | goto next; | |
5e6926da | 348 | |
48a915a8 KO |
349 | next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes, |
350 | dc->disk.nr_stripes, stripe); | |
351 | ||
352 | buf->last_scanned = KEY(dc->disk.id, | |
353 | stripe * dc->disk.stripe_size, 0); | |
354 | ||
355 | bch_refill_keybuf(dc->disk.c, buf, | |
356 | &KEY(dc->disk.id, | |
357 | next_stripe * dc->disk.stripe_size, 0), | |
358 | dirty_pred); | |
359 | ||
360 | if (array_freelist_empty(&buf->freelist)) | |
361 | return; | |
362 | ||
363 | stripe = next_stripe; | |
364 | next: | |
365 | if (wrapped && stripe > start_stripe) | |
366 | return; | |
367 | ||
368 | if (stripe == dc->disk.nr_stripes) { | |
369 | stripe = 0; | |
370 | wrapped = true; | |
371 | } | |
5e6926da KO |
372 | } |
373 | } | |
374 | ||
375 | static bool refill_dirty(struct cached_dev *dc) | |
376 | { | |
377 | struct keybuf *buf = &dc->writeback_keys; | |
5e6926da | 378 | struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0); |
48a915a8 KO |
379 | bool searched_from_start = false; |
380 | ||
381 | if (dc->partial_stripes_expensive) { | |
382 | refill_full_stripes(dc); | |
383 | if (array_freelist_empty(&buf->freelist)) | |
384 | return false; | |
385 | } | |
5e6926da KO |
386 | |
387 | if (bkey_cmp(&buf->last_scanned, &end) >= 0) { | |
388 | buf->last_scanned = KEY(dc->disk.id, 0, 0); | |
389 | searched_from_start = true; | |
390 | } | |
391 | ||
48a915a8 | 392 | bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); |
5e6926da KO |
393 | |
394 | return bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start; | |
395 | } | |
396 | ||
397 | static int bch_writeback_thread(void *arg) | |
398 | { | |
399 | struct cached_dev *dc = arg; | |
400 | bool searched_full_index; | |
401 | ||
402 | while (!kthread_should_stop()) { | |
403 | down_write(&dc->writeback_lock); | |
404 | if (!atomic_read(&dc->has_dirty) || | |
c4d951dd | 405 | (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && |
5e6926da KO |
406 | !dc->writeback_running)) { |
407 | up_write(&dc->writeback_lock); | |
408 | set_current_state(TASK_INTERRUPTIBLE); | |
409 | ||
410 | if (kthread_should_stop()) | |
411 | return 0; | |
412 | ||
413 | try_to_freeze(); | |
414 | schedule(); | |
415 | continue; | |
416 | } | |
417 | ||
418 | searched_full_index = refill_dirty(dc); | |
419 | ||
420 | if (searched_full_index && | |
421 | RB_EMPTY_ROOT(&dc->writeback_keys.keys)) { | |
422 | atomic_set(&dc->has_dirty, 0); | |
423 | cached_dev_put(dc); | |
424 | SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); | |
425 | bch_write_bdev_super(dc, NULL); | |
426 | } | |
427 | ||
428 | up_write(&dc->writeback_lock); | |
429 | ||
430 | bch_ratelimit_reset(&dc->writeback_rate); | |
431 | read_dirty(dc); | |
432 | ||
433 | if (searched_full_index) { | |
434 | unsigned delay = dc->writeback_delay * HZ; | |
435 | ||
436 | while (delay && | |
437 | !kthread_should_stop() && | |
c4d951dd | 438 | !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) |
9e5c3535 | 439 | delay = schedule_timeout_interruptible(delay); |
5e6926da KO |
440 | } |
441 | } | |
442 | ||
443 | return 0; | |
cafe5635 KO |
444 | } |
445 | ||
444fc0b6 KO |
446 | /* Init */ |
447 | ||
c18536a7 KO |
/*
 * Context for the btree walk done by bch_sectors_dirty_init();
 * sectors_dirty_init_fn() recovers it from the embedded btree_op with
 * container_of().
 */
struct sectors_dirty_init {
	/* Generic btree operation state handed to bch_btree_map_keys() */
	struct btree_op op;
	/* Inode (device id) whose dirty keys are being counted */
	unsigned inode;
};
452 | ||
453 | static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, | |
48dad8ba | 454 | struct bkey *k) |
444fc0b6 | 455 | { |
c18536a7 KO |
456 | struct sectors_dirty_init *op = container_of(_op, |
457 | struct sectors_dirty_init, op); | |
48dad8ba KO |
458 | if (KEY_INODE(k) > op->inode) |
459 | return MAP_DONE; | |
444fc0b6 | 460 | |
48dad8ba KO |
461 | if (KEY_DIRTY(k)) |
462 | bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), | |
463 | KEY_START(k), KEY_SIZE(k)); | |
464 | ||
465 | return MAP_CONTINUE; | |
444fc0b6 KO |
466 | } |
467 | ||
468 | void bch_sectors_dirty_init(struct cached_dev *dc) | |
469 | { | |
c18536a7 | 470 | struct sectors_dirty_init op; |
444fc0b6 | 471 | |
b54d6934 | 472 | bch_btree_op_init(&op.op, -1); |
48dad8ba KO |
473 | op.inode = dc->disk.id; |
474 | ||
c18536a7 | 475 | bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0), |
48dad8ba | 476 | sectors_dirty_init_fn, 0); |
16749c23 KO |
477 | |
478 | dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk); | |
444fc0b6 KO |
479 | } |
480 | ||
9e5c3535 | 481 | void bch_cached_dev_writeback_init(struct cached_dev *dc) |
cafe5635 | 482 | { |
c2a4f318 | 483 | sema_init(&dc->in_flight, 64); |
cafe5635 | 484 | init_rwsem(&dc->writeback_lock); |
72c27061 | 485 | bch_keybuf_init(&dc->writeback_keys); |
cafe5635 KO |
486 | |
487 | dc->writeback_metadata = true; | |
488 | dc->writeback_running = true; | |
489 | dc->writeback_percent = 10; | |
490 | dc->writeback_delay = 30; | |
491 | dc->writeback_rate.rate = 1024; | |
492 | ||
16749c23 KO |
493 | dc->writeback_rate_update_seconds = 5; |
494 | dc->writeback_rate_d_term = 30; | |
495 | dc->writeback_rate_p_term_inverse = 6000; | |
cafe5635 | 496 | |
9e5c3535 SP |
497 | INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); |
498 | } | |
499 | ||
500 | int bch_cached_dev_writeback_start(struct cached_dev *dc) | |
501 | { | |
5e6926da KO |
502 | dc->writeback_thread = kthread_create(bch_writeback_thread, dc, |
503 | "bcache_writeback"); | |
504 | if (IS_ERR(dc->writeback_thread)) | |
505 | return PTR_ERR(dc->writeback_thread); | |
506 | ||
cafe5635 KO |
507 | schedule_delayed_work(&dc->writeback_rate_update, |
508 | dc->writeback_rate_update_seconds * HZ); | |
cafe5635 | 509 | |
9e5c3535 SP |
510 | bch_writeback_queue(dc); |
511 | ||
cafe5635 KO |
512 | return 0; |
513 | } |