Commit | Line | Data |
---|---|---|
a1fec1db BH |
1 | /* |
2 | * Copyright (C) 2011 | |
3 | * Boaz Harrosh <bharrosh@panasas.com> | |
4 | * | |
5 | * This file is part of the objects raid engine (ore). | |
6 | * | |
7 | * It is free software; you can redistribute it and/or modify | |
8 | * it under the terms of the GNU General Public License version 2 as published | |
9 | * by the Free Software Foundation. | |
10 | * | |
11 | * You should have received a copy of the GNU General Public License | |
12 | * along with "ore". If not, write to the Free Software Foundation, Inc: | |
13 | * "Free Software Foundation <info@fsf.org>" | |
14 | */ | |
15 | ||
16 | #include <linux/gfp.h> | |
769ba8d9 | 17 | #include <linux/async_tx.h> |
a1fec1db BH |
18 | |
19 | #include "ore_raid.h" | |
20 | ||
769ba8d9 BH |
21 | #undef ORE_DBGMSG2 /* drop any earlier definition so the #define below does not clash */ |
22 | #define ORE_DBGMSG2 ORE_DBGMSG /* in this file ORE_DBGMSG2 expands to ORE_DBGMSG */ | |
23 | ||
a1fec1db BH |
24 | struct page *_raid_page_alloc(void) |
25 | { | |
26 | return alloc_page(GFP_KERNEL); | |
27 | } | |
28 | ||
/* Release a page previously obtained from _raid_page_alloc().
 * @p is handed straight to __free_page().
 */
void _raid_page_free(struct page *p)
{
	__free_page(p);
}
33 | ||
769ba8d9 BH |
34 | /* This struct is forward declared in ore_io_state, but is private to here. |
35 | * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit. | |
36 | * | |
37 | * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn. | |
38 | * Ascending page index access is sp2d(p-minor, c-major). But storage is | |
39 | * sp2d[p-minor][c-major], so it can be properly presented to the async-xor | |
40 | * API. | |
41 | */ | |
42 | struct __stripe_pages_2d { | |
43 | /* Cache some hot path repeated calculations (filled in by _sp2d_alloc) */ | |
44 | unsigned parity; /* number of parity devices, as passed to _sp2d_alloc */ | |
45 | unsigned data_devs; /* group_width - parity */ | |
46 | unsigned pages_in_unit; /* number of entries in _1p_stripes[] */ | |
47 | ||
48 | bool needed ; /* set by _ore_add_stripe_page; _sp2d_reset does nothing while false */ | |
49 | ||
50 | /* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */ | |
51 | struct __1_page_stripe { | |
52 | bool alloc; /* true when ->pages starts a separately kzalloc'ed buffer that _sp2d_free must kfree */ | |
53 | unsigned write_count; /* incremented per page added by _ore_add_stripe_page; zeroed by _sp2d_reset */ | |
54 | struct async_submit_ctl submit; /* initialised in _gen_xor_unit and passed to async_xor */ | |
55 | struct dma_async_tx_descriptor *tx; /* result of async_xor; NULLed by _sp2d_reset */ | |
56 | ||
57 | /* The size of this array is data_devs + parity */ | |
58 | struct page **pages; | |
59 | struct page **scribble; /* handed to init_async_submit as the addr_conv_t scribble area */ | |
60 | /* bool array, size of this array is data_devs. A true entry means | |
61 | char *page_is_read; /* pages[c] came from r4w->get_page and _sp2d_reset must r4w->put_page it */ | |
62 | } _1p_stripes[]; | |
63 | }; | |
64 | ||
65 | /* This can get bigger then a page. So support multiple page allocations | |
66 | * _sp2d_free should be called even if _sp2d_alloc fails (by returning | |
67 | * none-zero). | |
68 | */ | |
69 | static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width, | |
70 | unsigned parity, struct __stripe_pages_2d **psp2d) | |
71 | { | |
72 | struct __stripe_pages_2d *sp2d; | |
73 | unsigned data_devs = group_width - parity; | |
74 | struct _alloc_all_bytes { | |
75 | struct __alloc_stripe_pages_2d { | |
76 | struct __stripe_pages_2d sp2d; | |
77 | struct __1_page_stripe _1p_stripes[pages_in_unit]; | |
78 | } __asp2d; | |
79 | struct __alloc_1p_arrays { | |
80 | struct page *pages[group_width]; | |
81 | struct page *scribble[group_width]; | |
82 | char page_is_read[data_devs]; | |
83 | } __a1pa[pages_in_unit]; | |
84 | } *_aab; | |
85 | struct __alloc_1p_arrays *__a1pa; | |
86 | struct __alloc_1p_arrays *__a1pa_end; | |
87 | const unsigned sizeof__a1pa = sizeof(_aab->__a1pa[0]); | |
88 | unsigned num_a1pa, alloc_size, i; | |
89 | ||
90 | /* FIXME: check these numbers in ore_verify_layout */ | |
91 | BUG_ON(sizeof(_aab->__asp2d) > PAGE_SIZE); | |
92 | BUG_ON(sizeof__a1pa > PAGE_SIZE); | |
93 | ||
94 | if (sizeof(*_aab) > PAGE_SIZE) { | |
95 | num_a1pa = (PAGE_SIZE - sizeof(_aab->__asp2d)) / sizeof__a1pa; | |
96 | alloc_size = sizeof(_aab->__asp2d) + sizeof__a1pa * num_a1pa; | |
97 | } else { | |
98 | num_a1pa = pages_in_unit; | |
99 | alloc_size = sizeof(*_aab); | |
100 | } | |
101 | ||
102 | _aab = kzalloc(alloc_size, GFP_KERNEL); | |
103 | if (unlikely(!_aab)) { | |
104 | ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size); | |
105 | return -ENOMEM; | |
106 | } | |
107 | ||
108 | sp2d = &_aab->__asp2d.sp2d; | |
109 | *psp2d = sp2d; /* From here Just call _sp2d_free */ | |
110 | ||
111 | __a1pa = _aab->__a1pa; | |
112 | __a1pa_end = __a1pa + num_a1pa; | |
113 | ||
114 | for (i = 0; i < pages_in_unit; ++i) { | |
115 | if (unlikely(__a1pa >= __a1pa_end)) { | |
116 | num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa, | |
117 | pages_in_unit - i); | |
118 | ||
119 | __a1pa = kzalloc(num_a1pa * sizeof__a1pa, GFP_KERNEL); | |
120 | if (unlikely(!__a1pa)) { | |
121 | ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n", | |
122 | num_a1pa); | |
123 | return -ENOMEM; | |
124 | } | |
125 | __a1pa_end = __a1pa + num_a1pa; | |
126 | /* First *pages is marked for kfree of the buffer */ | |
127 | sp2d->_1p_stripes[i].alloc = true; | |
128 | } | |
129 | ||
130 | sp2d->_1p_stripes[i].pages = __a1pa->pages; | |
131 | sp2d->_1p_stripes[i].scribble = __a1pa->scribble ; | |
132 | sp2d->_1p_stripes[i].page_is_read = __a1pa->page_is_read; | |
133 | ++__a1pa; | |
134 | } | |
135 | ||
136 | sp2d->parity = parity; | |
137 | sp2d->data_devs = data_devs; | |
138 | sp2d->pages_in_unit = pages_in_unit; | |
139 | return 0; | |
140 | } | |
141 | ||
142 | static void _sp2d_reset(struct __stripe_pages_2d *sp2d, | |
143 | const struct _ore_r4w_op *r4w, void *priv) | |
144 | { | |
145 | unsigned data_devs = sp2d->data_devs; | |
146 | unsigned group_width = data_devs + sp2d->parity; | |
147 | unsigned p; | |
148 | ||
149 | if (!sp2d->needed) | |
150 | return; | |
151 | ||
152 | for (p = 0; p < sp2d->pages_in_unit; p++) { | |
153 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | |
154 | ||
155 | if (_1ps->write_count < group_width) { | |
156 | unsigned c; | |
157 | ||
158 | for (c = 0; c < data_devs; c++) | |
159 | if (_1ps->page_is_read[c]) { | |
160 | struct page *page = _1ps->pages[c]; | |
161 | ||
162 | r4w->put_page(priv, page); | |
163 | _1ps->page_is_read[c] = false; | |
164 | } | |
165 | } | |
166 | ||
167 | memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages)); | |
168 | _1ps->write_count = 0; | |
169 | _1ps->tx = NULL; | |
170 | } | |
171 | ||
172 | sp2d->needed = false; | |
173 | } | |
174 | ||
175 | static void _sp2d_free(struct __stripe_pages_2d *sp2d) | |
176 | { | |
177 | unsigned i; | |
178 | ||
179 | if (!sp2d) | |
180 | return; | |
181 | ||
182 | for (i = 0; i < sp2d->pages_in_unit; ++i) { | |
183 | if (sp2d->_1p_stripes[i].alloc) | |
184 | kfree(sp2d->_1p_stripes[i].pages); | |
185 | } | |
186 | ||
187 | kfree(sp2d); | |
188 | } | |
189 | ||
190 | static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d) | |
191 | { | |
192 | unsigned p; | |
193 | ||
194 | for (p = 0; p < sp2d->pages_in_unit; p++) { | |
195 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | |
196 | ||
197 | if (_1ps->write_count) | |
198 | return p; | |
199 | } | |
200 | ||
201 | return ~0; | |
202 | } | |
203 | ||
204 | static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d) | |
205 | { | |
206 | unsigned p; | |
207 | ||
208 | for (p = sp2d->pages_in_unit - 1; p >= 0; --p) { | |
209 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | |
210 | ||
211 | if (_1ps->write_count) | |
212 | return p; | |
213 | } | |
214 | ||
215 | return ~0; | |
216 | } | |
217 | ||
218 | static void _gen_xor_unit(struct __stripe_pages_2d *sp2d) | |
219 | { | |
220 | unsigned p; | |
221 | for (p = 0; p < sp2d->pages_in_unit; p++) { | |
222 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | |
223 | ||
224 | if (!_1ps->write_count) | |
225 | continue; | |
226 | ||
227 | init_async_submit(&_1ps->submit, | |
228 | ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK, | |
229 | NULL, | |
230 | NULL, NULL, | |
231 | (addr_conv_t *)_1ps->scribble); | |
232 | ||
233 | /* TODO: raid6 */ | |
234 | _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages, | |
235 | 0, sp2d->data_devs, PAGE_SIZE, | |
236 | &_1ps->submit); | |
237 | } | |
238 | ||
239 | for (p = 0; p < sp2d->pages_in_unit; p++) { | |
240 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | |
241 | /* NOTE: We wait for HW synchronously (I don't have such HW | |
242 | * to test with.) Is parallelism needed with today's multi | |
243 | * cores? | |
244 | */ | |
245 | async_tx_issue_pending(_1ps->tx); | |
246 | } | |
247 | } | |
248 | ||
249 | void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, | |
250 | struct ore_striping_info *si, struct page *page) | |
251 | { | |
252 | struct __1_page_stripe *_1ps; | |
253 | ||
254 | sp2d->needed = true; | |
255 | ||
256 | _1ps = &sp2d->_1p_stripes[si->cur_pg]; | |
257 | _1ps->pages[si->cur_comp] = page; | |
258 | ++_1ps->write_count; | |
259 | ||
260 | si->cur_pg = (si->cur_pg + 1) % sp2d->pages_in_unit; | |
261 | /* si->cur_comp is advanced outside at main loop */ | |
262 | } | |
263 | ||
a1fec1db BH |
264 | void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, |
265 | bool not_last) | |
266 | { | |
267 | struct osd_sg_entry *sge; | |
268 | ||
269 | ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d " | |
270 | "offset=0x%llx length=0x%x last_sgs_total=0x%x\n", | |
271 | per_dev->dev, cur_len, not_last, per_dev->cur_sg, | |
272 | _LLU(per_dev->offset), per_dev->length, | |
273 | per_dev->last_sgs_total); | |
274 | ||
275 | if (!per_dev->cur_sg) { | |
276 | sge = per_dev->sglist; | |
277 | ||
278 | /* First time we prepare two entries */ | |
279 | if (per_dev->length) { | |
280 | ++per_dev->cur_sg; | |
281 | sge->offset = per_dev->offset; | |
282 | sge->len = per_dev->length; | |
283 | } else { | |
284 | /* Here the parity is the first unit of this object. | |
285 | * This happens every time we reach a parity device on | |
286 | * the same stripe as the per_dev->offset. We need to | |
287 | * just skip this unit. | |
288 | */ | |
289 | per_dev->offset += cur_len; | |
290 | return; | |
291 | } | |
292 | } else { | |
293 | /* finalize the last one */ | |
294 | sge = &per_dev->sglist[per_dev->cur_sg - 1]; | |
295 | sge->len = per_dev->length - per_dev->last_sgs_total; | |
296 | } | |
297 | ||
298 | if (not_last) { | |
299 | /* Partly prepare the next one */ | |
300 | struct osd_sg_entry *next_sge = sge + 1; | |
301 | ||
302 | ++per_dev->cur_sg; | |
303 | next_sge->offset = sge->offset + sge->len + cur_len; | |
304 | /* Save cur len so we know how mutch was added next time */ | |
305 | per_dev->last_sgs_total = per_dev->length; | |
306 | next_sge->len = 0; | |
307 | } else if (!sge->len) { | |
308 | /* Optimize for when the last unit is a parity */ | |
309 | --per_dev->cur_sg; | |
310 | } | |
311 | } | |
312 | ||
769ba8d9 BH |
313 | static int _alloc_read_4_write(struct ore_io_state *ios) |
314 | { | |
315 | struct ore_layout *layout = ios->layout; | |
316 | int ret; | |
317 | /* We want to only read those pages not in cache so worst case | |
318 | * is a stripe populated with every other page | |
319 | */ | |
320 | unsigned sgs_per_dev = ios->sp2d->pages_in_unit + 2; | |
321 | ||
322 | ret = _ore_get_io_state(layout, ios->oc, | |
323 | layout->group_width * layout->mirrors_p1, | |
324 | sgs_per_dev, 0, &ios->ios_read_4_write); | |
325 | return ret; | |
326 | } | |
327 | ||
328 | /* @si contains info of the to-be-inserted page. Update of @si should be | |
329 | * maintained by caller. Specificaly si->dev, si->obj_offset, ... | |
330 | */ | |
331 | static int _add_to_read_4_write(struct ore_io_state *ios, | |
332 | struct ore_striping_info *si, struct page *page) | |
333 | { | |
334 | struct request_queue *q; | |
335 | struct ore_per_dev_state *per_dev; | |
336 | struct ore_io_state *read_ios; | |
337 | unsigned first_dev = si->dev - (si->dev % | |
338 | (ios->layout->group_width * ios->layout->mirrors_p1)); | |
339 | unsigned comp = si->dev - first_dev; | |
340 | unsigned added_len; | |
341 | ||
342 | if (!ios->ios_read_4_write) { | |
343 | int ret = _alloc_read_4_write(ios); | |
344 | ||
345 | if (unlikely(ret)) | |
346 | return ret; | |
347 | } | |
348 | ||
349 | read_ios = ios->ios_read_4_write; | |
350 | read_ios->numdevs = ios->layout->group_width * ios->layout->mirrors_p1; | |
351 | ||
352 | per_dev = &read_ios->per_dev[comp]; | |
353 | if (!per_dev->length) { | |
354 | per_dev->bio = bio_kmalloc(GFP_KERNEL, | |
355 | ios->sp2d->pages_in_unit); | |
356 | if (unlikely(!per_dev->bio)) { | |
357 | ORE_DBGMSG("Failed to allocate BIO size=%u\n", | |
358 | ios->sp2d->pages_in_unit); | |
359 | return -ENOMEM; | |
360 | } | |
361 | per_dev->offset = si->obj_offset; | |
362 | per_dev->dev = si->dev; | |
363 | } else if (si->obj_offset != (per_dev->offset + per_dev->length)) { | |
364 | u64 gap = si->obj_offset - (per_dev->offset + per_dev->length); | |
365 | ||
366 | _ore_add_sg_seg(per_dev, gap, true); | |
367 | } | |
368 | q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev)); | |
369 | added_len = bio_add_pc_page(q, per_dev->bio, page, PAGE_SIZE, 0); | |
370 | if (unlikely(added_len != PAGE_SIZE)) { | |
371 | ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n", | |
372 | per_dev->bio->bi_vcnt); | |
373 | return -ENOMEM; | |
374 | } | |
375 | ||
376 | per_dev->length += PAGE_SIZE; | |
377 | return 0; | |
378 | } | |
379 | ||
380 | static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) | |
381 | { | |
382 | struct bio_vec *bv; | |
383 | unsigned i, d; | |
384 | ||
385 | /* loop on all devices all pages */ | |
386 | for (d = 0; d < ios->numdevs; d++) { | |
387 | struct bio *bio = ios->per_dev[d].bio; | |
388 | ||
389 | if (!bio) | |
390 | continue; | |
391 | ||
392 | __bio_for_each_segment(bv, bio, i, 0) { | |
393 | struct page *page = bv->bv_page; | |
394 | ||
395 | SetPageUptodate(page); | |
396 | if (PageError(page)) | |
397 | ClearPageError(page); | |
398 | } | |
399 | } | |
400 | } | |
401 | ||
402 | /* read_4_write is hacked to read the start of the first stripe and/or | |
403 | * the end of the last stripe. If needed, with an sg-gap at each device/page. | |
404 | * It is assumed to be called after the to_be_written pages of the first stripe | |
405 | * are populating ios->sp2d[][] | |
406 | * | |
407 | * NOTE: We call ios->r4w->lock_fn for all pages needed for parity calculations | |
408 | * These pages are held at sp2d[p].pages[c] but with | |
409 | * sp2d[p].page_is_read[c] = true. At _sp2d_reset these pages are | |
410 | * ios->r4w->lock_fn(). The ios->r4w->lock_fn might signal that the page is | |
411 | * @uptodate=true, so we don't need to read it, only unlock, after IO. | |
412 | * | |
413 | * TODO: The read_4_write should calc a need_to_read_pages_count, if bigger then | |
414 | * to-be-written count, we should consider the xor-in-place mode. | |
415 | * need_to_read_pages_count is the actual number of pages not present in cache. | |
416 | * maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough | |
417 | * approximation? In this mode the read pages are put in the empty places of | |
418 | * ios->sp2d[p][*], xor is calculated the same way. These pages are | |
419 | * allocated/freed and don't go through cache | |
420 | */ | |
421 | static int _read_4_write(struct ore_io_state *ios) | |
422 | { | |
423 | struct ore_io_state *ios_read; | |
424 | struct ore_striping_info read_si; | |
425 | struct __stripe_pages_2d *sp2d = ios->sp2d; | |
426 | u64 offset = ios->si.first_stripe_start; | |
427 | u64 last_stripe_end; | |
428 | unsigned bytes_in_stripe = ios->si.bytes_in_stripe; | |
429 | unsigned i, c, p, min_p = sp2d->pages_in_unit, max_p = -1; | |
430 | int ret; | |
431 | ||
432 | if (offset == ios->offset) /* Go to start collect $200 */ | |
433 | goto read_last_stripe; | |
434 | ||
435 | min_p = _sp2d_min_pg(sp2d); | |
436 | max_p = _sp2d_max_pg(sp2d); | |
437 | ||
438 | for (c = 0; ; c++) { | |
439 | ore_calc_stripe_info(ios->layout, offset, 0, &read_si); | |
440 | read_si.obj_offset += min_p * PAGE_SIZE; | |
441 | offset += min_p * PAGE_SIZE; | |
442 | for (p = min_p; p <= max_p; p++) { | |
443 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | |
444 | struct page **pp = &_1ps->pages[c]; | |
445 | bool uptodate; | |
446 | ||
447 | if (*pp) | |
448 | /* to-be-written pages start here */ | |
449 | goto read_last_stripe; | |
450 | ||
451 | *pp = ios->r4w->get_page(ios->private, offset, | |
452 | &uptodate); | |
453 | if (unlikely(!*pp)) | |
454 | return -ENOMEM; | |
455 | ||
456 | if (!uptodate) | |
457 | _add_to_read_4_write(ios, &read_si, *pp); | |
458 | ||
459 | /* Mark read-pages to be cache_released */ | |
460 | _1ps->page_is_read[c] = true; | |
461 | read_si.obj_offset += PAGE_SIZE; | |
462 | offset += PAGE_SIZE; | |
463 | } | |
464 | offset += (sp2d->pages_in_unit - p) * PAGE_SIZE; | |
465 | } | |
466 | ||
467 | read_last_stripe: | |
468 | offset = ios->offset + (ios->length + PAGE_SIZE - 1) / | |
469 | PAGE_SIZE * PAGE_SIZE; | |
470 | last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe) | |
471 | * bytes_in_stripe; | |
472 | if (offset == last_stripe_end) /* Optimize for the aligned case */ | |
473 | goto read_it; | |
474 | ||
475 | ore_calc_stripe_info(ios->layout, offset, 0, &read_si); | |
476 | p = read_si.unit_off / PAGE_SIZE; | |
477 | c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, | |
478 | ios->layout->mirrors_p1, read_si.par_dev, read_si.dev); | |
479 | ||
480 | BUG_ON(ios->si.first_stripe_start + bytes_in_stripe != last_stripe_end); | |
481 | /* unaligned IO must be within a single stripe */ | |
482 | ||
483 | if (min_p == sp2d->pages_in_unit) { | |
484 | /* Didn't do it yet */ | |
485 | min_p = _sp2d_min_pg(sp2d); | |
486 | max_p = _sp2d_max_pg(sp2d); | |
487 | } | |
488 | ||
489 | while (offset < last_stripe_end) { | |
490 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | |
491 | ||
492 | if ((min_p <= p) && (p <= max_p)) { | |
493 | struct page *page; | |
494 | bool uptodate; | |
495 | ||
496 | BUG_ON(_1ps->pages[c]); | |
497 | page = ios->r4w->get_page(ios->private, offset, | |
498 | &uptodate); | |
499 | if (unlikely(!page)) | |
500 | return -ENOMEM; | |
501 | ||
502 | _1ps->pages[c] = page; | |
503 | /* Mark read-pages to be cache_released */ | |
504 | _1ps->page_is_read[c] = true; | |
505 | if (!uptodate) | |
506 | _add_to_read_4_write(ios, &read_si, page); | |
507 | } | |
508 | ||
509 | offset += PAGE_SIZE; | |
510 | if (p == (sp2d->pages_in_unit - 1)) { | |
511 | ++c; | |
512 | p = 0; | |
513 | ore_calc_stripe_info(ios->layout, offset, 0, &read_si); | |
514 | } else { | |
515 | read_si.obj_offset += PAGE_SIZE; | |
516 | ++p; | |
517 | } | |
518 | } | |
519 | ||
520 | read_it: | |
521 | ios_read = ios->ios_read_4_write; | |
522 | if (!ios_read) | |
523 | return 0; | |
524 | ||
525 | /* FIXME: Ugly to signal _sbi_read_mirror that we have bio(s). Change | |
526 | * to check for per_dev->bio | |
527 | */ | |
528 | ios_read->pages = ios->pages; | |
529 | ||
530 | /* Now read these devices */ | |
531 | for (i = 0; i < ios_read->numdevs; i += ios_read->layout->mirrors_p1) { | |
532 | ret = _ore_read_mirror(ios_read, i); | |
533 | if (unlikely(ret)) | |
534 | return ret; | |
535 | } | |
536 | ||
537 | ret = ore_io_execute(ios_read); /* Synchronus execution */ | |
538 | if (unlikely(ret)) { | |
539 | ORE_DBGMSG("!! ore_io_execute => %d\n", ret); | |
540 | return ret; | |
541 | } | |
542 | ||
543 | _mark_read4write_pages_uptodate(ios_read, ret); | |
544 | return 0; | |
545 | } | |
546 | ||
a1fec1db BH |
547 | /* In writes @cur_len means length left. .i.e cur_len==0 is the last parity U */ |
548 | int _ore_add_parity_unit(struct ore_io_state *ios, | |
549 | struct ore_striping_info *si, | |
550 | struct ore_per_dev_state *per_dev, | |
551 | unsigned cur_len) | |
552 | { | |
553 | if (ios->reading) { | |
554 | BUG_ON(per_dev->cur_sg >= ios->sgs_per_dev); | |
555 | _ore_add_sg_seg(per_dev, cur_len, true); | |
556 | } else { | |
769ba8d9 | 557 | struct __stripe_pages_2d *sp2d = ios->sp2d; |
a1fec1db | 558 | struct page **pages = ios->parity_pages + ios->cur_par_page; |
769ba8d9 | 559 | unsigned num_pages; |
a1fec1db BH |
560 | unsigned array_start = 0; |
561 | unsigned i; | |
562 | int ret; | |
563 | ||
769ba8d9 BH |
564 | si->cur_pg = _sp2d_min_pg(sp2d); |
565 | num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg; | |
566 | ||
567 | if (!cur_len) /* If last stripe operate on parity comp */ | |
568 | si->cur_comp = sp2d->data_devs; | |
569 | ||
570 | if (!per_dev->length) { | |
571 | per_dev->offset += si->cur_pg * PAGE_SIZE; | |
572 | /* If first stripe, Read in all read4write pages | |
573 | * (if needed) before we calculate the first parity. | |
574 | */ | |
575 | _read_4_write(ios); | |
576 | } | |
577 | ||
a1fec1db BH |
578 | for (i = 0; i < num_pages; i++) { |
579 | pages[i] = _raid_page_alloc(); | |
580 | if (unlikely(!pages[i])) | |
581 | return -ENOMEM; | |
582 | ||
583 | ++(ios->cur_par_page); | |
a1fec1db BH |
584 | } |
585 | ||
769ba8d9 BH |
586 | BUG_ON(si->cur_comp != sp2d->data_devs); |
587 | BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit); | |
a1fec1db BH |
588 | |
589 | ret = _ore_add_stripe_unit(ios, &array_start, 0, pages, | |
590 | per_dev, num_pages * PAGE_SIZE); | |
591 | if (unlikely(ret)) | |
592 | return ret; | |
769ba8d9 BH |
593 | |
594 | /* TODO: raid6 if (last_parity_dev) */ | |
595 | _gen_xor_unit(sp2d); | |
596 | _sp2d_reset(sp2d, ios->r4w, ios->private); | |
a1fec1db BH |
597 | } |
598 | return 0; | |
599 | } | |
600 | ||
601 | int _ore_post_alloc_raid_stuff(struct ore_io_state *ios) | |
602 | { | |
769ba8d9 BH |
603 | struct ore_layout *layout = ios->layout; |
604 | ||
605 | if (ios->parity_pages) { | |
606 | unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; | |
607 | unsigned stripe_size = ios->si.bytes_in_stripe; | |
608 | u64 last_stripe, first_stripe; | |
609 | ||
610 | if (_sp2d_alloc(pages_in_unit, layout->group_width, | |
611 | layout->parity, &ios->sp2d)) { | |
612 | return -ENOMEM; | |
613 | } | |
614 | ||
615 | BUG_ON(ios->offset % PAGE_SIZE); | |
616 | ||
617 | /* Round io down to last full strip */ | |
618 | first_stripe = div_u64(ios->offset, stripe_size); | |
619 | last_stripe = div_u64(ios->offset + ios->length, stripe_size); | |
620 | ||
621 | /* If an IO spans more then a single stripe it must end at | |
622 | * a stripe boundary. The reminder at the end is pushed into the | |
623 | * next IO. | |
624 | */ | |
625 | if (last_stripe != first_stripe) { | |
626 | ios->length = last_stripe * stripe_size - ios->offset; | |
627 | ||
628 | BUG_ON(!ios->length); | |
629 | ios->nr_pages = (ios->length + PAGE_SIZE - 1) / | |
630 | PAGE_SIZE; | |
631 | ios->si.length = ios->length; /*make it consistent */ | |
632 | } | |
633 | } | |
a1fec1db BH |
634 | return 0; |
635 | } | |
636 | ||
637 | void _ore_free_raid_stuff(struct ore_io_state *ios) | |
638 | { | |
769ba8d9 | 639 | if (ios->sp2d) { /* writing and raid */ |
a1fec1db BH |
640 | unsigned i; |
641 | ||
642 | for (i = 0; i < ios->cur_par_page; i++) { | |
643 | struct page *page = ios->parity_pages[i]; | |
644 | ||
645 | if (page) | |
646 | _raid_page_free(page); | |
647 | } | |
648 | if (ios->extra_part_alloc) | |
649 | kfree(ios->parity_pages); | |
769ba8d9 BH |
650 | /* If IO returned an error pages might need unlocking */ |
651 | _sp2d_reset(ios->sp2d, ios->r4w, ios->private); | |
652 | _sp2d_free(ios->sp2d); | |
a1fec1db BH |
653 | } else { |
654 | /* Will only be set if raid reading && sglist is big */ | |
655 | if (ios->extra_part_alloc) | |
656 | kfree(ios->per_dev[0].sglist); | |
657 | } | |
769ba8d9 BH |
658 | if (ios->ios_read_4_write) |
659 | ore_put_io_state(ios->ios_read_4_write); | |
a1fec1db | 660 | } |