Commit | Line | Data |
---|---|---|
a1fec1db BH |
1 | /* |
2 | * Copyright (C) 2011 | |
3 | * Boaz Harrosh <bharrosh@panasas.com> | |
4 | * | |
5 | * This file is part of the objects raid engine (ore). | |
6 | * | |
7 | * It is free software; you can redistribute it and/or modify | |
8 | * it under the terms of the GNU General Public License version 2 as published | |
9 | * by the Free Software Foundation. | |
10 | * | |
11 | * You should have received a copy of the GNU General Public License | |
12 | * along with "ore". If not, write to the Free Software Foundation, Inc: | |
13 | * "Free Software Foundation <info@fsf.org>" | |
14 | */ | |
15 | ||
16 | #include <linux/gfp.h> | |
769ba8d9 | 17 | #include <linux/async_tx.h> |
a1fec1db BH |
18 | |
19 | #include "ore_raid.h" | |
20 | ||
769ba8d9 BH |
21 | #undef ORE_DBGMSG2 |
22 | #define ORE_DBGMSG2 ORE_DBGMSG | |
23 | ||
a1fec1db BH |
24 | struct page *_raid_page_alloc(void) |
25 | { | |
26 | return alloc_page(GFP_KERNEL); | |
27 | } | |
28 | ||
/* Counterpart of _raid_page_alloc: hand page @p back via __free_page(). */
void _raid_page_free(struct page *p)
{
	struct page *victim = p;

	__free_page(victim);
}
33 | ||
769ba8d9 BH |
34 | /* This struct is forward declare in ore_io_state, but is private to here. |
35 | * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit. | |
36 | * | |
37 | * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn. | |
38 | * Ascending page index access is sp2d(p-minor, c-major). But storage is | |
39 | * sp2d[p-minor][c-major], so it can be properlly presented to the async-xor | |
40 | * API. | |
41 | */ | |
42 | struct __stripe_pages_2d { | |
43 | /* Cache some hot path repeated calculations */ | |
44 | unsigned parity; | |
45 | unsigned data_devs; | |
46 | unsigned pages_in_unit; | |
47 | ||
48 | bool needed ; | |
49 | ||
50 | /* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */ | |
51 | struct __1_page_stripe { | |
52 | bool alloc; | |
53 | unsigned write_count; | |
54 | struct async_submit_ctl submit; | |
55 | struct dma_async_tx_descriptor *tx; | |
56 | ||
57 | /* The size of this array is data_devs + parity */ | |
58 | struct page **pages; | |
59 | struct page **scribble; | |
60 | /* bool array, size of this array is data_devs */ | |
61 | char *page_is_read; | |
62 | } _1p_stripes[]; | |
63 | }; | |
64 | ||
65 | /* This can get bigger then a page. So support multiple page allocations | |
66 | * _sp2d_free should be called even if _sp2d_alloc fails (by returning | |
67 | * none-zero). | |
68 | */ | |
69 | static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width, | |
70 | unsigned parity, struct __stripe_pages_2d **psp2d) | |
71 | { | |
72 | struct __stripe_pages_2d *sp2d; | |
73 | unsigned data_devs = group_width - parity; | |
74 | struct _alloc_all_bytes { | |
75 | struct __alloc_stripe_pages_2d { | |
76 | struct __stripe_pages_2d sp2d; | |
77 | struct __1_page_stripe _1p_stripes[pages_in_unit]; | |
78 | } __asp2d; | |
79 | struct __alloc_1p_arrays { | |
80 | struct page *pages[group_width]; | |
81 | struct page *scribble[group_width]; | |
82 | char page_is_read[data_devs]; | |
83 | } __a1pa[pages_in_unit]; | |
84 | } *_aab; | |
85 | struct __alloc_1p_arrays *__a1pa; | |
86 | struct __alloc_1p_arrays *__a1pa_end; | |
87 | const unsigned sizeof__a1pa = sizeof(_aab->__a1pa[0]); | |
88 | unsigned num_a1pa, alloc_size, i; | |
89 | ||
90 | /* FIXME: check these numbers in ore_verify_layout */ | |
91 | BUG_ON(sizeof(_aab->__asp2d) > PAGE_SIZE); | |
92 | BUG_ON(sizeof__a1pa > PAGE_SIZE); | |
93 | ||
94 | if (sizeof(*_aab) > PAGE_SIZE) { | |
95 | num_a1pa = (PAGE_SIZE - sizeof(_aab->__asp2d)) / sizeof__a1pa; | |
96 | alloc_size = sizeof(_aab->__asp2d) + sizeof__a1pa * num_a1pa; | |
97 | } else { | |
98 | num_a1pa = pages_in_unit; | |
99 | alloc_size = sizeof(*_aab); | |
100 | } | |
101 | ||
102 | _aab = kzalloc(alloc_size, GFP_KERNEL); | |
103 | if (unlikely(!_aab)) { | |
104 | ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size); | |
105 | return -ENOMEM; | |
106 | } | |
107 | ||
108 | sp2d = &_aab->__asp2d.sp2d; | |
109 | *psp2d = sp2d; /* From here Just call _sp2d_free */ | |
110 | ||
111 | __a1pa = _aab->__a1pa; | |
112 | __a1pa_end = __a1pa + num_a1pa; | |
113 | ||
114 | for (i = 0; i < pages_in_unit; ++i) { | |
115 | if (unlikely(__a1pa >= __a1pa_end)) { | |
116 | num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa, | |
117 | pages_in_unit - i); | |
118 | ||
119 | __a1pa = kzalloc(num_a1pa * sizeof__a1pa, GFP_KERNEL); | |
120 | if (unlikely(!__a1pa)) { | |
121 | ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n", | |
122 | num_a1pa); | |
123 | return -ENOMEM; | |
124 | } | |
125 | __a1pa_end = __a1pa + num_a1pa; | |
126 | /* First *pages is marked for kfree of the buffer */ | |
127 | sp2d->_1p_stripes[i].alloc = true; | |
128 | } | |
129 | ||
130 | sp2d->_1p_stripes[i].pages = __a1pa->pages; | |
131 | sp2d->_1p_stripes[i].scribble = __a1pa->scribble ; | |
132 | sp2d->_1p_stripes[i].page_is_read = __a1pa->page_is_read; | |
133 | ++__a1pa; | |
134 | } | |
135 | ||
136 | sp2d->parity = parity; | |
137 | sp2d->data_devs = data_devs; | |
138 | sp2d->pages_in_unit = pages_in_unit; | |
139 | return 0; | |
140 | } | |
141 | ||
142 | static void _sp2d_reset(struct __stripe_pages_2d *sp2d, | |
143 | const struct _ore_r4w_op *r4w, void *priv) | |
144 | { | |
145 | unsigned data_devs = sp2d->data_devs; | |
146 | unsigned group_width = data_devs + sp2d->parity; | |
537632e0 | 147 | int p, c; |
769ba8d9 BH |
148 | |
149 | if (!sp2d->needed) | |
150 | return; | |
151 | ||
537632e0 BH |
152 | for (c = data_devs - 1; c >= 0; --c) |
153 | for (p = sp2d->pages_in_unit - 1; p >= 0; --p) { | |
154 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | |
769ba8d9 | 155 | |
537632e0 BH |
156 | if (_1ps->page_is_read[c]) { |
157 | struct page *page = _1ps->pages[c]; | |
769ba8d9 | 158 | |
537632e0 BH |
159 | r4w->put_page(priv, page); |
160 | _1ps->page_is_read[c] = false; | |
161 | } | |
769ba8d9 BH |
162 | } |
163 | ||
537632e0 BH |
164 | for (p = 0; p < sp2d->pages_in_unit; p++) { |
165 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | |
166 | ||
769ba8d9 BH |
167 | memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages)); |
168 | _1ps->write_count = 0; | |
169 | _1ps->tx = NULL; | |
170 | } | |
171 | ||
172 | sp2d->needed = false; | |
173 | } | |
174 | ||
175 | static void _sp2d_free(struct __stripe_pages_2d *sp2d) | |
176 | { | |
177 | unsigned i; | |
178 | ||
179 | if (!sp2d) | |
180 | return; | |
181 | ||
182 | for (i = 0; i < sp2d->pages_in_unit; ++i) { | |
183 | if (sp2d->_1p_stripes[i].alloc) | |
184 | kfree(sp2d->_1p_stripes[i].pages); | |
185 | } | |
186 | ||
187 | kfree(sp2d); | |
188 | } | |
189 | ||
190 | static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d) | |
191 | { | |
192 | unsigned p; | |
193 | ||
194 | for (p = 0; p < sp2d->pages_in_unit; p++) { | |
195 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | |
196 | ||
197 | if (_1ps->write_count) | |
198 | return p; | |
199 | } | |
200 | ||
201 | return ~0; | |
202 | } | |
203 | ||
204 | static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d) | |
205 | { | |
74b217d0 | 206 | int p; |
769ba8d9 BH |
207 | |
208 | for (p = sp2d->pages_in_unit - 1; p >= 0; --p) { | |
209 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | |
210 | ||
211 | if (_1ps->write_count) | |
212 | return p; | |
213 | } | |
214 | ||
215 | return ~0; | |
216 | } | |
217 | ||
218 | static void _gen_xor_unit(struct __stripe_pages_2d *sp2d) | |
219 | { | |
220 | unsigned p; | |
221 | for (p = 0; p < sp2d->pages_in_unit; p++) { | |
222 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | |
223 | ||
224 | if (!_1ps->write_count) | |
225 | continue; | |
226 | ||
227 | init_async_submit(&_1ps->submit, | |
228 | ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK, | |
229 | NULL, | |
230 | NULL, NULL, | |
231 | (addr_conv_t *)_1ps->scribble); | |
232 | ||
233 | /* TODO: raid6 */ | |
234 | _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages, | |
235 | 0, sp2d->data_devs, PAGE_SIZE, | |
236 | &_1ps->submit); | |
237 | } | |
238 | ||
239 | for (p = 0; p < sp2d->pages_in_unit; p++) { | |
240 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | |
241 | /* NOTE: We wait for HW synchronously (I don't have such HW | |
242 | * to test with.) Is parallelism needed with today's multi | |
243 | * cores? | |
244 | */ | |
245 | async_tx_issue_pending(_1ps->tx); | |
246 | } | |
247 | } | |
248 | ||
249 | void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, | |
250 | struct ore_striping_info *si, struct page *page) | |
251 | { | |
252 | struct __1_page_stripe *_1ps; | |
253 | ||
254 | sp2d->needed = true; | |
255 | ||
256 | _1ps = &sp2d->_1p_stripes[si->cur_pg]; | |
257 | _1ps->pages[si->cur_comp] = page; | |
258 | ++_1ps->write_count; | |
259 | ||
260 | si->cur_pg = (si->cur_pg + 1) % sp2d->pages_in_unit; | |
261 | /* si->cur_comp is advanced outside at main loop */ | |
262 | } | |
263 | ||
a1fec1db BH |
264 | void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, |
265 | bool not_last) | |
266 | { | |
267 | struct osd_sg_entry *sge; | |
268 | ||
269 | ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d " | |
270 | "offset=0x%llx length=0x%x last_sgs_total=0x%x\n", | |
271 | per_dev->dev, cur_len, not_last, per_dev->cur_sg, | |
272 | _LLU(per_dev->offset), per_dev->length, | |
273 | per_dev->last_sgs_total); | |
274 | ||
275 | if (!per_dev->cur_sg) { | |
276 | sge = per_dev->sglist; | |
277 | ||
278 | /* First time we prepare two entries */ | |
279 | if (per_dev->length) { | |
280 | ++per_dev->cur_sg; | |
281 | sge->offset = per_dev->offset; | |
282 | sge->len = per_dev->length; | |
283 | } else { | |
284 | /* Here the parity is the first unit of this object. | |
285 | * This happens every time we reach a parity device on | |
286 | * the same stripe as the per_dev->offset. We need to | |
287 | * just skip this unit. | |
288 | */ | |
289 | per_dev->offset += cur_len; | |
290 | return; | |
291 | } | |
292 | } else { | |
293 | /* finalize the last one */ | |
294 | sge = &per_dev->sglist[per_dev->cur_sg - 1]; | |
295 | sge->len = per_dev->length - per_dev->last_sgs_total; | |
296 | } | |
297 | ||
298 | if (not_last) { | |
299 | /* Partly prepare the next one */ | |
300 | struct osd_sg_entry *next_sge = sge + 1; | |
301 | ||
302 | ++per_dev->cur_sg; | |
303 | next_sge->offset = sge->offset + sge->len + cur_len; | |
304 | /* Save cur len so we know how mutch was added next time */ | |
305 | per_dev->last_sgs_total = per_dev->length; | |
306 | next_sge->len = 0; | |
307 | } else if (!sge->len) { | |
308 | /* Optimize for when the last unit is a parity */ | |
309 | --per_dev->cur_sg; | |
310 | } | |
311 | } | |
312 | ||
769ba8d9 BH |
313 | static int _alloc_read_4_write(struct ore_io_state *ios) |
314 | { | |
315 | struct ore_layout *layout = ios->layout; | |
316 | int ret; | |
317 | /* We want to only read those pages not in cache so worst case | |
318 | * is a stripe populated with every other page | |
319 | */ | |
320 | unsigned sgs_per_dev = ios->sp2d->pages_in_unit + 2; | |
321 | ||
322 | ret = _ore_get_io_state(layout, ios->oc, | |
323 | layout->group_width * layout->mirrors_p1, | |
324 | sgs_per_dev, 0, &ios->ios_read_4_write); | |
325 | return ret; | |
326 | } | |
327 | ||
328 | /* @si contains info of the to-be-inserted page. Update of @si should be | |
329 | * maintained by caller. Specificaly si->dev, si->obj_offset, ... | |
330 | */ | |
724577ca BH |
331 | static int _add_to_r4w(struct ore_io_state *ios, struct ore_striping_info *si, |
332 | struct page *page, unsigned pg_len) | |
769ba8d9 BH |
333 | { |
334 | struct request_queue *q; | |
335 | struct ore_per_dev_state *per_dev; | |
336 | struct ore_io_state *read_ios; | |
337 | unsigned first_dev = si->dev - (si->dev % | |
338 | (ios->layout->group_width * ios->layout->mirrors_p1)); | |
339 | unsigned comp = si->dev - first_dev; | |
340 | unsigned added_len; | |
341 | ||
342 | if (!ios->ios_read_4_write) { | |
343 | int ret = _alloc_read_4_write(ios); | |
344 | ||
345 | if (unlikely(ret)) | |
346 | return ret; | |
347 | } | |
348 | ||
349 | read_ios = ios->ios_read_4_write; | |
350 | read_ios->numdevs = ios->layout->group_width * ios->layout->mirrors_p1; | |
351 | ||
352 | per_dev = &read_ios->per_dev[comp]; | |
353 | if (!per_dev->length) { | |
354 | per_dev->bio = bio_kmalloc(GFP_KERNEL, | |
355 | ios->sp2d->pages_in_unit); | |
356 | if (unlikely(!per_dev->bio)) { | |
357 | ORE_DBGMSG("Failed to allocate BIO size=%u\n", | |
358 | ios->sp2d->pages_in_unit); | |
359 | return -ENOMEM; | |
360 | } | |
361 | per_dev->offset = si->obj_offset; | |
362 | per_dev->dev = si->dev; | |
363 | } else if (si->obj_offset != (per_dev->offset + per_dev->length)) { | |
364 | u64 gap = si->obj_offset - (per_dev->offset + per_dev->length); | |
365 | ||
366 | _ore_add_sg_seg(per_dev, gap, true); | |
367 | } | |
368 | q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev)); | |
724577ca BH |
369 | added_len = bio_add_pc_page(q, per_dev->bio, page, pg_len, |
370 | si->obj_offset % PAGE_SIZE); | |
371 | if (unlikely(added_len != pg_len)) { | |
769ba8d9 BH |
372 | ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n", |
373 | per_dev->bio->bi_vcnt); | |
374 | return -ENOMEM; | |
375 | } | |
376 | ||
724577ca | 377 | per_dev->length += pg_len; |
769ba8d9 BH |
378 | return 0; |
379 | } | |
380 | ||
724577ca BH |
381 | /* read the beginning of an unaligned first page */ |
382 | static int _add_to_r4w_first_page(struct ore_io_state *ios, struct page *page) | |
383 | { | |
384 | struct ore_striping_info si; | |
385 | unsigned pg_len; | |
386 | ||
387 | ore_calc_stripe_info(ios->layout, ios->offset, 0, &si); | |
388 | ||
389 | pg_len = si.obj_offset % PAGE_SIZE; | |
390 | si.obj_offset -= pg_len; | |
391 | ||
392 | ORE_DBGMSG("offset=0x%llx len=0x%x index=0x%lx dev=%x\n", | |
393 | _LLU(si.obj_offset), pg_len, page->index, si.dev); | |
394 | ||
395 | return _add_to_r4w(ios, &si, page, pg_len); | |
396 | } | |
397 | ||
398 | /* read the end of an incomplete last page */ | |
399 | static int _add_to_r4w_last_page(struct ore_io_state *ios, u64 *offset) | |
400 | { | |
401 | struct ore_striping_info si; | |
402 | struct page *page; | |
403 | unsigned pg_len, p, c; | |
404 | ||
405 | ore_calc_stripe_info(ios->layout, *offset, 0, &si); | |
406 | ||
407 | p = si.unit_off / PAGE_SIZE; | |
408 | c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, | |
409 | ios->layout->mirrors_p1, si.par_dev, si.dev); | |
410 | page = ios->sp2d->_1p_stripes[p].pages[c]; | |
411 | ||
412 | pg_len = PAGE_SIZE - (si.unit_off % PAGE_SIZE); | |
413 | *offset += pg_len; | |
414 | ||
415 | ORE_DBGMSG("p=%d, c=%d next-offset=0x%llx len=0x%x dev=%x par_dev=%d\n", | |
416 | p, c, _LLU(*offset), pg_len, si.dev, si.par_dev); | |
417 | ||
418 | BUG_ON(!page); | |
419 | ||
420 | return _add_to_r4w(ios, &si, page, pg_len); | |
421 | } | |
422 | ||
769ba8d9 BH |
423 | static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) |
424 | { | |
425 | struct bio_vec *bv; | |
426 | unsigned i, d; | |
427 | ||
428 | /* loop on all devices all pages */ | |
429 | for (d = 0; d < ios->numdevs; d++) { | |
430 | struct bio *bio = ios->per_dev[d].bio; | |
431 | ||
432 | if (!bio) | |
433 | continue; | |
434 | ||
d74c6d51 | 435 | bio_for_each_segment_all(bv, bio, i) { |
769ba8d9 BH |
436 | struct page *page = bv->bv_page; |
437 | ||
438 | SetPageUptodate(page); | |
439 | if (PageError(page)) | |
440 | ClearPageError(page); | |
441 | } | |
442 | } | |
443 | } | |
444 | ||
445 | /* read_4_write is hacked to read the start of the first stripe and/or | |
446 | * the end of the last stripe. If needed, with an sg-gap at each device/page. | |
447 | * It is assumed to be called after the to_be_written pages of the first stripe | |
448 | * are populating ios->sp2d[][] | |
449 | * | |
450 | * NOTE: We call ios->r4w->lock_fn for all pages needed for parity calculations | |
451 | * These pages are held at sp2d[p].pages[c] but with | |
452 | * sp2d[p].page_is_read[c] = true. At _sp2d_reset these pages are | |
453 | * ios->r4w->lock_fn(). The ios->r4w->lock_fn might signal that the page is | |
454 | * @uptodate=true, so we don't need to read it, only unlock, after IO. | |
455 | * | |
456 | * TODO: The read_4_write should calc a need_to_read_pages_count, if bigger then | |
457 | * to-be-written count, we should consider the xor-in-place mode. | |
458 | * need_to_read_pages_count is the actual number of pages not present in cache. | |
459 | * maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough | |
460 | * approximation? In this mode the read pages are put in the empty places of | |
461 | * ios->sp2d[p][*], xor is calculated the same way. These pages are | |
462 | * allocated/freed and don't go through cache | |
463 | */ | |
9ff19309 | 464 | static int _read_4_write_first_stripe(struct ore_io_state *ios) |
769ba8d9 | 465 | { |
769ba8d9 BH |
466 | struct ore_striping_info read_si; |
467 | struct __stripe_pages_2d *sp2d = ios->sp2d; | |
468 | u64 offset = ios->si.first_stripe_start; | |
9ff19309 | 469 | unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1; |
769ba8d9 BH |
470 | |
471 | if (offset == ios->offset) /* Go to start collect $200 */ | |
472 | goto read_last_stripe; | |
473 | ||
474 | min_p = _sp2d_min_pg(sp2d); | |
475 | max_p = _sp2d_max_pg(sp2d); | |
476 | ||
9ff19309 BH |
477 | ORE_DBGMSG("stripe_start=0x%llx ios->offset=0x%llx min_p=%d max_p=%d\n", |
478 | offset, ios->offset, min_p, max_p); | |
479 | ||
769ba8d9 BH |
480 | for (c = 0; ; c++) { |
481 | ore_calc_stripe_info(ios->layout, offset, 0, &read_si); | |
482 | read_si.obj_offset += min_p * PAGE_SIZE; | |
483 | offset += min_p * PAGE_SIZE; | |
484 | for (p = min_p; p <= max_p; p++) { | |
485 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | |
486 | struct page **pp = &_1ps->pages[c]; | |
487 | bool uptodate; | |
488 | ||
724577ca BH |
489 | if (*pp) { |
490 | if (ios->offset % PAGE_SIZE) | |
491 | /* Read the remainder of the page */ | |
492 | _add_to_r4w_first_page(ios, *pp); | |
769ba8d9 BH |
493 | /* to-be-written pages start here */ |
494 | goto read_last_stripe; | |
724577ca | 495 | } |
769ba8d9 BH |
496 | |
497 | *pp = ios->r4w->get_page(ios->private, offset, | |
498 | &uptodate); | |
499 | if (unlikely(!*pp)) | |
500 | return -ENOMEM; | |
501 | ||
502 | if (!uptodate) | |
724577ca | 503 | _add_to_r4w(ios, &read_si, *pp, PAGE_SIZE); |
769ba8d9 BH |
504 | |
505 | /* Mark read-pages to be cache_released */ | |
506 | _1ps->page_is_read[c] = true; | |
507 | read_si.obj_offset += PAGE_SIZE; | |
508 | offset += PAGE_SIZE; | |
509 | } | |
510 | offset += (sp2d->pages_in_unit - p) * PAGE_SIZE; | |
511 | } | |
512 | ||
513 | read_last_stripe: | |
9ff19309 BH |
514 | return 0; |
515 | } | |
516 | ||
517 | static int _read_4_write_last_stripe(struct ore_io_state *ios) | |
518 | { | |
519 | struct ore_striping_info read_si; | |
520 | struct __stripe_pages_2d *sp2d = ios->sp2d; | |
521 | u64 offset; | |
522 | u64 last_stripe_end; | |
523 | unsigned bytes_in_stripe = ios->si.bytes_in_stripe; | |
524 | unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1; | |
525 | ||
724577ca BH |
526 | offset = ios->offset + ios->length; |
527 | if (offset % PAGE_SIZE) | |
528 | _add_to_r4w_last_page(ios, &offset); | |
529 | /* offset will be aligned to next page */ | |
530 | ||
769ba8d9 BH |
531 | last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe) |
532 | * bytes_in_stripe; | |
533 | if (offset == last_stripe_end) /* Optimize for the aligned case */ | |
534 | goto read_it; | |
535 | ||
536 | ore_calc_stripe_info(ios->layout, offset, 0, &read_si); | |
537 | p = read_si.unit_off / PAGE_SIZE; | |
538 | c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, | |
539 | ios->layout->mirrors_p1, read_si.par_dev, read_si.dev); | |
540 | ||
769ba8d9 BH |
541 | if (min_p == sp2d->pages_in_unit) { |
542 | /* Didn't do it yet */ | |
543 | min_p = _sp2d_min_pg(sp2d); | |
544 | max_p = _sp2d_max_pg(sp2d); | |
545 | } | |
546 | ||
9ff19309 BH |
547 | ORE_DBGMSG("offset=0x%llx stripe_end=0x%llx min_p=%d max_p=%d\n", |
548 | offset, last_stripe_end, min_p, max_p); | |
549 | ||
769ba8d9 BH |
550 | while (offset < last_stripe_end) { |
551 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | |
552 | ||
553 | if ((min_p <= p) && (p <= max_p)) { | |
554 | struct page *page; | |
555 | bool uptodate; | |
556 | ||
557 | BUG_ON(_1ps->pages[c]); | |
558 | page = ios->r4w->get_page(ios->private, offset, | |
559 | &uptodate); | |
560 | if (unlikely(!page)) | |
561 | return -ENOMEM; | |
562 | ||
563 | _1ps->pages[c] = page; | |
564 | /* Mark read-pages to be cache_released */ | |
565 | _1ps->page_is_read[c] = true; | |
566 | if (!uptodate) | |
724577ca | 567 | _add_to_r4w(ios, &read_si, page, PAGE_SIZE); |
769ba8d9 BH |
568 | } |
569 | ||
570 | offset += PAGE_SIZE; | |
571 | if (p == (sp2d->pages_in_unit - 1)) { | |
572 | ++c; | |
573 | p = 0; | |
574 | ore_calc_stripe_info(ios->layout, offset, 0, &read_si); | |
575 | } else { | |
576 | read_si.obj_offset += PAGE_SIZE; | |
577 | ++p; | |
578 | } | |
579 | } | |
580 | ||
581 | read_it: | |
9ff19309 BH |
582 | return 0; |
583 | } | |
584 | ||
585 | static int _read_4_write_execute(struct ore_io_state *ios) | |
586 | { | |
587 | struct ore_io_state *ios_read; | |
588 | unsigned i; | |
589 | int ret; | |
590 | ||
769ba8d9 BH |
591 | ios_read = ios->ios_read_4_write; |
592 | if (!ios_read) | |
593 | return 0; | |
594 | ||
595 | /* FIXME: Ugly to signal _sbi_read_mirror that we have bio(s). Change | |
596 | * to check for per_dev->bio | |
597 | */ | |
598 | ios_read->pages = ios->pages; | |
599 | ||
600 | /* Now read these devices */ | |
601 | for (i = 0; i < ios_read->numdevs; i += ios_read->layout->mirrors_p1) { | |
602 | ret = _ore_read_mirror(ios_read, i); | |
603 | if (unlikely(ret)) | |
604 | return ret; | |
605 | } | |
606 | ||
607 | ret = ore_io_execute(ios_read); /* Synchronus execution */ | |
608 | if (unlikely(ret)) { | |
609 | ORE_DBGMSG("!! ore_io_execute => %d\n", ret); | |
610 | return ret; | |
611 | } | |
612 | ||
613 | _mark_read4write_pages_uptodate(ios_read, ret); | |
9ff19309 BH |
614 | ore_put_io_state(ios_read); |
615 | ios->ios_read_4_write = NULL; /* Might need a reuse at last stripe */ | |
769ba8d9 BH |
616 | return 0; |
617 | } | |
618 | ||
a1fec1db BH |
619 | /* In writes @cur_len means length left. .i.e cur_len==0 is the last parity U */ |
620 | int _ore_add_parity_unit(struct ore_io_state *ios, | |
621 | struct ore_striping_info *si, | |
622 | struct ore_per_dev_state *per_dev, | |
623 | unsigned cur_len) | |
624 | { | |
625 | if (ios->reading) { | |
361aba56 BH |
626 | if (per_dev->cur_sg >= ios->sgs_per_dev) { |
627 | ORE_DBGMSG("cur_sg(%d) >= sgs_per_dev(%d)\n" , | |
628 | per_dev->cur_sg, ios->sgs_per_dev); | |
629 | return -ENOMEM; | |
630 | } | |
a1fec1db BH |
631 | _ore_add_sg_seg(per_dev, cur_len, true); |
632 | } else { | |
769ba8d9 | 633 | struct __stripe_pages_2d *sp2d = ios->sp2d; |
a1fec1db | 634 | struct page **pages = ios->parity_pages + ios->cur_par_page; |
769ba8d9 | 635 | unsigned num_pages; |
a1fec1db BH |
636 | unsigned array_start = 0; |
637 | unsigned i; | |
638 | int ret; | |
639 | ||
769ba8d9 BH |
640 | si->cur_pg = _sp2d_min_pg(sp2d); |
641 | num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg; | |
642 | ||
643 | if (!cur_len) /* If last stripe operate on parity comp */ | |
644 | si->cur_comp = sp2d->data_devs; | |
645 | ||
646 | if (!per_dev->length) { | |
647 | per_dev->offset += si->cur_pg * PAGE_SIZE; | |
648 | /* If first stripe, Read in all read4write pages | |
649 | * (if needed) before we calculate the first parity. | |
650 | */ | |
9ff19309 | 651 | _read_4_write_first_stripe(ios); |
769ba8d9 | 652 | } |
9ff19309 BH |
653 | if (!cur_len) /* If last stripe r4w pages of last stripe */ |
654 | _read_4_write_last_stripe(ios); | |
655 | _read_4_write_execute(ios); | |
769ba8d9 | 656 | |
a1fec1db BH |
657 | for (i = 0; i < num_pages; i++) { |
658 | pages[i] = _raid_page_alloc(); | |
659 | if (unlikely(!pages[i])) | |
660 | return -ENOMEM; | |
661 | ||
662 | ++(ios->cur_par_page); | |
a1fec1db BH |
663 | } |
664 | ||
769ba8d9 BH |
665 | BUG_ON(si->cur_comp != sp2d->data_devs); |
666 | BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit); | |
a1fec1db BH |
667 | |
668 | ret = _ore_add_stripe_unit(ios, &array_start, 0, pages, | |
669 | per_dev, num_pages * PAGE_SIZE); | |
670 | if (unlikely(ret)) | |
671 | return ret; | |
769ba8d9 BH |
672 | |
673 | /* TODO: raid6 if (last_parity_dev) */ | |
674 | _gen_xor_unit(sp2d); | |
675 | _sp2d_reset(sp2d, ios->r4w, ios->private); | |
a1fec1db BH |
676 | } |
677 | return 0; | |
678 | } | |
679 | ||
680 | int _ore_post_alloc_raid_stuff(struct ore_io_state *ios) | |
681 | { | |
769ba8d9 | 682 | if (ios->parity_pages) { |
9ff19309 | 683 | struct ore_layout *layout = ios->layout; |
769ba8d9 | 684 | unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; |
769ba8d9 BH |
685 | |
686 | if (_sp2d_alloc(pages_in_unit, layout->group_width, | |
687 | layout->parity, &ios->sp2d)) { | |
688 | return -ENOMEM; | |
689 | } | |
769ba8d9 | 690 | } |
a1fec1db BH |
691 | return 0; |
692 | } | |
693 | ||
694 | void _ore_free_raid_stuff(struct ore_io_state *ios) | |
695 | { | |
769ba8d9 | 696 | if (ios->sp2d) { /* writing and raid */ |
a1fec1db BH |
697 | unsigned i; |
698 | ||
699 | for (i = 0; i < ios->cur_par_page; i++) { | |
700 | struct page *page = ios->parity_pages[i]; | |
701 | ||
702 | if (page) | |
703 | _raid_page_free(page); | |
704 | } | |
705 | if (ios->extra_part_alloc) | |
706 | kfree(ios->parity_pages); | |
769ba8d9 BH |
707 | /* If IO returned an error pages might need unlocking */ |
708 | _sp2d_reset(ios->sp2d, ios->r4w, ios->private); | |
709 | _sp2d_free(ios->sp2d); | |
a1fec1db BH |
710 | } else { |
711 | /* Will only be set if raid reading && sglist is big */ | |
712 | if (ios->extra_part_alloc) | |
713 | kfree(ios->per_dev[0].sglist); | |
714 | } | |
769ba8d9 BH |
715 | if (ios->ios_read_4_write) |
716 | ore_put_io_state(ios->ios_read_4_write); | |
a1fec1db | 717 | } |