Commit | Line | Data |
---|---|---|
4e0cc29b DW |
1 | /* |
2 | * Copyright (C) 2016 Oracle. All Rights Reserved. | |
3 | * | |
4 | * Author: Darrick J. Wong <darrick.wong@oracle.com> | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or | |
7 | * modify it under the terms of the GNU General Public License | |
8 | * as published by the Free Software Foundation; either version 2 | |
9 | * of the License, or (at your option) any later version. | |
10 | * | |
11 | * This program is distributed in the hope that it would be useful, | |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | * GNU General Public License for more details. | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * along with this program; if not, write the Free Software Foundation, | |
18 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. | |
19 | */ | |
20 | #include "xfs.h" | |
21 | #include "xfs_fs.h" | |
22 | #include "xfs_shared.h" | |
23 | #include "xfs_format.h" | |
24 | #include "xfs_log_format.h" | |
25 | #include "xfs_trans_resv.h" | |
26 | #include "xfs_bit.h" | |
27 | #include "xfs_sb.h" | |
28 | #include "xfs_mount.h" | |
29 | #include "xfs_defer.h" | |
30 | #include "xfs_trans.h" | |
31 | #include "xfs_trace.h" | |
32 | ||
33 | /* | |
34 | * Deferred Operations in XFS | |
35 | * | |
36 | * Due to the way locking rules work in XFS, certain transactions (block | |
37 | * mapping and unmapping, typically) have permanent reservations so that | |
38 | * we can roll the transaction to adhere to AG locking order rules and | |
39 | * to unlock buffers between metadata updates. Prior to rmap/reflink, | |
40 | * the mapping code had a mechanism to perform these deferrals for | |
41 | * extents that were going to be freed; this code makes that facility | |
42 | * more generic. | |
43 | * | |
44 | * When adding the reverse mapping and reflink features, it became | |
45 | * necessary to perform complex remapping multi-transactions to comply | |
46 | * with AG locking order rules, and to be able to spread a single | |
47 | * refcount update operation (an operation on an n-block extent can | |
48 | * update as many as n records!) among multiple transactions. XFS can | |
49 | * roll a transaction to facilitate this, but using this facility | |
50 | * requires us to log "intent" items in case log recovery needs to | |
51 | * redo the operation, and to log "done" items to indicate that redo | |
52 | * is not necessary. | |
53 | * | |
54 | * Deferred work is tracked in xfs_defer_pending items. Each pending | |
55 | * item tracks one type of deferred work. Incoming work items (which | |
56 | * have not yet had an intent logged) are attached to a pending item | |
57 | * on the dop_intake list, where they wait for the caller to finish | |
58 | * the deferred operations. | |
59 | * | |
60 | * Finishing a set of deferred operations is an involved process. To | |
61 | * start, we define "rolling a deferred-op transaction" as follows: | |
62 | * | |
63 | * > For each xfs_defer_pending item on the dop_intake list, | |
64 | * - Sort the work items in AG order. XFS locking | |
65 | * order rules require us to lock buffers in AG order. | |
66 | * - Create a log intent item for that type. | |
67 | * - Attach it to the pending item. | |
68 | * - Move the pending item from the dop_intake list to the | |
69 | * dop_pending list. | |
70 | * > Roll the transaction. | |
71 | * | |
72 | * NOTE: To avoid exceeding the transaction reservation, we limit the | |
73 | * number of items that we attach to a given xfs_defer_pending. | |
74 | * | |
75 | * The actual finishing process looks like this: | |
76 | * | |
77 | * > For each xfs_defer_pending in the dop_pending list, | |
78 | * - Roll the deferred-op transaction as above. | |
79 | * - Create a log done item for that type, and attach it to the | |
80 | * log intent item. | |
81 | * - For each work item attached to the log intent item, | |
82 | * * Perform the described action. | |
83 | * * Attach the work item to the log done item. | |
84 | * | |
85 | * The key here is that we must log an intent item for all pending | |
86 | * work items every time we roll the transaction, and that we must log | |
87 | * a done item as soon as the work is completed. With this mechanism | |
88 | * we can perform complex remapping operations, chaining intent items | |
89 | * as needed. | |
90 | * | |
91 | * This is an example of remapping the extent (E, E+B) into file X at | |
92 | * offset A and dealing with the extent (C, C+B) already being mapped | |
93 | * there: | |
94 | * +-------------------------------------------------+ | |
95 | * | Unmap file X startblock C offset A length B | t0 | |
96 | * | Intent to reduce refcount for extent (C, B) | | |
97 | * | Intent to remove rmap (X, C, A, B) | | |
98 | * | Intent to free extent (D, 1) (bmbt block) | | |
99 | * | Intent to map (X, A, B) at startblock E | | |
100 | * +-------------------------------------------------+ | |
101 | * | Map file X startblock E offset A length B | t1 | |
102 | * | Done mapping (X, E, A, B) | | |
103 | * | Intent to increase refcount for extent (E, B) | | |
104 | * | Intent to add rmap (X, E, A, B) | | |
105 | * +-------------------------------------------------+ | |
106 | * | Reduce refcount for extent (C, B) | t2 | |
107 | * | Done reducing refcount for extent (C, B) | | |
108 | * | Increase refcount for extent (E, B) | | |
109 | * | Done increasing refcount for extent (E, B) | | |
110 | * | Intent to free extent (C, B) | | |
111 | * | Intent to free extent (F, 1) (refcountbt block) | | |
112 | * | Intent to remove rmap (F, 1, REFC) | | |
113 | * +-------------------------------------------------+ | |
114 | * | Remove rmap (X, C, A, B) | t3 | |
115 | * | Done removing rmap (X, C, A, B) | | |
116 | * | Add rmap (X, E, A, B) | | |
117 | * | Done adding rmap (X, E, A, B) | | |
118 | * | Remove rmap (F, 1, REFC) | | |
119 | * | Done removing rmap (F, 1, REFC) | | |
120 | * +-------------------------------------------------+ | |
121 | * | Free extent (C, B) | t4 | |
122 | * | Done freeing extent (C, B) | | |
123 | * | Free extent (D, 1) | | |
124 | * | Done freeing extent (D, 1) | | |
125 | * | Free extent (F, 1) | | |
126 | * | Done freeing extent (F, 1) | | |
127 | * +-------------------------------------------------+ | |
128 | * | |
129 | * If we should crash before t2 commits, log recovery replays | |
130 | * the following intent items: | |
131 | * | |
132 | * - Intent to reduce refcount for extent (C, B) | |
133 | * - Intent to remove rmap (X, C, A, B) | |
134 | * - Intent to free extent (D, 1) (bmbt block) | |
135 | * - Intent to increase refcount for extent (E, B) | |
136 | * - Intent to add rmap (X, E, A, B) | |
137 | * | |
138 | * In the process of recovering, it should also generate and take care | |
139 | * of these intent items: | |
140 | * | |
141 | * - Intent to free extent (C, B) | |
142 | * - Intent to free extent (F, 1) (refcountbt block) | |
143 | * - Intent to remove rmap (F, 1, REFC) | |
144 | */ | |
145 | ||
/*
 * Table of deferred operation type descriptors, indexed by the type's
 * ->type value.  Entries are filled in by xfs_defer_init_op_type() and
 * looked up by xfs_defer_add() when a new pending item is created.
 */
static const struct xfs_defer_op_type *defer_op_types[XFS_DEFER_OPS_TYPE_MAX];
147 | ||
148 | /* | |
149 | * For each pending item in the intake list, log its intent item and the | |
150 | * associated extents, then add the entire intake list to the end of | |
151 | * the pending list. | |
152 | */ | |
153 | STATIC void | |
154 | xfs_defer_intake_work( | |
155 | struct xfs_trans *tp, | |
156 | struct xfs_defer_ops *dop) | |
157 | { | |
158 | struct list_head *li; | |
159 | struct xfs_defer_pending *dfp; | |
160 | ||
161 | list_for_each_entry(dfp, &dop->dop_intake, dfp_list) { | |
3cd48abc | 162 | trace_xfs_defer_intake_work(tp->t_mountp, dfp); |
4e0cc29b DW |
163 | dfp->dfp_intent = dfp->dfp_type->create_intent(tp, |
164 | dfp->dfp_count); | |
165 | list_sort(tp->t_mountp, &dfp->dfp_work, | |
166 | dfp->dfp_type->diff_items); | |
167 | list_for_each(li, &dfp->dfp_work) | |
168 | dfp->dfp_type->log_item(tp, dfp->dfp_intent, li); | |
169 | } | |
170 | ||
171 | list_splice_tail_init(&dop->dop_intake, &dop->dop_pending); | |
172 | } | |
173 | ||
174 | /* Abort all the intents that were committed. */ | |
175 | STATIC void | |
176 | xfs_defer_trans_abort( | |
177 | struct xfs_trans *tp, | |
178 | struct xfs_defer_ops *dop, | |
179 | int error) | |
180 | { | |
181 | struct xfs_defer_pending *dfp; | |
182 | ||
3cd48abc | 183 | trace_xfs_defer_trans_abort(tp->t_mountp, dop); |
4e0cc29b DW |
184 | /* |
185 | * If the transaction was committed, drop the intent reference | |
186 | * since we're bailing out of here. The other reference is | |
187 | * dropped when the intent hits the AIL. If the transaction | |
188 | * was not committed, the intent is freed by the intent item | |
189 | * unlock handler on abort. | |
190 | */ | |
191 | if (!dop->dop_committed) | |
192 | return; | |
193 | ||
194 | /* Abort intent items. */ | |
195 | list_for_each_entry(dfp, &dop->dop_pending, dfp_list) { | |
3cd48abc | 196 | trace_xfs_defer_pending_abort(tp->t_mountp, dfp); |
ea78d808 | 197 | if (!dfp->dfp_done) |
4e0cc29b DW |
198 | dfp->dfp_type->abort_intent(dfp->dfp_intent); |
199 | } | |
200 | ||
201 | /* Shut down FS. */ | |
202 | xfs_force_shutdown(tp->t_mountp, (error == -EFSCORRUPTED) ? | |
203 | SHUTDOWN_CORRUPT_INCORE : SHUTDOWN_META_IO_ERROR); | |
204 | } | |
205 | ||
206 | /* Roll a transaction so we can do some deferred op processing. */ | |
207 | STATIC int | |
208 | xfs_defer_trans_roll( | |
209 | struct xfs_trans **tp, | |
210 | struct xfs_defer_ops *dop, | |
211 | struct xfs_inode *ip) | |
212 | { | |
213 | int i; | |
214 | int error; | |
215 | ||
216 | /* Log all the joined inodes except the one we passed in. */ | |
217 | for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) { | |
218 | if (dop->dop_inodes[i] == ip) | |
219 | continue; | |
220 | xfs_trans_log_inode(*tp, dop->dop_inodes[i], XFS_ILOG_CORE); | |
221 | } | |
222 | ||
3cd48abc DW |
223 | trace_xfs_defer_trans_roll((*tp)->t_mountp, dop); |
224 | ||
4e0cc29b DW |
225 | /* Roll the transaction. */ |
226 | error = xfs_trans_roll(tp, ip); | |
227 | if (error) { | |
3cd48abc | 228 | trace_xfs_defer_trans_roll_error((*tp)->t_mountp, dop, error); |
4e0cc29b DW |
229 | xfs_defer_trans_abort(*tp, dop, error); |
230 | return error; | |
231 | } | |
232 | dop->dop_committed = true; | |
233 | ||
234 | /* Rejoin the joined inodes except the one we passed in. */ | |
235 | for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) { | |
236 | if (dop->dop_inodes[i] == ip) | |
237 | continue; | |
238 | xfs_trans_ijoin(*tp, dop->dop_inodes[i], 0); | |
239 | } | |
240 | ||
241 | return error; | |
242 | } | |
243 | ||
244 | /* Do we have any work items to finish? */ | |
245 | bool | |
246 | xfs_defer_has_unfinished_work( | |
247 | struct xfs_defer_ops *dop) | |
248 | { | |
249 | return !list_empty(&dop->dop_pending) || !list_empty(&dop->dop_intake); | |
250 | } | |
251 | ||
252 | /* | |
253 | * Add this inode to the deferred op. Each joined inode is relogged | |
254 | * each time we roll the transaction, in addition to any inode passed | |
255 | * to xfs_defer_finish(). | |
256 | */ | |
257 | int | |
258 | xfs_defer_join( | |
259 | struct xfs_defer_ops *dop, | |
260 | struct xfs_inode *ip) | |
261 | { | |
262 | int i; | |
263 | ||
264 | for (i = 0; i < XFS_DEFER_OPS_NR_INODES; i++) { | |
265 | if (dop->dop_inodes[i] == ip) | |
266 | return 0; | |
267 | else if (dop->dop_inodes[i] == NULL) { | |
268 | dop->dop_inodes[i] = ip; | |
269 | return 0; | |
270 | } | |
271 | } | |
272 | ||
273 | return -EFSCORRUPTED; | |
274 | } | |
275 | ||
276 | /* | |
277 | * Finish all the pending work. This involves logging intent items for | |
278 | * any work items that wandered in since the last transaction roll (if | |
279 | * one has even happened), rolling the transaction, and finishing the | |
280 | * work items in the first item on the logged-and-pending list. | |
281 | * | |
282 | * If an inode is provided, relog it to the new transaction. | |
283 | */ | |
284 | int | |
285 | xfs_defer_finish( | |
286 | struct xfs_trans **tp, | |
287 | struct xfs_defer_ops *dop, | |
288 | struct xfs_inode *ip) | |
289 | { | |
290 | struct xfs_defer_pending *dfp; | |
291 | struct list_head *li; | |
292 | struct list_head *n; | |
4e0cc29b DW |
293 | void *state; |
294 | int error = 0; | |
295 | void (*cleanup_fn)(struct xfs_trans *, void *, int); | |
296 | ||
297 | ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); | |
298 | ||
3cd48abc DW |
299 | trace_xfs_defer_finish((*tp)->t_mountp, dop); |
300 | ||
4e0cc29b DW |
301 | /* Until we run out of pending work to finish... */ |
302 | while (xfs_defer_has_unfinished_work(dop)) { | |
303 | /* Log intents for work items sitting in the intake. */ | |
304 | xfs_defer_intake_work(*tp, dop); | |
305 | ||
306 | /* Roll the transaction. */ | |
307 | error = xfs_defer_trans_roll(tp, dop, ip); | |
308 | if (error) | |
309 | goto out; | |
310 | ||
4e0cc29b DW |
311 | /* Log an intent-done item for the first pending item. */ |
312 | dfp = list_first_entry(&dop->dop_pending, | |
313 | struct xfs_defer_pending, dfp_list); | |
3cd48abc | 314 | trace_xfs_defer_pending_finish((*tp)->t_mountp, dfp); |
ea78d808 | 315 | dfp->dfp_done = dfp->dfp_type->create_done(*tp, dfp->dfp_intent, |
4e0cc29b DW |
316 | dfp->dfp_count); |
317 | cleanup_fn = dfp->dfp_type->finish_cleanup; | |
318 | ||
319 | /* Finish the work items. */ | |
320 | state = NULL; | |
321 | list_for_each_safe(li, n, &dfp->dfp_work) { | |
322 | list_del(li); | |
323 | dfp->dfp_count--; | |
324 | error = dfp->dfp_type->finish_item(*tp, dop, li, | |
ea78d808 | 325 | dfp->dfp_done, &state); |
4e0cc29b DW |
326 | if (error) { |
327 | /* | |
328 | * Clean up after ourselves and jump out. | |
329 | * xfs_defer_cancel will take care of freeing | |
330 | * all these lists and stuff. | |
331 | */ | |
332 | if (cleanup_fn) | |
333 | cleanup_fn(*tp, state, error); | |
334 | xfs_defer_trans_abort(*tp, dop, error); | |
335 | goto out; | |
336 | } | |
337 | } | |
338 | /* Done with the dfp, free it. */ | |
339 | list_del(&dfp->dfp_list); | |
340 | kmem_free(dfp); | |
341 | ||
342 | if (cleanup_fn) | |
343 | cleanup_fn(*tp, state, error); | |
344 | } | |
345 | ||
346 | out: | |
3cd48abc DW |
347 | if (error) |
348 | trace_xfs_defer_finish_error((*tp)->t_mountp, dop, error); | |
349 | else | |
350 | trace_xfs_defer_finish_done((*tp)->t_mountp, dop); | |
4e0cc29b DW |
351 | return error; |
352 | } | |
353 | ||
354 | /* | |
355 | * Free up any items left in the list. | |
356 | */ | |
357 | void | |
358 | xfs_defer_cancel( | |
359 | struct xfs_defer_ops *dop) | |
360 | { | |
361 | struct xfs_defer_pending *dfp; | |
362 | struct xfs_defer_pending *pli; | |
363 | struct list_head *pwi; | |
364 | struct list_head *n; | |
365 | ||
3cd48abc DW |
366 | trace_xfs_defer_cancel(NULL, dop); |
367 | ||
4e0cc29b DW |
368 | /* |
369 | * Free the pending items. Caller should already have arranged | |
370 | * for the intent items to be released. | |
371 | */ | |
372 | list_for_each_entry_safe(dfp, pli, &dop->dop_intake, dfp_list) { | |
3cd48abc | 373 | trace_xfs_defer_intake_cancel(NULL, dfp); |
4e0cc29b DW |
374 | list_del(&dfp->dfp_list); |
375 | list_for_each_safe(pwi, n, &dfp->dfp_work) { | |
376 | list_del(pwi); | |
377 | dfp->dfp_count--; | |
378 | dfp->dfp_type->cancel_item(pwi); | |
379 | } | |
380 | ASSERT(dfp->dfp_count == 0); | |
381 | kmem_free(dfp); | |
382 | } | |
383 | list_for_each_entry_safe(dfp, pli, &dop->dop_pending, dfp_list) { | |
3cd48abc | 384 | trace_xfs_defer_pending_cancel(NULL, dfp); |
4e0cc29b DW |
385 | list_del(&dfp->dfp_list); |
386 | list_for_each_safe(pwi, n, &dfp->dfp_work) { | |
387 | list_del(pwi); | |
388 | dfp->dfp_count--; | |
389 | dfp->dfp_type->cancel_item(pwi); | |
390 | } | |
391 | ASSERT(dfp->dfp_count == 0); | |
392 | kmem_free(dfp); | |
393 | } | |
394 | } | |
395 | ||
396 | /* Add an item for later deferred processing. */ | |
397 | void | |
398 | xfs_defer_add( | |
399 | struct xfs_defer_ops *dop, | |
400 | enum xfs_defer_ops_type type, | |
401 | struct list_head *li) | |
402 | { | |
403 | struct xfs_defer_pending *dfp = NULL; | |
404 | ||
405 | /* | |
406 | * Add the item to a pending item at the end of the intake list. | |
407 | * If the last pending item has the same type, reuse it. Else, | |
408 | * create a new pending item at the end of the intake list. | |
409 | */ | |
410 | if (!list_empty(&dop->dop_intake)) { | |
411 | dfp = list_last_entry(&dop->dop_intake, | |
412 | struct xfs_defer_pending, dfp_list); | |
413 | if (dfp->dfp_type->type != type || | |
414 | (dfp->dfp_type->max_items && | |
415 | dfp->dfp_count >= dfp->dfp_type->max_items)) | |
416 | dfp = NULL; | |
417 | } | |
418 | if (!dfp) { | |
419 | dfp = kmem_alloc(sizeof(struct xfs_defer_pending), | |
420 | KM_SLEEP | KM_NOFS); | |
421 | dfp->dfp_type = defer_op_types[type]; | |
4e0cc29b | 422 | dfp->dfp_intent = NULL; |
ea78d808 | 423 | dfp->dfp_done = NULL; |
4e0cc29b DW |
424 | dfp->dfp_count = 0; |
425 | INIT_LIST_HEAD(&dfp->dfp_work); | |
426 | list_add_tail(&dfp->dfp_list, &dop->dop_intake); | |
427 | } | |
428 | ||
429 | list_add_tail(li, &dfp->dfp_work); | |
430 | dfp->dfp_count++; | |
431 | } | |
432 | ||
433 | /* Initialize a deferred operation list. */ | |
434 | void | |
435 | xfs_defer_init_op_type( | |
436 | const struct xfs_defer_op_type *type) | |
437 | { | |
438 | defer_op_types[type->type] = type; | |
439 | } | |
440 | ||
441 | /* Initialize a deferred operation. */ | |
442 | void | |
443 | xfs_defer_init( | |
444 | struct xfs_defer_ops *dop, | |
445 | xfs_fsblock_t *fbp) | |
446 | { | |
447 | dop->dop_committed = false; | |
448 | dop->dop_low = false; | |
449 | memset(&dop->dop_inodes, 0, sizeof(dop->dop_inodes)); | |
450 | *fbp = NULLFSBLOCK; | |
451 | INIT_LIST_HEAD(&dop->dop_intake); | |
452 | INIT_LIST_HEAD(&dop->dop_pending); | |
3cd48abc | 453 | trace_xfs_defer_init(NULL, dop); |
4e0cc29b | 454 | } |