Commit | Line | Data |
---|---|---|
028ba5df TY |
1 | /* -*- mode: c; c-basic-offset: 8; -*- |
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | |
3 | * | |
4 | * move_extents.c | |
5 | * | |
6 | * Copyright (C) 2011 Oracle. All rights reserved. | |
7 | * | |
8 | * This program is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU General Public | |
10 | * License version 2 as published by the Free Software Foundation. | |
11 | * | |
12 | * This program is distributed in the hope that it will be useful, | |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | * General Public License for more details. | |
16 | */ | |
17 | #include <linux/fs.h> | |
18 | #include <linux/types.h> | |
19 | #include <linux/mount.h> | |
20 | #include <linux/swap.h> | |
21 | ||
22 | #include <cluster/masklog.h> | |
23 | ||
24 | #include "ocfs2.h" | |
25 | #include "ocfs2_ioctl.h" | |
26 | ||
27 | #include "alloc.h" | |
28 | #include "aops.h" | |
29 | #include "dlmglue.h" | |
30 | #include "extent_map.h" | |
31 | #include "inode.h" | |
32 | #include "journal.h" | |
33 | #include "suballoc.h" | |
34 | #include "uptodate.h" | |
35 | #include "super.h" | |
36 | #include "dir.h" | |
37 | #include "buffer_head_io.h" | |
38 | #include "sysfile.h" | |
39 | #include "suballoc.h" | |
40 | #include "refcounttree.h" | |
41 | #include "move_extents.h" | |
42 | ||
43 | struct ocfs2_move_extents_context { | |
44 | struct inode *inode; | |
45 | struct file *file; | |
46 | int auto_defrag; | |
4dfa66bd | 47 | int partial; |
028ba5df TY |
48 | int credits; |
49 | u32 new_phys_cpos; | |
50 | u32 clusters_moved; | |
51 | u64 refcount_loc; | |
52 | struct ocfs2_move_extents *range; | |
53 | struct ocfs2_extent_tree et; | |
54 | struct ocfs2_alloc_context *meta_ac; | |
55 | struct ocfs2_alloc_context *data_ac; | |
56 | struct ocfs2_cached_dealloc_ctxt dealloc; | |
57 | }; | |
de474ee8 | 58 | |
8f603e56 TY |
59 | static int __ocfs2_move_extent(handle_t *handle, |
60 | struct ocfs2_move_extents_context *context, | |
61 | u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos, | |
62 | int ext_flags) | |
63 | { | |
64 | int ret = 0, index; | |
65 | struct inode *inode = context->inode; | |
66 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | |
67 | struct ocfs2_extent_rec *rec, replace_rec; | |
68 | struct ocfs2_path *path = NULL; | |
69 | struct ocfs2_extent_list *el; | |
70 | u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci); | |
71 | u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos); | |
72 | ||
73 | ret = ocfs2_duplicate_clusters_by_page(handle, context->file, cpos, | |
74 | p_cpos, new_p_cpos, len); | |
75 | if (ret) { | |
76 | mlog_errno(ret); | |
77 | goto out; | |
78 | } | |
79 | ||
80 | memset(&replace_rec, 0, sizeof(replace_rec)); | |
81 | replace_rec.e_cpos = cpu_to_le32(cpos); | |
82 | replace_rec.e_leaf_clusters = cpu_to_le16(len); | |
83 | replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb, | |
84 | new_p_cpos)); | |
85 | ||
86 | path = ocfs2_new_path_from_et(&context->et); | |
87 | if (!path) { | |
88 | ret = -ENOMEM; | |
89 | mlog_errno(ret); | |
90 | goto out; | |
91 | } | |
92 | ||
93 | ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos); | |
94 | if (ret) { | |
95 | mlog_errno(ret); | |
96 | goto out; | |
97 | } | |
98 | ||
99 | el = path_leaf_el(path); | |
100 | ||
101 | index = ocfs2_search_extent_list(el, cpos); | |
102 | if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { | |
103 | ocfs2_error(inode->i_sb, | |
104 | "Inode %llu has an extent at cpos %u which can no " | |
105 | "longer be found.\n", | |
106 | (unsigned long long)ino, cpos); | |
107 | ret = -EROFS; | |
108 | goto out; | |
109 | } | |
110 | ||
111 | rec = &el->l_recs[index]; | |
112 | ||
113 | BUG_ON(ext_flags != rec->e_flags); | |
114 | /* | |
115 | * after moving/defraging to new location, the extent is not going | |
116 | * to be refcounted anymore. | |
117 | */ | |
118 | replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED; | |
119 | ||
120 | ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), | |
121 | context->et.et_root_bh, | |
122 | OCFS2_JOURNAL_ACCESS_WRITE); | |
123 | if (ret) { | |
124 | mlog_errno(ret); | |
125 | goto out; | |
126 | } | |
127 | ||
128 | ret = ocfs2_split_extent(handle, &context->et, path, index, | |
129 | &replace_rec, context->meta_ac, | |
130 | &context->dealloc); | |
131 | if (ret) { | |
132 | mlog_errno(ret); | |
133 | goto out; | |
134 | } | |
135 | ||
136 | ocfs2_journal_dirty(handle, context->et.et_root_bh); | |
137 | ||
138 | context->new_phys_cpos = new_p_cpos; | |
139 | ||
140 | /* | |
141 | * need I to append truncate log for old clusters? | |
142 | */ | |
143 | if (old_blkno) { | |
144 | if (ext_flags & OCFS2_EXT_REFCOUNTED) | |
145 | ret = ocfs2_decrease_refcount(inode, handle, | |
146 | ocfs2_blocks_to_clusters(osb->sb, | |
147 | old_blkno), | |
148 | len, context->meta_ac, | |
149 | &context->dealloc, 1); | |
150 | else | |
151 | ret = ocfs2_truncate_log_append(osb, handle, | |
152 | old_blkno, len); | |
153 | } | |
154 | ||
155 | out: | |
156 | return ret; | |
157 | } | |
158 | ||
de474ee8 TY |
159 | /* |
160 | * lock allocators, and reserving appropriate number of bits for | |
161 | * meta blocks and data clusters. | |
162 | * | |
163 | * in some cases, we don't need to reserve clusters, just let data_ac | |
164 | * be NULL. | |
165 | */ | |
166 | static int ocfs2_lock_allocators_move_extents(struct inode *inode, | |
167 | struct ocfs2_extent_tree *et, | |
168 | u32 clusters_to_move, | |
169 | u32 extents_to_split, | |
170 | struct ocfs2_alloc_context **meta_ac, | |
171 | struct ocfs2_alloc_context **data_ac, | |
172 | int extra_blocks, | |
173 | int *credits) | |
174 | { | |
175 | int ret, num_free_extents; | |
176 | unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move; | |
177 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | |
178 | ||
179 | num_free_extents = ocfs2_num_free_extents(osb, et); | |
180 | if (num_free_extents < 0) { | |
181 | ret = num_free_extents; | |
182 | mlog_errno(ret); | |
183 | goto out; | |
184 | } | |
185 | ||
186 | if (!num_free_extents || | |
187 | (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) | |
188 | extra_blocks += ocfs2_extend_meta_needed(et->et_root_el); | |
189 | ||
190 | ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac); | |
191 | if (ret) { | |
192 | mlog_errno(ret); | |
193 | goto out; | |
194 | } | |
195 | ||
196 | if (data_ac) { | |
197 | ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac); | |
198 | if (ret) { | |
199 | mlog_errno(ret); | |
200 | goto out; | |
201 | } | |
202 | } | |
203 | ||
204 | *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el, | |
205 | clusters_to_move + 2); | |
206 | ||
207 | mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n", | |
208 | extra_blocks, clusters_to_move, *credits); | |
209 | out: | |
210 | if (ret) { | |
211 | if (*meta_ac) { | |
212 | ocfs2_free_alloc_context(*meta_ac); | |
213 | *meta_ac = NULL; | |
214 | } | |
215 | } | |
216 | ||
217 | return ret; | |
218 | } | |
202ee5fa TY |
219 | |
220 | /* | |
221 | * Using one journal handle to guarantee the data consistency in case | |
222 | * crash happens anywhere. | |
dda54e76 TY |
223 | * |
224 | * XXX: defrag can end up with finishing partial extent as requested, | |
225 | * due to not enough contiguous clusters can be found in allocator. | |
202ee5fa TY |
226 | */ |
227 | static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, | |
4dfa66bd | 228 | u32 cpos, u32 phys_cpos, u32 *len, int ext_flags) |
202ee5fa | 229 | { |
4dfa66bd | 230 | int ret, credits = 0, extra_blocks = 0, partial = context->partial; |
202ee5fa TY |
231 | handle_t *handle; |
232 | struct inode *inode = context->inode; | |
233 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | |
234 | struct inode *tl_inode = osb->osb_tl_inode; | |
235 | struct ocfs2_refcount_tree *ref_tree = NULL; | |
236 | u32 new_phys_cpos, new_len; | |
237 | u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); | |
238 | ||
4dfa66bd | 239 | if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) { |
202ee5fa TY |
240 | |
241 | BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & | |
242 | OCFS2_HAS_REFCOUNT_FL)); | |
243 | ||
244 | BUG_ON(!context->refcount_loc); | |
245 | ||
246 | ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, | |
247 | &ref_tree, NULL); | |
248 | if (ret) { | |
249 | mlog_errno(ret); | |
250 | return ret; | |
251 | } | |
252 | ||
253 | ret = ocfs2_prepare_refcount_change_for_del(inode, | |
254 | context->refcount_loc, | |
255 | phys_blkno, | |
4dfa66bd | 256 | *len, |
202ee5fa TY |
257 | &credits, |
258 | &extra_blocks); | |
259 | if (ret) { | |
260 | mlog_errno(ret); | |
261 | goto out; | |
262 | } | |
263 | } | |
264 | ||
4dfa66bd | 265 | ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1, |
202ee5fa TY |
266 | &context->meta_ac, |
267 | &context->data_ac, | |
268 | extra_blocks, &credits); | |
269 | if (ret) { | |
270 | mlog_errno(ret); | |
271 | goto out; | |
272 | } | |
273 | ||
274 | /* | |
275 | * should be using allocation reservation strategy there? | |
276 | * | |
277 | * if (context->data_ac) | |
278 | * context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; | |
279 | */ | |
280 | ||
281 | mutex_lock(&tl_inode->i_mutex); | |
282 | ||
283 | if (ocfs2_truncate_log_needs_flush(osb)) { | |
284 | ret = __ocfs2_flush_truncate_log(osb); | |
285 | if (ret < 0) { | |
286 | mlog_errno(ret); | |
287 | goto out_unlock_mutex; | |
288 | } | |
289 | } | |
290 | ||
291 | handle = ocfs2_start_trans(osb, credits); | |
292 | if (IS_ERR(handle)) { | |
293 | ret = PTR_ERR(handle); | |
294 | mlog_errno(ret); | |
295 | goto out_unlock_mutex; | |
296 | } | |
297 | ||
4dfa66bd | 298 | ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len, |
202ee5fa TY |
299 | &new_phys_cpos, &new_len); |
300 | if (ret) { | |
301 | mlog_errno(ret); | |
302 | goto out_commit; | |
303 | } | |
304 | ||
305 | /* | |
4dfa66bd TY |
306 | * allowing partial extent moving is kind of 'pros and cons', it makes |
307 | * whole defragmentation less likely to fail, on the contrary, the bad | |
308 | * thing is it may make the fs even more fragmented after moving, let | |
309 | * userspace make a good decision here. | |
202ee5fa | 310 | */ |
4dfa66bd TY |
311 | if (new_len != *len) { |
312 | mlog(0, "len_claimed: %u, len: %u\n", new_len, *len); | |
313 | if (!partial) { | |
314 | context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE; | |
315 | ret = -ENOSPC; | |
316 | goto out_commit; | |
317 | } | |
202ee5fa TY |
318 | } |
319 | ||
320 | mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos, | |
321 | phys_cpos, new_phys_cpos); | |
322 | ||
4dfa66bd | 323 | ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos, |
202ee5fa TY |
324 | new_phys_cpos, ext_flags); |
325 | if (ret) | |
326 | mlog_errno(ret); | |
327 | ||
4dfa66bd TY |
328 | if (partial && (new_len != *len)) |
329 | *len = new_len; | |
330 | ||
202ee5fa TY |
331 | /* |
332 | * Here we should write the new page out first if we are | |
333 | * in write-back mode. | |
334 | */ | |
4dfa66bd | 335 | ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len); |
202ee5fa TY |
336 | if (ret) |
337 | mlog_errno(ret); | |
338 | ||
339 | out_commit: | |
340 | ocfs2_commit_trans(osb, handle); | |
341 | ||
342 | out_unlock_mutex: | |
343 | mutex_unlock(&tl_inode->i_mutex); | |
344 | ||
345 | if (context->data_ac) { | |
346 | ocfs2_free_alloc_context(context->data_ac); | |
347 | context->data_ac = NULL; | |
348 | } | |
349 | ||
350 | if (context->meta_ac) { | |
351 | ocfs2_free_alloc_context(context->meta_ac); | |
352 | context->meta_ac = NULL; | |
353 | } | |
354 | ||
355 | out: | |
356 | if (ref_tree) | |
357 | ocfs2_unlock_refcount_tree(osb, ref_tree, 1); | |
358 | ||
359 | return ret; | |
360 | } | |
1c06b912 TY |
361 | |
362 | /* | |
363 | * find the victim alloc group, where #blkno fits. | |
364 | */ | |
365 | static int ocfs2_find_victim_alloc_group(struct inode *inode, | |
366 | u64 vict_blkno, | |
367 | int type, int slot, | |
368 | int *vict_bit, | |
369 | struct buffer_head **ret_bh) | |
370 | { | |
371 | int ret, i, blocks_per_unit = 1; | |
372 | u64 blkno; | |
373 | char namebuf[40]; | |
374 | ||
375 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | |
376 | struct buffer_head *ac_bh = NULL, *gd_bh = NULL; | |
377 | struct ocfs2_chain_list *cl; | |
378 | struct ocfs2_chain_rec *rec; | |
379 | struct ocfs2_dinode *ac_dinode; | |
380 | struct ocfs2_group_desc *bg; | |
381 | ||
382 | ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot); | |
383 | ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, | |
384 | strlen(namebuf), &blkno); | |
385 | if (ret) { | |
386 | ret = -ENOENT; | |
387 | goto out; | |
388 | } | |
389 | ||
390 | ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh); | |
391 | if (ret) { | |
392 | mlog_errno(ret); | |
393 | goto out; | |
394 | } | |
395 | ||
396 | ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data; | |
397 | cl = &(ac_dinode->id2.i_chain); | |
398 | rec = &(cl->cl_recs[0]); | |
399 | ||
400 | if (type == GLOBAL_BITMAP_SYSTEM_INODE) | |
401 | blocks_per_unit <<= (osb->s_clustersize_bits - | |
402 | inode->i_sb->s_blocksize_bits); | |
403 | /* | |
404 | * 'vict_blkno' was out of the valid range. | |
405 | */ | |
406 | if ((vict_blkno < le64_to_cpu(rec->c_blkno)) || | |
407 | (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) * | |
408 | blocks_per_unit))) { | |
409 | ret = -EINVAL; | |
410 | goto out; | |
411 | } | |
412 | ||
413 | for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) { | |
414 | ||
415 | rec = &(cl->cl_recs[i]); | |
416 | if (!rec) | |
417 | continue; | |
418 | ||
419 | bg = NULL; | |
420 | ||
421 | do { | |
422 | if (!bg) | |
423 | blkno = le64_to_cpu(rec->c_blkno); | |
424 | else | |
425 | blkno = le64_to_cpu(bg->bg_next_group); | |
426 | ||
427 | if (gd_bh) { | |
428 | brelse(gd_bh); | |
429 | gd_bh = NULL; | |
430 | } | |
431 | ||
432 | ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh); | |
433 | if (ret) { | |
434 | mlog_errno(ret); | |
435 | goto out; | |
436 | } | |
437 | ||
438 | bg = (struct ocfs2_group_desc *)gd_bh->b_data; | |
439 | ||
440 | if (vict_blkno < (le64_to_cpu(bg->bg_blkno) + | |
441 | le16_to_cpu(bg->bg_bits))) { | |
442 | ||
443 | *ret_bh = gd_bh; | |
444 | *vict_bit = (vict_blkno - blkno) / | |
445 | blocks_per_unit; | |
446 | mlog(0, "find the victim group: #%llu, " | |
447 | "total_bits: %u, vict_bit: %u\n", | |
448 | blkno, le16_to_cpu(bg->bg_bits), | |
449 | *vict_bit); | |
450 | goto out; | |
451 | } | |
452 | ||
453 | } while (le64_to_cpu(bg->bg_next_group)); | |
454 | } | |
455 | ||
456 | ret = -EINVAL; | |
457 | out: | |
458 | brelse(ac_bh); | |
459 | ||
460 | /* | |
461 | * caller has to release the gd_bh properly. | |
462 | */ | |
463 | return ret; | |
464 | } | |
99e4c750 TY |
465 | |
466 | /* | |
467 | * XXX: helper to validate and adjust moving goal. | |
468 | */ | |
469 | static int ocfs2_validate_and_adjust_move_goal(struct inode *inode, | |
470 | struct ocfs2_move_extents *range) | |
471 | { | |
472 | int ret, goal_bit = 0; | |
473 | ||
474 | struct buffer_head *gd_bh = NULL; | |
475 | struct ocfs2_group_desc *bg; | |
476 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | |
477 | int c_to_b = 1 << (osb->s_clustersize_bits - | |
478 | inode->i_sb->s_blocksize_bits); | |
479 | ||
480 | /* | |
481 | * validate goal sits within global_bitmap, and return the victim | |
482 | * group desc | |
483 | */ | |
484 | ret = ocfs2_find_victim_alloc_group(inode, range->me_goal, | |
485 | GLOBAL_BITMAP_SYSTEM_INODE, | |
486 | OCFS2_INVALID_SLOT, | |
487 | &goal_bit, &gd_bh); | |
488 | if (ret) | |
489 | goto out; | |
490 | ||
491 | bg = (struct ocfs2_group_desc *)gd_bh->b_data; | |
492 | ||
493 | /* | |
494 | * make goal become cluster aligned. | |
495 | */ | |
496 | if (range->me_goal % c_to_b) | |
497 | range->me_goal = range->me_goal / c_to_b * c_to_b; | |
498 | ||
499 | /* | |
500 | * moving goal is not allowd to start with a group desc blok(#0 blk) | |
501 | * let's compromise to the latter cluster. | |
502 | */ | |
503 | if (range->me_goal == le64_to_cpu(bg->bg_blkno)) | |
504 | range->me_goal += c_to_b; | |
505 | ||
506 | /* | |
507 | * movement is not gonna cross two groups. | |
508 | */ | |
509 | if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize < | |
510 | range->me_len) { | |
511 | ret = -EINVAL; | |
512 | goto out; | |
513 | } | |
514 | /* | |
515 | * more exact validations/adjustments will be performed later during | |
516 | * moving operation for each extent range. | |
517 | */ | |
518 | mlog(0, "extents get ready to be moved to #%llu block\n", | |
519 | range->me_goal); | |
520 | ||
521 | out: | |
522 | brelse(gd_bh); | |
523 | ||
524 | return ret; | |
525 | } | |
e6b5859c TY |
526 | |
527 | static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh, | |
528 | int *goal_bit, u32 move_len, u32 max_hop, | |
529 | u32 *phys_cpos) | |
530 | { | |
531 | int i, used, last_free_bits = 0, base_bit = *goal_bit; | |
532 | struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; | |
533 | u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb, | |
534 | le64_to_cpu(gd->bg_blkno)); | |
535 | ||
536 | for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) { | |
537 | ||
538 | used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap); | |
539 | if (used) { | |
540 | /* | |
541 | * we even tried searching the free chunk by jumping | |
542 | * a 'max_hop' distance, but still failed. | |
543 | */ | |
544 | if ((i - base_bit) > max_hop) { | |
545 | *phys_cpos = 0; | |
546 | break; | |
547 | } | |
548 | ||
549 | if (last_free_bits) | |
550 | last_free_bits = 0; | |
551 | ||
552 | continue; | |
553 | } else | |
554 | last_free_bits++; | |
555 | ||
556 | if (last_free_bits == move_len) { | |
557 | *goal_bit = i; | |
558 | *phys_cpos = base_cpos + i; | |
559 | break; | |
560 | } | |
561 | } | |
562 | ||
563 | mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos); | |
564 | } | |
8473aa8a TY |
565 | |
566 | static int ocfs2_alloc_dinode_update_counts(struct inode *inode, | |
567 | handle_t *handle, | |
568 | struct buffer_head *di_bh, | |
569 | u32 num_bits, | |
570 | u16 chain) | |
571 | { | |
572 | int ret; | |
573 | u32 tmp_used; | |
574 | struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; | |
575 | struct ocfs2_chain_list *cl = | |
576 | (struct ocfs2_chain_list *) &di->id2.i_chain; | |
577 | ||
578 | ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, | |
579 | OCFS2_JOURNAL_ACCESS_WRITE); | |
580 | if (ret < 0) { | |
581 | mlog_errno(ret); | |
582 | goto out; | |
583 | } | |
584 | ||
585 | tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); | |
586 | di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); | |
587 | le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); | |
588 | ocfs2_journal_dirty(handle, di_bh); | |
589 | ||
590 | out: | |
591 | return ret; | |
592 | } | |
593 | ||
594 | static inline int ocfs2_block_group_set_bits(handle_t *handle, | |
595 | struct inode *alloc_inode, | |
596 | struct ocfs2_group_desc *bg, | |
597 | struct buffer_head *group_bh, | |
598 | unsigned int bit_off, | |
599 | unsigned int num_bits) | |
600 | { | |
601 | int status; | |
602 | void *bitmap = bg->bg_bitmap; | |
603 | int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; | |
604 | ||
605 | /* All callers get the descriptor via | |
606 | * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ | |
607 | BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); | |
608 | BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); | |
609 | ||
610 | mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off, | |
611 | num_bits); | |
612 | ||
613 | if (ocfs2_is_cluster_bitmap(alloc_inode)) | |
614 | journal_type = OCFS2_JOURNAL_ACCESS_UNDO; | |
615 | ||
616 | status = ocfs2_journal_access_gd(handle, | |
617 | INODE_CACHE(alloc_inode), | |
618 | group_bh, | |
619 | journal_type); | |
620 | if (status < 0) { | |
621 | mlog_errno(status); | |
622 | goto bail; | |
623 | } | |
624 | ||
625 | le16_add_cpu(&bg->bg_free_bits_count, -num_bits); | |
626 | if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { | |
627 | ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" | |
628 | " count %u but claims %u are freed. num_bits %d", | |
629 | (unsigned long long)le64_to_cpu(bg->bg_blkno), | |
630 | le16_to_cpu(bg->bg_bits), | |
631 | le16_to_cpu(bg->bg_free_bits_count), num_bits); | |
632 | return -EROFS; | |
633 | } | |
634 | while (num_bits--) | |
635 | ocfs2_set_bit(bit_off++, bitmap); | |
636 | ||
637 | ocfs2_journal_dirty(handle, group_bh); | |
638 | ||
639 | bail: | |
640 | return status; | |
641 | } | |
e0847717 TY |
642 | |
643 | static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, | |
644 | u32 cpos, u32 phys_cpos, u32 *new_phys_cpos, | |
645 | u32 len, int ext_flags) | |
646 | { | |
647 | int ret, credits = 0, extra_blocks = 0, goal_bit = 0; | |
648 | handle_t *handle; | |
649 | struct inode *inode = context->inode; | |
650 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | |
651 | struct inode *tl_inode = osb->osb_tl_inode; | |
652 | struct inode *gb_inode = NULL; | |
653 | struct buffer_head *gb_bh = NULL; | |
654 | struct buffer_head *gd_bh = NULL; | |
655 | struct ocfs2_group_desc *gd; | |
656 | struct ocfs2_refcount_tree *ref_tree = NULL; | |
657 | u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb, | |
658 | context->range->me_threshold); | |
659 | u64 phys_blkno, new_phys_blkno; | |
660 | ||
661 | phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); | |
662 | ||
663 | if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) { | |
664 | ||
665 | BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & | |
666 | OCFS2_HAS_REFCOUNT_FL)); | |
667 | ||
668 | BUG_ON(!context->refcount_loc); | |
669 | ||
670 | ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, | |
671 | &ref_tree, NULL); | |
672 | if (ret) { | |
673 | mlog_errno(ret); | |
674 | return ret; | |
675 | } | |
676 | ||
677 | ret = ocfs2_prepare_refcount_change_for_del(inode, | |
678 | context->refcount_loc, | |
679 | phys_blkno, | |
680 | len, | |
681 | &credits, | |
682 | &extra_blocks); | |
683 | if (ret) { | |
684 | mlog_errno(ret); | |
685 | goto out; | |
686 | } | |
687 | } | |
688 | ||
689 | ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1, | |
690 | &context->meta_ac, | |
691 | NULL, extra_blocks, &credits); | |
692 | if (ret) { | |
693 | mlog_errno(ret); | |
694 | goto out; | |
695 | } | |
696 | ||
697 | /* | |
698 | * need to count 2 extra credits for global_bitmap inode and | |
699 | * group descriptor. | |
700 | */ | |
701 | credits += OCFS2_INODE_UPDATE_CREDITS + 1; | |
702 | ||
703 | /* | |
704 | * ocfs2_move_extent() didn't reserve any clusters in lock_allocators() | |
705 | * logic, while we still need to lock the global_bitmap. | |
706 | */ | |
707 | gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, | |
708 | OCFS2_INVALID_SLOT); | |
709 | if (!gb_inode) { | |
710 | mlog(ML_ERROR, "unable to get global_bitmap inode\n"); | |
711 | ret = -EIO; | |
712 | goto out; | |
713 | } | |
714 | ||
715 | mutex_lock(&gb_inode->i_mutex); | |
716 | ||
717 | ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1); | |
718 | if (ret) { | |
719 | mlog_errno(ret); | |
720 | goto out_unlock_gb_mutex; | |
721 | } | |
722 | ||
723 | mutex_lock(&tl_inode->i_mutex); | |
724 | ||
725 | handle = ocfs2_start_trans(osb, credits); | |
726 | if (IS_ERR(handle)) { | |
727 | ret = PTR_ERR(handle); | |
728 | mlog_errno(ret); | |
729 | goto out_unlock_tl_inode; | |
730 | } | |
731 | ||
732 | new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos); | |
733 | ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno, | |
734 | GLOBAL_BITMAP_SYSTEM_INODE, | |
735 | OCFS2_INVALID_SLOT, | |
736 | &goal_bit, &gd_bh); | |
737 | if (ret) { | |
738 | mlog_errno(ret); | |
739 | goto out_commit; | |
740 | } | |
741 | ||
742 | /* | |
743 | * probe the victim cluster group to find a proper | |
744 | * region to fit wanted movement, it even will perfrom | |
745 | * a best-effort attempt by compromising to a threshold | |
746 | * around the goal. | |
747 | */ | |
748 | ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop, | |
749 | new_phys_cpos); | |
750 | if (!new_phys_cpos) { | |
751 | ret = -ENOSPC; | |
752 | goto out_commit; | |
753 | } | |
754 | ||
755 | ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos, | |
756 | *new_phys_cpos, ext_flags); | |
757 | if (ret) { | |
758 | mlog_errno(ret); | |
759 | goto out_commit; | |
760 | } | |
761 | ||
762 | gd = (struct ocfs2_group_desc *)gd_bh->b_data; | |
763 | ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len, | |
764 | le16_to_cpu(gd->bg_chain)); | |
765 | if (ret) { | |
766 | mlog_errno(ret); | |
767 | goto out_commit; | |
768 | } | |
769 | ||
770 | ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh, | |
771 | goal_bit, len); | |
772 | if (ret) | |
773 | mlog_errno(ret); | |
774 | ||
775 | /* | |
776 | * Here we should write the new page out first if we are | |
777 | * in write-back mode. | |
778 | */ | |
779 | ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len); | |
780 | if (ret) | |
781 | mlog_errno(ret); | |
782 | ||
783 | out_commit: | |
784 | ocfs2_commit_trans(osb, handle); | |
785 | brelse(gd_bh); | |
786 | ||
787 | out_unlock_tl_inode: | |
788 | mutex_unlock(&tl_inode->i_mutex); | |
789 | ||
790 | ocfs2_inode_unlock(gb_inode, 1); | |
791 | out_unlock_gb_mutex: | |
792 | mutex_unlock(&gb_inode->i_mutex); | |
793 | brelse(gb_bh); | |
794 | iput(gb_inode); | |
795 | ||
796 | out: | |
797 | if (context->meta_ac) { | |
798 | ocfs2_free_alloc_context(context->meta_ac); | |
799 | context->meta_ac = NULL; | |
800 | } | |
801 | ||
802 | if (ref_tree) | |
803 | ocfs2_unlock_refcount_tree(osb, ref_tree, 1); | |
804 | ||
805 | return ret; | |
806 | } | |
ee16cc03 TY |
807 | |
/*
 * Decide how much of the current extent takes part in the running
 * defragmentation cycle.
 *
 *  - extent plus what was gathered so far stays below 'threshold':
 *    take the whole extent and keep accumulating in '*len_defraged'.
 *  - nothing gathered yet and the extent alone reaches the threshold:
 *    set '*skip', the extent is large enough already.
 *  - otherwise: trim '*alloc_size' so the cycle ends exactly at
 *    'threshold' and reset '*len_defraged' to start a new cycle.
 */
static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged,
					 u32 threshold, int *skip)
{
	u32 gathered = *len_defraged;
	u32 total = *alloc_size + gathered;

	if (total < threshold) {
		*len_defraged = total;
		return;
	}

	if (!gathered) {
		*skip = 1;
		return;
	}

	*alloc_size = threshold - gathered;
	*len_defraged = 0;
}
53069d4e TY |
837 | |
838 | static int __ocfs2_move_extents_range(struct buffer_head *di_bh, | |
839 | struct ocfs2_move_extents_context *context) | |
840 | { | |
841 | int ret = 0, flags, do_defrag, skip = 0; | |
842 | u32 cpos, phys_cpos, move_start, len_to_move, alloc_size; | |
843 | u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0; | |
844 | ||
845 | struct inode *inode = context->inode; | |
846 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; | |
847 | struct ocfs2_move_extents *range = context->range; | |
848 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | |
849 | ||
850 | if ((inode->i_size == 0) || (range->me_len == 0)) | |
851 | return 0; | |
852 | ||
853 | if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) | |
854 | return 0; | |
855 | ||
856 | context->refcount_loc = le64_to_cpu(di->i_refcount_loc); | |
857 | ||
858 | ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh); | |
859 | ocfs2_init_dealloc_ctxt(&context->dealloc); | |
860 | ||
861 | /* | |
862 | * TO-DO XXX: | |
863 | * | |
864 | * - xattr extents. | |
865 | */ | |
866 | ||
867 | do_defrag = context->auto_defrag; | |
868 | ||
869 | /* | |
870 | * extents moving happens in unit of clusters, for the sake | |
871 | * of simplicity, we may ignore two clusters where 'byte_start' | |
872 | * and 'byte_start + len' were within. | |
873 | */ | |
874 | move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start); | |
875 | len_to_move = (range->me_start + range->me_len) >> | |
876 | osb->s_clustersize_bits; | |
877 | if (len_to_move >= move_start) | |
878 | len_to_move -= move_start; | |
879 | else | |
880 | len_to_move = 0; | |
881 | ||
dda54e76 | 882 | if (do_defrag) { |
53069d4e | 883 | defrag_thresh = range->me_threshold >> osb->s_clustersize_bits; |
dda54e76 TY |
884 | if (defrag_thresh <= 1) |
885 | goto done; | |
886 | } else | |
53069d4e TY |
887 | new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, |
888 | range->me_goal); | |
889 | ||
890 | mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, " | |
891 | "thresh: %u\n", | |
892 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | |
893 | (unsigned long long)range->me_start, | |
894 | (unsigned long long)range->me_len, | |
895 | move_start, len_to_move, defrag_thresh); | |
896 | ||
897 | cpos = move_start; | |
898 | while (len_to_move) { | |
899 | ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size, | |
900 | &flags); | |
901 | if (ret) { | |
902 | mlog_errno(ret); | |
903 | goto out; | |
904 | } | |
905 | ||
906 | if (alloc_size > len_to_move) | |
907 | alloc_size = len_to_move; | |
908 | ||
909 | /* | |
910 | * XXX: how to deal with a hole: | |
911 | * | |
912 | * - skip the hole of course | |
913 | * - force a new defragmentation | |
914 | */ | |
915 | if (!phys_cpos) { | |
916 | if (do_defrag) | |
917 | len_defraged = 0; | |
918 | ||
919 | goto next; | |
920 | } | |
921 | ||
922 | if (do_defrag) { | |
923 | ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged, | |
924 | defrag_thresh, &skip); | |
925 | /* | |
926 | * skip large extents | |
927 | */ | |
928 | if (skip) { | |
929 | skip = 0; | |
930 | goto next; | |
931 | } | |
932 | ||
933 | mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, " | |
934 | "alloc_size: %u, len_defraged: %u\n", | |
935 | cpos, phys_cpos, alloc_size, len_defraged); | |
936 | ||
937 | ret = ocfs2_defrag_extent(context, cpos, phys_cpos, | |
4dfa66bd | 938 | &alloc_size, flags); |
53069d4e TY |
939 | } else { |
940 | ret = ocfs2_move_extent(context, cpos, phys_cpos, | |
941 | &new_phys_cpos, alloc_size, | |
942 | flags); | |
943 | ||
944 | new_phys_cpos += alloc_size; | |
945 | } | |
946 | ||
947 | if (ret < 0) { | |
948 | mlog_errno(ret); | |
949 | goto out; | |
950 | } | |
951 | ||
952 | context->clusters_moved += alloc_size; | |
953 | next: | |
954 | cpos += alloc_size; | |
955 | len_to_move -= alloc_size; | |
956 | } | |
957 | ||
dda54e76 | 958 | done: |
53069d4e TY |
959 | range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE; |
960 | ||
961 | out: | |
962 | range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb, | |
963 | context->clusters_moved); | |
964 | range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb, | |
965 | context->new_phys_cpos); | |
966 | ||
967 | ocfs2_schedule_truncate_log_flush(osb, 1); | |
968 | ocfs2_run_deallocs(osb, &context->dealloc); | |
969 | ||
970 | return ret; | |
971 | } | |
972 | ||
973 | static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) | |
974 | { | |
975 | int status; | |
976 | handle_t *handle; | |
977 | struct inode *inode = context->inode; | |
978 | struct ocfs2_dinode *di; | |
979 | struct buffer_head *di_bh = NULL; | |
980 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | |
981 | ||
982 | if (!inode) | |
983 | return -ENOENT; | |
984 | ||
985 | if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) | |
986 | return -EROFS; | |
987 | ||
988 | mutex_lock(&inode->i_mutex); | |
989 | ||
990 | /* | |
991 | * This prevents concurrent writes from other nodes | |
992 | */ | |
993 | status = ocfs2_rw_lock(inode, 1); | |
994 | if (status) { | |
995 | mlog_errno(status); | |
996 | goto out; | |
997 | } | |
998 | ||
999 | status = ocfs2_inode_lock(inode, &di_bh, 1); | |
1000 | if (status) { | |
1001 | mlog_errno(status); | |
1002 | goto out_rw_unlock; | |
1003 | } | |
1004 | ||
1005 | /* | |
1006 | * rememer ip_xattr_sem also needs to be held if necessary | |
1007 | */ | |
1008 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | |
1009 | ||
1010 | status = __ocfs2_move_extents_range(di_bh, context); | |
1011 | ||
1012 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | |
1013 | if (status) { | |
1014 | mlog_errno(status); | |
1015 | goto out_inode_unlock; | |
1016 | } | |
1017 | ||
1018 | /* | |
1019 | * We update ctime for these changes | |
1020 | */ | |
1021 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); | |
1022 | if (IS_ERR(handle)) { | |
1023 | status = PTR_ERR(handle); | |
1024 | mlog_errno(status); | |
1025 | goto out_inode_unlock; | |
1026 | } | |
1027 | ||
1028 | status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, | |
1029 | OCFS2_JOURNAL_ACCESS_WRITE); | |
1030 | if (status) { | |
1031 | mlog_errno(status); | |
1032 | goto out_commit; | |
1033 | } | |
1034 | ||
1035 | di = (struct ocfs2_dinode *)di_bh->b_data; | |
1036 | inode->i_ctime = CURRENT_TIME; | |
1037 | di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); | |
1038 | di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); | |
1039 | ||
1040 | ocfs2_journal_dirty(handle, di_bh); | |
1041 | ||
1042 | out_commit: | |
1043 | ocfs2_commit_trans(osb, handle); | |
1044 | ||
1045 | out_inode_unlock: | |
1046 | brelse(di_bh); | |
1047 | ocfs2_inode_unlock(inode, 1); | |
1048 | out_rw_unlock: | |
1049 | ocfs2_rw_unlock(inode, 1); | |
1050 | out: | |
1051 | mutex_unlock(&inode->i_mutex); | |
1052 | ||
1053 | return status; | |
1054 | } | |
1055 | ||
1056 | int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp) | |
1057 | { | |
1058 | int status; | |
1059 | ||
1060 | struct inode *inode = filp->f_path.dentry->d_inode; | |
1061 | struct ocfs2_move_extents range; | |
1062 | struct ocfs2_move_extents_context *context = NULL; | |
1063 | ||
1064 | status = mnt_want_write(filp->f_path.mnt); | |
1065 | if (status) | |
1066 | return status; | |
1067 | ||
1068 | if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) | |
1069 | goto out; | |
1070 | ||
1071 | if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { | |
1072 | status = -EPERM; | |
1073 | goto out; | |
1074 | } | |
1075 | ||
1076 | context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS); | |
1077 | if (!context) { | |
1078 | status = -ENOMEM; | |
1079 | mlog_errno(status); | |
1080 | goto out; | |
1081 | } | |
1082 | ||
1083 | context->inode = inode; | |
1084 | context->file = filp; | |
1085 | ||
1086 | if (argp) { | |
1087 | if (copy_from_user(&range, (struct ocfs2_move_extents *)argp, | |
1088 | sizeof(range))) { | |
1089 | status = -EFAULT; | |
1090 | goto out; | |
1091 | } | |
1092 | } else { | |
1093 | status = -EINVAL; | |
1094 | goto out; | |
1095 | } | |
1096 | ||
1097 | if (range.me_start > i_size_read(inode)) | |
1098 | goto out; | |
1099 | ||
1100 | if (range.me_start + range.me_len > i_size_read(inode)) | |
1101 | range.me_len = i_size_read(inode) - range.me_start; | |
1102 | ||
1103 | context->range = ⦥ | |
1104 | ||
1105 | if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) { | |
1106 | context->auto_defrag = 1; | |
dda54e76 TY |
1107 | /* |
1108 | * ok, the default theshold for the defragmentation | |
1109 | * is 1M, since our maximum clustersize was 1M also. | |
1110 | * any thought? | |
1111 | */ | |
53069d4e | 1112 | if (!range.me_threshold) |
53069d4e | 1113 | range.me_threshold = 1024 * 1024; |
dda54e76 TY |
1114 | |
1115 | if (range.me_threshold > i_size_read(inode)) | |
1116 | range.me_threshold = i_size_read(inode); | |
1117 | ||
4dfa66bd TY |
1118 | if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG) |
1119 | context->partial = 1; | |
53069d4e TY |
1120 | } else { |
1121 | /* | |
1122 | * first best-effort attempt to validate and adjust the goal | |
1123 | * (physical address in block), while it can't guarantee later | |
1124 | * operation can succeed all the time since global_bitmap may | |
1125 | * change a bit over time. | |
1126 | */ | |
1127 | ||
1128 | status = ocfs2_validate_and_adjust_move_goal(inode, &range); | |
1129 | if (status) | |
1130 | goto out; | |
1131 | } | |
1132 | ||
1133 | status = ocfs2_move_extents(context); | |
1134 | if (status) | |
1135 | mlog_errno(status); | |
1136 | out: | |
1137 | /* | |
1138 | * movement/defragmentation may end up being partially completed, | |
1139 | * that's the reason why we need to return userspace the finished | |
1140 | * length and new_offset even if failure happens somewhere. | |
1141 | */ | |
1142 | if (argp) { | |
1143 | if (copy_to_user((struct ocfs2_move_extents *)argp, &range, | |
1144 | sizeof(range))) | |
1145 | status = -EFAULT; | |
1146 | } | |
1147 | ||
1148 | kfree(context); | |
1149 | ||
1150 | mnt_drop_write(filp->f_path.mnt); | |
1151 | ||
1152 | return status; | |
1153 | } |