2 * Copyright (C) 2007 Oracle. All rights reserved.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
19 #include <linux/kernel.h>
20 #include <linux/bio.h>
21 #include <linux/buffer_head.h>
22 #include <linux/file.h>
24 #include <linux/pagemap.h>
25 #include <linux/highmem.h>
26 #include <linux/time.h>
27 #include <linux/init.h>
28 #include <linux/string.h>
29 #include <linux/backing-dev.h>
30 #include <linux/mpage.h>
31 #include <linux/swap.h>
32 #include <linux/writeback.h>
33 #include <linux/statfs.h>
34 #include <linux/compat.h>
35 #include <linux/aio.h>
36 #include <linux/bit_spinlock.h>
37 #include <linux/xattr.h>
38 #include <linux/posix_acl.h>
39 #include <linux/falloc.h>
40 #include <linux/slab.h>
41 #include <linux/ratelimit.h>
42 #include <linux/mount.h>
43 #include <linux/btrfs.h>
44 #include <linux/blkdev.h>
45 #include <linux/posix_acl_xattr.h>
49 #include "transaction.h"
50 #include "btrfs_inode.h"
51 #include "print-tree.h"
52 #include "ordered-data.h"
56 #include "compression.h"
58 #include "free-space-cache.h"
59 #include "inode-map.h"
/*
 * Lookup arguments threaded through btrfs_iget.
 * NOTE(review): the extraction dropped some original lines here (the
 * inode-number field and the closing brace are not visible) -- confirm
 * against the upstream file before editing.
 */
63 struct btrfs_iget_args
{
65 struct btrfs_root
*root
;
/*
 * Forward declarations of the inode/address-space/file operation tables
 * and the extent_io callbacks; the actual tables are defined later in
 * this file.
 */
68 static const struct inode_operations btrfs_dir_inode_operations
;
69 static const struct inode_operations btrfs_symlink_inode_operations
;
70 static const struct inode_operations btrfs_dir_ro_inode_operations
;
71 static const struct inode_operations btrfs_special_inode_operations
;
72 static const struct inode_operations btrfs_file_inode_operations
;
73 static const struct address_space_operations btrfs_aops
;
74 static const struct address_space_operations btrfs_symlink_aops
;
75 static const struct file_operations btrfs_dir_file_operations
;
76 static struct extent_io_ops btrfs_extent_io_ops
;
/*
 * Slab caches. The first two are private to this file (static); the
 * non-static ones are shared with other btrfs compilation units.
 */
78 static struct kmem_cache
*btrfs_inode_cachep
;
79 static struct kmem_cache
*btrfs_delalloc_work_cachep
;
80 struct kmem_cache
*btrfs_trans_handle_cachep
;
81 struct kmem_cache
*btrfs_transaction_cachep
;
82 struct kmem_cache
*btrfs_path_cachep
;
83 struct kmem_cache
*btrfs_free_space_cachep
;
/*
 * Maps the S_IFMT bits of an inode mode (shifted down by S_SHIFT) to the
 * BTRFS_FT_* directory-entry type codes stored on disk.
 * NOTE(review): the table's closing line is not visible in this
 * extraction -- confirm against the upstream file.
 */
86 static unsigned char btrfs_type_by_mode
[S_IFMT
>> S_SHIFT
] = {
87 [S_IFREG
>> S_SHIFT
] = BTRFS_FT_REG_FILE
,
88 [S_IFDIR
>> S_SHIFT
] = BTRFS_FT_DIR
,
89 [S_IFCHR
>> S_SHIFT
] = BTRFS_FT_CHRDEV
,
90 [S_IFBLK
>> S_SHIFT
] = BTRFS_FT_BLKDEV
,
91 [S_IFIFO
>> S_SHIFT
] = BTRFS_FT_FIFO
,
92 [S_IFSOCK
>> S_SHIFT
] = BTRFS_FT_SOCK
,
93 [S_IFLNK
>> S_SHIFT
] = BTRFS_FT_SYMLINK
,
/*
 * Forward declarations for the file-local helpers defined below
 * (truncate/setsize, ordered-extent completion, the COW write path and
 * pinned extent-map creation, and dirty-inode handling).
 */
96 static int btrfs_setsize(struct inode
*inode
, struct iattr
*attr
);
97 static int btrfs_truncate(struct inode
*inode
);
98 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent
*ordered_extent
);
99 static noinline
int cow_file_range(struct inode
*inode
,
100 struct page
*locked_page
,
101 u64 start
, u64 end
, int *page_started
,
102 unsigned long *nr_written
, int unlock
);
103 static struct extent_map
*create_pinned_em(struct inode
*inode
, u64 start
,
104 u64 len
, u64 orig_start
,
105 u64 block_start
, u64 block_len
,
106 u64 orig_block_len
, u64 ram_bytes
,
109 static int btrfs_dirty_inode(struct inode
*inode
);
/*
 * Initialize security metadata for a newly created inode: first the
 * POSIX ACLs inherited from @dir, then the security xattrs (per @qstr).
 * NOTE(review): the intervening error checks and return statements were
 * dropped by the extraction; presumably each call's err is checked
 * before proceeding -- confirm against the upstream file.
 */
111 static int btrfs_init_inode_security(struct btrfs_trans_handle
*trans
,
112 struct inode
*inode
, struct inode
*dir
,
113 const struct qstr
*qstr
)
117 err
= btrfs_init_acl(trans
, inode
, dir
);
119 err
= btrfs_xattr_security_init(trans
, inode
, dir
, qstr
);
/*
 * NOTE(review): this body is internally incomplete in the extraction
 * (allocation failure checks, loop bookkeeping and several closing
 * braces are on dropped lines). The visible flow: allocate a btree
 * path, build an EXTENT_DATA key for this inode, insert an empty item,
 * then fill in a BTRFS_FILE_EXTENT_INLINE item. For compressed data
 * the compressed pages are copied into the leaf and the compression
 * field set; otherwise the single source page is mapped and the raw
 * bytes copied with compression set to 0. Finally disk_i_size is
 * updated before the caller unlocks pages, to avoid racing with unlink.
 */
124 * this does all the hard work for inserting an inline extent into
125 * the btree. The caller should have done a btrfs_drop_extents so that
126 * no overlapping inline items exist in the btree
128 static noinline
int insert_inline_extent(struct btrfs_trans_handle
*trans
,
129 struct btrfs_root
*root
, struct inode
*inode
,
130 u64 start
, size_t size
, size_t compressed_size
,
132 struct page
**compressed_pages
)
134 struct btrfs_key key
;
135 struct btrfs_path
*path
;
136 struct extent_buffer
*leaf
;
137 struct page
*page
= NULL
;
140 struct btrfs_file_extent_item
*ei
;
143 size_t cur_size
= size
;
145 unsigned long offset
;
147 if (compressed_size
&& compressed_pages
)
148 cur_size
= compressed_size
;
150 path
= btrfs_alloc_path();
154 path
->leave_spinning
= 1;
156 key
.objectid
= btrfs_ino(inode
);
158 btrfs_set_key_type(&key
, BTRFS_EXTENT_DATA_KEY
);
159 datasize
= btrfs_file_extent_calc_inline_size(cur_size
);
161 inode_add_bytes(inode
, size
);
162 ret
= btrfs_insert_empty_item(trans
, root
, path
, &key
,
168 leaf
= path
->nodes
[0];
169 ei
= btrfs_item_ptr(leaf
, path
->slots
[0],
170 struct btrfs_file_extent_item
);
171 btrfs_set_file_extent_generation(leaf
, ei
, trans
->transid
);
172 btrfs_set_file_extent_type(leaf
, ei
, BTRFS_FILE_EXTENT_INLINE
);
173 btrfs_set_file_extent_encryption(leaf
, ei
, 0);
174 btrfs_set_file_extent_other_encoding(leaf
, ei
, 0);
175 btrfs_set_file_extent_ram_bytes(leaf
, ei
, size
);
176 ptr
= btrfs_file_extent_inline_start(ei
);
178 if (compress_type
!= BTRFS_COMPRESS_NONE
) {
181 while (compressed_size
> 0) {
182 cpage
= compressed_pages
[i
];
183 cur_size
= min_t(unsigned long, compressed_size
,
186 kaddr
= kmap_atomic(cpage
);
187 write_extent_buffer(leaf
, kaddr
, ptr
, cur_size
);
188 kunmap_atomic(kaddr
);
192 compressed_size
-= cur_size
;
194 btrfs_set_file_extent_compression(leaf
, ei
,
197 page
= find_get_page(inode
->i_mapping
,
198 start
>> PAGE_CACHE_SHIFT
);
199 btrfs_set_file_extent_compression(leaf
, ei
, 0);
200 kaddr
= kmap_atomic(page
);
201 offset
= start
& (PAGE_CACHE_SIZE
- 1);
202 write_extent_buffer(leaf
, kaddr
+ offset
, ptr
, size
);
203 kunmap_atomic(kaddr
);
204 page_cache_release(page
);
206 btrfs_mark_buffer_dirty(leaf
);
207 btrfs_free_path(path
);
210 * we're an inline extent, so nobody can
211 * extend the file past i_size without locking
212 * a page we already have locked.
214 * We must do any isize and inode updates
215 * before we unlock the pages. Otherwise we
216 * could end up racing with unlink.
218 BTRFS_I(inode
)->disk_i_size
= inode
->i_size
;
219 ret
= btrfs_update_inode(trans
, root
, inode
);
223 btrfs_free_path(path
);
/*
 * NOTE(review): internally incomplete in the extraction (the early
 * "too big to inline" return and several braces are on dropped lines).
 * Visible flow: compute the candidate inline length, reject ranges that
 * cannot be inlined (past the first page, larger than the per-root
 * inline limit or the max_inline mount option, or block-aligned ends),
 * drop overlapping extents, insert the inline item, and on -ENOSPC fall
 * back cleanly by flagging a full sync and releasing the delalloc
 * metadata reservation.
 */
229 * conditionally insert an inline extent into the file. This
230 * does the checks required to make sure the data is small enough
231 * to fit as an inline extent.
233 static noinline
int cow_file_range_inline(struct btrfs_trans_handle
*trans
,
234 struct btrfs_root
*root
,
235 struct inode
*inode
, u64 start
, u64 end
,
236 size_t compressed_size
, int compress_type
,
237 struct page
**compressed_pages
)
239 u64 isize
= i_size_read(inode
);
240 u64 actual_end
= min(end
+ 1, isize
);
241 u64 inline_len
= actual_end
- start
;
242 u64 aligned_end
= ALIGN(end
, root
->sectorsize
);
243 u64 data_len
= inline_len
;
247 data_len
= compressed_size
;
250 actual_end
>= PAGE_CACHE_SIZE
||
251 data_len
>= BTRFS_MAX_INLINE_DATA_SIZE(root
) ||
253 (actual_end
& (root
->sectorsize
- 1)) == 0) ||
255 data_len
> root
->fs_info
->max_inline
) {
259 ret
= btrfs_drop_extents(trans
, root
, inode
, start
, aligned_end
, 1);
263 if (isize
> actual_end
)
264 inline_len
= min_t(u64
, isize
, actual_end
);
265 ret
= insert_inline_extent(trans
, root
, inode
, start
,
266 inline_len
, compressed_size
,
267 compress_type
, compressed_pages
);
268 if (ret
&& ret
!= -ENOSPC
) {
269 btrfs_abort_transaction(trans
, root
, ret
);
271 } else if (ret
== -ENOSPC
) {
275 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC
, &BTRFS_I(inode
)->runtime_flags
);
276 btrfs_delalloc_release_metadata(inode
, end
+ 1 - start
);
277 btrfs_drop_extent_cache(inode
, start
, aligned_end
- 1, 0);
/*
 * Work items for the two-phase async compression path:
 * async_extent describes one (possibly compressed) range queued for
 * submission; async_cow is the per-range work unit that carries the
 * list of extents plus the btrfs_work hook.
 * NOTE(review): several fields (start/end, pages, inode, ...) are on
 * lines the extraction dropped -- see the upstream definitions.
 */
281 struct async_extent
{
286 unsigned long nr_pages
;
288 struct list_head list
;
293 struct btrfs_root
*root
;
294 struct page
*locked_page
;
297 struct list_head extents
;
298 struct btrfs_work work
;
/*
 * Allocate and fill an async_extent describing [start, start+ram_size)
 * (optionally with its compressed pages) and append it to the cow's
 * extent list for phase-two submission. Allocation failure is fatal
 * here (BUG_ON), matching the file's existing -ENOMEM policy.
 */
301 static noinline
int add_async_extent(struct async_cow
*cow
,
302 u64 start
, u64 ram_size
,
305 unsigned long nr_pages
,
308 struct async_extent
*async_extent
;
310 async_extent
= kmalloc(sizeof(*async_extent
), GFP_NOFS
);
311 BUG_ON(!async_extent
); /* -ENOMEM */
312 async_extent
->start
= start
;
313 async_extent
->ram_size
= ram_size
;
314 async_extent
->compressed_size
= compressed_size
;
315 async_extent
->pages
= pages
;
316 async_extent
->nr_pages
= nr_pages
;
317 async_extent
->compress_type
= compress_type
;
318 list_add_tail(&async_extent
->list
, &cow
->extents
);
/*
 * NOTE(review): this body is internally incomplete in the extraction
 * (many braces, gotos and error paths are on dropped lines). Visible
 * flow: optionally kick off defrag for small in-EOF writes, cap the
 * work at 128k, compress the range when the inode/mount flags allow it,
 * try to create an (inline) extent inside a joined transaction, decide
 * whether compression actually won (total_compressed vs total_in),
 * free the temp pages on failure and mark the inode NOCOMPRESS, queue
 * the resulting extent (compressed or not) via add_async_extent, and
 * fall back to the uncompressed redirty path when nothing was gained.
 */
323 * we create compressed extents in two phases. The first
324 * phase compresses a range of pages that have already been
325 * locked (both pages and state bits are locked).
327 * This is done inside an ordered work queue, and the compression
328 * is spread across many cpus. The actual IO submission is step
329 * two, and the ordered work queue takes care of making sure that
330 * happens in the same order things were put onto the queue by
331 * writepages and friends.
333 * If this code finds it can't get good compression, it puts an
334 * entry onto the work queue to write the uncompressed bytes. This
335 * makes sure that both compressed inodes and uncompressed inodes
336 * are written in the same order that the flusher thread sent them
339 static noinline
int compress_file_range(struct inode
*inode
,
340 struct page
*locked_page
,
342 struct async_cow
*async_cow
,
345 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
346 struct btrfs_trans_handle
*trans
;
348 u64 blocksize
= root
->sectorsize
;
350 u64 isize
= i_size_read(inode
);
352 struct page
**pages
= NULL
;
353 unsigned long nr_pages
;
354 unsigned long nr_pages_ret
= 0;
355 unsigned long total_compressed
= 0;
356 unsigned long total_in
= 0;
357 unsigned long max_compressed
= 128 * 1024;
358 unsigned long max_uncompressed
= 128 * 1024;
361 int compress_type
= root
->fs_info
->compress_type
;
364 /* if this is a small write inside eof, kick off a defrag */
365 if ((end
- start
+ 1) < 16 * 1024 &&
366 (start
> 0 || end
+ 1 < BTRFS_I(inode
)->disk_i_size
))
367 btrfs_add_inode_defrag(NULL
, inode
);
369 actual_end
= min_t(u64
, isize
, end
+ 1);
372 nr_pages
= (end
>> PAGE_CACHE_SHIFT
) - (start
>> PAGE_CACHE_SHIFT
) + 1;
373 nr_pages
= min(nr_pages
, (128 * 1024UL) / PAGE_CACHE_SIZE
);
376 * we don't want to send crud past the end of i_size through
377 * compression, that's just a waste of CPU time. So, if the
378 * end of the file is before the start of our current
379 * requested range of bytes, we bail out to the uncompressed
380 * cleanup code that can deal with all of this.
382 * It isn't really the fastest way to fix things, but this is a
383 * very uncommon corner.
385 if (actual_end
<= start
)
386 goto cleanup_and_bail_uncompressed
;
388 total_compressed
= actual_end
- start
;
390 /* we want to make sure that amount of ram required to uncompress
391 * an extent is reasonable, so we limit the total size in ram
392 * of a compressed extent to 128k. This is a crucial number
393 * because it also controls how easily we can spread reads across
394 * cpus for decompression.
396 * We also want to make sure the amount of IO required to do
397 * a random read is reasonably small, so we limit the size of
398 * a compressed extent to 128k.
400 total_compressed
= min(total_compressed
, max_uncompressed
);
401 num_bytes
= ALIGN(end
- start
+ 1, blocksize
);
402 num_bytes
= max(blocksize
, num_bytes
);
407 * we do compression for mount -o compress and when the
408 * inode has not been flagged as nocompress. This flag can
409 * change at any time if we discover bad compression ratios.
411 if (!(BTRFS_I(inode
)->flags
& BTRFS_INODE_NOCOMPRESS
) &&
412 (btrfs_test_opt(root
, COMPRESS
) ||
413 (BTRFS_I(inode
)->force_compress
) ||
414 (BTRFS_I(inode
)->flags
& BTRFS_INODE_COMPRESS
))) {
416 pages
= kzalloc(sizeof(struct page
*) * nr_pages
, GFP_NOFS
);
418 /* just bail out to the uncompressed code */
422 if (BTRFS_I(inode
)->force_compress
)
423 compress_type
= BTRFS_I(inode
)->force_compress
;
426 * we need to call clear_page_dirty_for_io on each
427 * page in the range. Otherwise applications with the file
428 * mmap'd can wander in and change the page contents while
429 * we are compressing them.
431 * If the compression fails for any reason, we set the pages
432 * dirty again later on.
434 extent_range_clear_dirty_for_io(inode
, start
, end
);
436 ret
= btrfs_compress_pages(compress_type
,
437 inode
->i_mapping
, start
,
438 total_compressed
, pages
,
439 nr_pages
, &nr_pages_ret
,
445 unsigned long offset
= total_compressed
&
446 (PAGE_CACHE_SIZE
- 1);
447 struct page
*page
= pages
[nr_pages_ret
- 1];
450 /* zero the tail end of the last page, we might be
451 * sending it down to disk
454 kaddr
= kmap_atomic(page
);
455 memset(kaddr
+ offset
, 0,
456 PAGE_CACHE_SIZE
- offset
);
457 kunmap_atomic(kaddr
);
464 trans
= btrfs_join_transaction(root
);
466 ret
= PTR_ERR(trans
);
468 goto cleanup_and_out
;
470 trans
->block_rsv
= &root
->fs_info
->delalloc_block_rsv
;
472 /* lets try to make an inline extent */
473 if (ret
|| total_in
< (actual_end
- start
)) {
474 /* we didn't compress the entire range, try
475 * to make an uncompressed inline extent.
477 ret
= cow_file_range_inline(trans
, root
, inode
,
478 start
, end
, 0, 0, NULL
);
480 /* try making a compressed inline extent */
481 ret
= cow_file_range_inline(trans
, root
, inode
,
484 compress_type
, pages
);
487 unsigned long clear_flags
= EXTENT_DELALLOC
|
489 clear_flags
|= (ret
< 0) ? EXTENT_DO_ACCOUNTING
: 0;
492 * inline extent creation worked or returned error,
493 * we don't need to create any more async work items.
494 * Unlock and free up our temp pages.
496 extent_clear_unlock_delalloc(inode
, start
, end
, NULL
,
497 clear_flags
, PAGE_UNLOCK
|
501 btrfs_end_transaction(trans
, root
);
504 btrfs_end_transaction(trans
, root
);
509 * we aren't doing an inline extent round the compressed size
510 * up to a block size boundary so the allocator does sane
513 total_compressed
= ALIGN(total_compressed
, blocksize
);
516 * one last check to make sure the compression is really a
517 * win, compare the page count read with the blocks on disk
519 total_in
= ALIGN(total_in
, PAGE_CACHE_SIZE
);
520 if (total_compressed
>= total_in
) {
523 num_bytes
= total_in
;
526 if (!will_compress
&& pages
) {
528 * the compression code ran but failed to make things smaller,
529 * free any pages it allocated and our page pointer array
531 for (i
= 0; i
< nr_pages_ret
; i
++) {
532 WARN_ON(pages
[i
]->mapping
);
533 page_cache_release(pages
[i
]);
537 total_compressed
= 0;
540 /* flag the file so we don't compress in the future */
541 if (!btrfs_test_opt(root
, FORCE_COMPRESS
) &&
542 !(BTRFS_I(inode
)->force_compress
)) {
543 BTRFS_I(inode
)->flags
|= BTRFS_INODE_NOCOMPRESS
;
549 /* the async work queues will take care of doing actual
550 * allocation on disk for these compressed pages,
551 * and will submit them to the elevator.
553 add_async_extent(async_cow
, start
, num_bytes
,
554 total_compressed
, pages
, nr_pages_ret
,
557 if (start
+ num_bytes
< end
) {
564 cleanup_and_bail_uncompressed
:
566 * No compression, but we still need to write the pages in
567 * the file we've been given so far. redirty the locked
568 * page if it corresponds to our extent and set things up
569 * for the async work queue to run cow_file_range to do
570 * the normal delalloc dance
572 if (page_offset(locked_page
) >= start
&&
573 page_offset(locked_page
) <= end
) {
574 __set_page_dirty_nobuffers(locked_page
);
575 /* unlocked later on in the async handlers */
578 extent_range_redirty_for_io(inode
, start
, end
);
579 add_async_extent(async_cow
, start
, end
- start
+ 1,
580 0, NULL
, 0, BTRFS_COMPRESS_NONE
);
588 for (i
= 0; i
< nr_pages_ret
; i
++) {
589 WARN_ON(pages
[i
]->mapping
);
590 page_cache_release(pages
[i
]);
597 extent_clear_unlock_delalloc(inode
, start
, end
, NULL
,
598 EXTENT_DELALLOC
| EXTENT_DO_ACCOUNTING
|
599 EXTENT_DEFRAG
, PAGE_UNLOCK
|
600 PAGE_CLEAR_DIRTY
| PAGE_SET_WRITEBACK
|
602 if (!trans
|| IS_ERR(trans
))
603 btrfs_error(root
->fs_info
, ret
, "Failed to join transaction");
605 btrfs_abort_transaction(trans
, root
, ret
);
/*
 * NOTE(review): internally incomplete in the extraction (retry/labels
 * and several error branches are on dropped lines). Visible flow: pop
 * each queued async_extent; if compression fell back (no pages), lock
 * the range and run plain cow_file_range, writing the locked range when
 * an inline extent was not created. Otherwise reserve a compressed-size
 * extent in a joined transaction, build and insert a PINNED+COMPRESSED
 * extent_map, add a BTRFS_ORDERED_COMPRESSED ordered extent, clear
 * dirty/set writeback on the pages and submit the compressed write.
 * The out_free_reserve path returns the reservation and unwinds the
 * delalloc/page state.
 */
610 * phase two of compressed writeback. This is the ordered portion
611 * of the code, which only gets called in the order the work was
612 * queued. We walk all the async extents created by compress_file_range
613 * and send them down to the disk.
615 static noinline
int submit_compressed_extents(struct inode
*inode
,
616 struct async_cow
*async_cow
)
618 struct async_extent
*async_extent
;
620 struct btrfs_trans_handle
*trans
;
621 struct btrfs_key ins
;
622 struct extent_map
*em
;
623 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
624 struct extent_map_tree
*em_tree
= &BTRFS_I(inode
)->extent_tree
;
625 struct extent_io_tree
*io_tree
;
628 if (list_empty(&async_cow
->extents
))
632 while (!list_empty(&async_cow
->extents
)) {
633 async_extent
= list_entry(async_cow
->extents
.next
,
634 struct async_extent
, list
);
635 list_del(&async_extent
->list
);
637 io_tree
= &BTRFS_I(inode
)->io_tree
;
640 /* did the compression code fall back to uncompressed IO? */
641 if (!async_extent
->pages
) {
642 int page_started
= 0;
643 unsigned long nr_written
= 0;
645 lock_extent(io_tree
, async_extent
->start
,
646 async_extent
->start
+
647 async_extent
->ram_size
- 1);
649 /* allocate blocks */
650 ret
= cow_file_range(inode
, async_cow
->locked_page
,
652 async_extent
->start
+
653 async_extent
->ram_size
- 1,
654 &page_started
, &nr_written
, 0);
659 * if page_started, cow_file_range inserted an
660 * inline extent and took care of all the unlocking
661 * and IO for us. Otherwise, we need to submit
662 * all those pages down to the drive.
664 if (!page_started
&& !ret
)
665 extent_write_locked_range(io_tree
,
666 inode
, async_extent
->start
,
667 async_extent
->start
+
668 async_extent
->ram_size
- 1,
672 unlock_page(async_cow
->locked_page
);
678 lock_extent(io_tree
, async_extent
->start
,
679 async_extent
->start
+ async_extent
->ram_size
- 1);
681 trans
= btrfs_join_transaction(root
);
683 ret
= PTR_ERR(trans
);
685 trans
->block_rsv
= &root
->fs_info
->delalloc_block_rsv
;
686 ret
= btrfs_reserve_extent(trans
, root
,
687 async_extent
->compressed_size
,
688 async_extent
->compressed_size
,
689 0, alloc_hint
, &ins
, 1);
690 if (ret
&& ret
!= -ENOSPC
)
691 btrfs_abort_transaction(trans
, root
, ret
);
692 btrfs_end_transaction(trans
, root
);
698 for (i
= 0; i
< async_extent
->nr_pages
; i
++) {
699 WARN_ON(async_extent
->pages
[i
]->mapping
);
700 page_cache_release(async_extent
->pages
[i
]);
702 kfree(async_extent
->pages
);
703 async_extent
->nr_pages
= 0;
704 async_extent
->pages
= NULL
;
706 if (ret
== -ENOSPC
) {
707 unlock_extent(io_tree
, async_extent
->start
,
708 async_extent
->start
+
709 async_extent
->ram_size
- 1);
716 * here we're doing allocation and writeback of the
719 btrfs_drop_extent_cache(inode
, async_extent
->start
,
720 async_extent
->start
+
721 async_extent
->ram_size
- 1, 0);
723 em
= alloc_extent_map();
726 goto out_free_reserve
;
728 em
->start
= async_extent
->start
;
729 em
->len
= async_extent
->ram_size
;
730 em
->orig_start
= em
->start
;
731 em
->mod_start
= em
->start
;
732 em
->mod_len
= em
->len
;
734 em
->block_start
= ins
.objectid
;
735 em
->block_len
= ins
.offset
;
736 em
->orig_block_len
= ins
.offset
;
737 em
->ram_bytes
= async_extent
->ram_size
;
738 em
->bdev
= root
->fs_info
->fs_devices
->latest_bdev
;
739 em
->compress_type
= async_extent
->compress_type
;
740 set_bit(EXTENT_FLAG_PINNED
, &em
->flags
);
741 set_bit(EXTENT_FLAG_COMPRESSED
, &em
->flags
);
745 write_lock(&em_tree
->lock
);
746 ret
= add_extent_mapping(em_tree
, em
, 1);
747 write_unlock(&em_tree
->lock
);
748 if (ret
!= -EEXIST
) {
752 btrfs_drop_extent_cache(inode
, async_extent
->start
,
753 async_extent
->start
+
754 async_extent
->ram_size
- 1, 0);
758 goto out_free_reserve
;
760 ret
= btrfs_add_ordered_extent_compress(inode
,
763 async_extent
->ram_size
,
765 BTRFS_ORDERED_COMPRESSED
,
766 async_extent
->compress_type
);
768 goto out_free_reserve
;
771 * clear dirty, set writeback and unlock the pages.
773 extent_clear_unlock_delalloc(inode
, async_extent
->start
,
774 async_extent
->start
+
775 async_extent
->ram_size
- 1,
776 NULL
, EXTENT_LOCKED
| EXTENT_DELALLOC
,
777 PAGE_UNLOCK
| PAGE_CLEAR_DIRTY
|
779 ret
= btrfs_submit_compressed_write(inode
,
781 async_extent
->ram_size
,
783 ins
.offset
, async_extent
->pages
,
784 async_extent
->nr_pages
);
785 alloc_hint
= ins
.objectid
+ ins
.offset
;
795 btrfs_free_reserved_extent(root
, ins
.objectid
, ins
.offset
);
797 extent_clear_unlock_delalloc(inode
, async_extent
->start
,
798 async_extent
->start
+
799 async_extent
->ram_size
- 1,
800 NULL
, EXTENT_LOCKED
| EXTENT_DELALLOC
|
801 EXTENT_DEFRAG
| EXTENT_DO_ACCOUNTING
,
802 PAGE_UNLOCK
| PAGE_CLEAR_DIRTY
|
803 PAGE_SET_WRITEBACK
| PAGE_END_WRITEBACK
);
/*
 * Pick a disk block to use as the allocator hint for a new extent at
 * @start: under the extent-map read lock, look up the mapping covering
 * [start, start+num_bytes); if its block_start is not a real block
 * number (>= EXTENT_MAP_LAST_BYTE), retry with the inode's first
 * mapping, and fall back to no hint when that is bogus too.
 * NOTE(review): the em == NULL branch and free_extent_map calls are on
 * lines the extraction dropped.
 */
808 static u64
get_extent_allocation_hint(struct inode
*inode
, u64 start
,
811 struct extent_map_tree
*em_tree
= &BTRFS_I(inode
)->extent_tree
;
812 struct extent_map
*em
;
815 read_lock(&em_tree
->lock
);
816 em
= search_extent_mapping(em_tree
, start
, num_bytes
);
819 * if block start isn't an actual block number then find the
820 * first block in this inode and use that as a hint. If that
821 * block is also bogus then just don't worry about it.
823 if (em
->block_start
>= EXTENT_MAP_LAST_BYTE
) {
825 em
= search_extent_mapping(em_tree
, 0, 0);
826 if (em
&& em
->block_start
< EXTENT_MAP_LAST_BYTE
)
827 alloc_hint
= em
->block_start
;
831 alloc_hint
= em
->block_start
;
835 read_unlock(&em_tree
->lock
);
/*
 * NOTE(review): internally incomplete in the extraction (loop braces,
 * goto labels and some error checks are on dropped lines). Visible
 * flow: never called for free-space inodes (BUG_ON); try an inline
 * extent first and, on success, unlock the whole range and account the
 * written pages. Otherwise loop reserving real extents, inserting a
 * PINNED extent_map and an ordered extent per chunk, cloning csums for
 * the relocation tree, and unlocking each chunk (keeping locked_page
 * locked, setting Private2) until disk_num_bytes is consumed. The
 * failure path frees the reservation and unwinds the range state.
 */
841 * when extent_io.c finds a delayed allocation range in the file,
842 * the call backs end up in this code. The basic idea is to
843 * allocate extents on disk for the range, and create ordered data structs
844 * in ram to track those extents.
846 * locked_page is the page that writepage had locked already. We use
847 * it to make sure we don't do extra locks or unlocks.
849 * *page_started is set to one if we unlock locked_page and do everything
850 * required to start IO on it. It may be clean and already done with
853 static noinline
int __cow_file_range(struct btrfs_trans_handle
*trans
,
855 struct btrfs_root
*root
,
856 struct page
*locked_page
,
857 u64 start
, u64 end
, int *page_started
,
858 unsigned long *nr_written
,
863 unsigned long ram_size
;
866 u64 blocksize
= root
->sectorsize
;
867 struct btrfs_key ins
;
868 struct extent_map
*em
;
869 struct extent_map_tree
*em_tree
= &BTRFS_I(inode
)->extent_tree
;
872 BUG_ON(btrfs_is_free_space_inode(inode
));
874 num_bytes
= ALIGN(end
- start
+ 1, blocksize
);
875 num_bytes
= max(blocksize
, num_bytes
);
876 disk_num_bytes
= num_bytes
;
878 /* if this is a small write inside eof, kick off defrag */
879 if (num_bytes
< 64 * 1024 &&
880 (start
> 0 || end
+ 1 < BTRFS_I(inode
)->disk_i_size
))
881 btrfs_add_inode_defrag(trans
, inode
);
884 /* lets try to make an inline extent */
885 ret
= cow_file_range_inline(trans
, root
, inode
,
886 start
, end
, 0, 0, NULL
);
888 extent_clear_unlock_delalloc(inode
, start
, end
, NULL
,
889 EXTENT_LOCKED
| EXTENT_DELALLOC
|
890 EXTENT_DEFRAG
, PAGE_UNLOCK
|
891 PAGE_CLEAR_DIRTY
| PAGE_SET_WRITEBACK
|
894 *nr_written
= *nr_written
+
895 (end
- start
+ PAGE_CACHE_SIZE
) / PAGE_CACHE_SIZE
;
898 } else if (ret
< 0) {
899 btrfs_abort_transaction(trans
, root
, ret
);
904 BUG_ON(disk_num_bytes
>
905 btrfs_super_total_bytes(root
->fs_info
->super_copy
));
907 alloc_hint
= get_extent_allocation_hint(inode
, start
, num_bytes
);
908 btrfs_drop_extent_cache(inode
, start
, start
+ num_bytes
- 1, 0);
910 while (disk_num_bytes
> 0) {
913 cur_alloc_size
= disk_num_bytes
;
914 ret
= btrfs_reserve_extent(trans
, root
, cur_alloc_size
,
915 root
->sectorsize
, 0, alloc_hint
,
918 btrfs_abort_transaction(trans
, root
, ret
);
922 em
= alloc_extent_map();
928 em
->orig_start
= em
->start
;
929 ram_size
= ins
.offset
;
930 em
->len
= ins
.offset
;
931 em
->mod_start
= em
->start
;
932 em
->mod_len
= em
->len
;
934 em
->block_start
= ins
.objectid
;
935 em
->block_len
= ins
.offset
;
936 em
->orig_block_len
= ins
.offset
;
937 em
->ram_bytes
= ram_size
;
938 em
->bdev
= root
->fs_info
->fs_devices
->latest_bdev
;
939 set_bit(EXTENT_FLAG_PINNED
, &em
->flags
);
943 write_lock(&em_tree
->lock
);
944 ret
= add_extent_mapping(em_tree
, em
, 1);
945 write_unlock(&em_tree
->lock
);
946 if (ret
!= -EEXIST
) {
950 btrfs_drop_extent_cache(inode
, start
,
951 start
+ ram_size
- 1, 0);
956 cur_alloc_size
= ins
.offset
;
957 ret
= btrfs_add_ordered_extent(inode
, start
, ins
.objectid
,
958 ram_size
, cur_alloc_size
, 0);
962 if (root
->root_key
.objectid
==
963 BTRFS_DATA_RELOC_TREE_OBJECTID
) {
964 ret
= btrfs_reloc_clone_csums(inode
, start
,
967 btrfs_abort_transaction(trans
, root
, ret
);
972 if (disk_num_bytes
< cur_alloc_size
)
975 /* we're not doing compressed IO, don't unlock the first
976 * page (which the caller expects to stay locked), don't
977 * clear any dirty bits and don't set any writeback bits
979 * Do set the Private2 bit so we know this page was properly
980 * setup for writepage
982 op
= unlock
? PAGE_UNLOCK
: 0;
983 op
|= PAGE_SET_PRIVATE2
;
985 extent_clear_unlock_delalloc(inode
, start
,
986 start
+ ram_size
- 1, locked_page
,
987 EXTENT_LOCKED
| EXTENT_DELALLOC
,
989 disk_num_bytes
-= cur_alloc_size
;
990 num_bytes
-= cur_alloc_size
;
991 alloc_hint
= ins
.objectid
+ ins
.offset
;
992 start
+= cur_alloc_size
;
998 btrfs_free_reserved_extent(root
, ins
.objectid
, ins
.offset
);
1000 extent_clear_unlock_delalloc(inode
, start
, end
, locked_page
,
1001 EXTENT_LOCKED
| EXTENT_DO_ACCOUNTING
|
1002 EXTENT_DELALLOC
| EXTENT_DEFRAG
,
1003 PAGE_UNLOCK
| PAGE_CLEAR_DIRTY
|
1004 PAGE_SET_WRITEBACK
| PAGE_END_WRITEBACK
);
/*
 * Transaction-owning wrapper around __cow_file_range: join a
 * transaction (unwinding all page/extent state if that fails), point
 * its block reservation at the delalloc rsv, run the real COW work,
 * then end the transaction.
 */
1008 static noinline
int cow_file_range(struct inode
*inode
,
1009 struct page
*locked_page
,
1010 u64 start
, u64 end
, int *page_started
,
1011 unsigned long *nr_written
,
1014 struct btrfs_trans_handle
*trans
;
1015 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
1018 trans
= btrfs_join_transaction(root
);
1019 if (IS_ERR(trans
)) {
1020 extent_clear_unlock_delalloc(inode
, start
, end
, locked_page
,
1021 EXTENT_LOCKED
| EXTENT_DELALLOC
|
1022 EXTENT_DO_ACCOUNTING
|
1023 EXTENT_DEFRAG
, PAGE_UNLOCK
|
1025 PAGE_SET_WRITEBACK
|
1026 PAGE_END_WRITEBACK
);
1027 return PTR_ERR(trans
);
1029 trans
->block_rsv
= &root
->fs_info
->delalloc_block_rsv
;
1031 ret
= __cow_file_range(trans
, inode
, root
, locked_page
, start
, end
,
1032 page_started
, nr_written
, unlock
);
1034 btrfs_end_transaction(trans
, root
);
/*
 * Phase-one work-queue callback: run compress_file_range on this
 * async_cow's range. When nothing was queued (num_added == 0), drop
 * the inode reference via delayed iput and clear the pointer so the
 * submit/free callbacks skip it.
 */
1040 * work queue call back to started compression on a file and pages
1042 static noinline
void async_cow_start(struct btrfs_work
*work
)
1044 struct async_cow
*async_cow
;
1046 async_cow
= container_of(work
, struct async_cow
, work
);
1048 compress_file_range(async_cow
->inode
, async_cow
->locked_page
,
1049 async_cow
->start
, async_cow
->end
, async_cow
,
1051 if (num_added
== 0) {
1052 btrfs_add_delayed_iput(async_cow
->inode
);
1053 async_cow
->inode
= NULL
;
/*
 * Phase-two (ordered) work-queue callback: subtract this work item's
 * page count from the global async_delalloc_pages counter, wake any
 * throttled writers waiting on async_submit_wait, and submit the
 * compressed extents if the inode reference is still held.
 */
1058 * work queue call back to submit previously compressed pages
1060 static noinline
void async_cow_submit(struct btrfs_work
*work
)
1062 struct async_cow
*async_cow
;
1063 struct btrfs_root
*root
;
1064 unsigned long nr_pages
;
1066 async_cow
= container_of(work
, struct async_cow
, work
);
1068 root
= async_cow
->root
;
1069 nr_pages
= (async_cow
->end
- async_cow
->start
+ PAGE_CACHE_SIZE
) >>
1072 if (atomic_sub_return(nr_pages
, &root
->fs_info
->async_delalloc_pages
) <
1074 waitqueue_active(&root
->fs_info
->async_submit_wait
))
1075 wake_up(&root
->fs_info
->async_submit_wait
);
1077 if (async_cow
->inode
)
1078 submit_compressed_extents(async_cow
->inode
, async_cow
);
/*
 * Final work-queue callback: release the inode reference (if the start
 * callback did not already) via delayed iput. The kfree of the
 * async_cow itself is on a line the extraction dropped.
 */
1081 static noinline
void async_cow_free(struct btrfs_work
*work
)
1083 struct async_cow
*async_cow
;
1084 async_cow
= container_of(work
, struct async_cow
, work
);
1085 if (async_cow
->inode
)
1086 btrfs_add_delayed_iput(async_cow
->inode
);
/*
 * Split [start, end] into chunks (up to 512k each for compressible
 * inodes, per the visible min() at original line 1113) and queue one
 * async_cow work item per chunk on the delalloc worker pool, wiring
 * start/submit/free callbacks. Throttles when the global
 * async_delalloc_pages count exceeds ~10M pages (limit), and also
 * waits while async_submit_draining is active. ENOMEM on the work
 * item is fatal (BUG_ON), matching the file's policy.
 */
1090 static int cow_file_range_async(struct inode
*inode
, struct page
*locked_page
,
1091 u64 start
, u64 end
, int *page_started
,
1092 unsigned long *nr_written
)
1094 struct async_cow
*async_cow
;
1095 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
1096 unsigned long nr_pages
;
1098 int limit
= 10 * 1024 * 1024;
1100 clear_extent_bit(&BTRFS_I(inode
)->io_tree
, start
, end
, EXTENT_LOCKED
,
1101 1, 0, NULL
, GFP_NOFS
);
1102 while (start
< end
) {
1103 async_cow
= kmalloc(sizeof(*async_cow
), GFP_NOFS
);
1104 BUG_ON(!async_cow
); /* -ENOMEM */
1105 async_cow
->inode
= igrab(inode
);
1106 async_cow
->root
= root
;
1107 async_cow
->locked_page
= locked_page
;
1108 async_cow
->start
= start
;
1110 if (BTRFS_I(inode
)->flags
& BTRFS_INODE_NOCOMPRESS
)
1113 cur_end
= min(end
, start
+ 512 * 1024 - 1);
1115 async_cow
->end
= cur_end
;
1116 INIT_LIST_HEAD(&async_cow
->extents
);
1118 async_cow
->work
.func
= async_cow_start
;
1119 async_cow
->work
.ordered_func
= async_cow_submit
;
1120 async_cow
->work
.ordered_free
= async_cow_free
;
1121 async_cow
->work
.flags
= 0;
1123 nr_pages
= (cur_end
- start
+ PAGE_CACHE_SIZE
) >>
1125 atomic_add(nr_pages
, &root
->fs_info
->async_delalloc_pages
);
1127 btrfs_queue_worker(&root
->fs_info
->delalloc_workers
,
1130 if (atomic_read(&root
->fs_info
->async_delalloc_pages
) > limit
) {
1131 wait_event(root
->fs_info
->async_submit_wait
,
1132 (atomic_read(&root
->fs_info
->async_delalloc_pages
) <
1136 while (atomic_read(&root
->fs_info
->async_submit_draining
) &&
1137 atomic_read(&root
->fs_info
->async_delalloc_pages
)) {
1138 wait_event(root
->fs_info
->async_submit_wait
,
1139 (atomic_read(&root
->fs_info
->async_delalloc_pages
) ==
1143 *nr_written
+= nr_pages
;
1144 start
= cur_end
+ 1;
/*
 * Check whether any data checksums exist in the csum tree for the
 * byte range [bytenr, bytenr + num_bytes): look up the range, return
 * "none" when the lookup succeeds with an empty list, otherwise drain
 * and free the returned sums. Used by the nocow path to decide whether
 * an extent can be written in place.
 * NOTE(review): the LIST_HEAD declaration, list_del/kfree pairing and
 * return statements are on lines the extraction dropped.
 */
1150 static noinline
int csum_exist_in_range(struct btrfs_root
*root
,
1151 u64 bytenr
, u64 num_bytes
)
1154 struct btrfs_ordered_sum
*sums
;
1157 ret
= btrfs_lookup_csums_range(root
->fs_info
->csum_root
, bytenr
,
1158 bytenr
+ num_bytes
- 1, &list
, 0);
1159 if (ret
== 0 && list_empty(&list
))
1162 while (!list_empty(&list
)) {
1163 sums
= list_entry(list
.next
, struct btrfs_ordered_sum
, list
);
1164 list_del(&sums
->list
);
1171 * when nowcow writeback call back. This checks for snapshots or COW copies
1172 * of the extents that exist in the file, and COWs the file as required.
1174 * If no cow copies or snapshots exist, we write directly to the existing
1177 static noinline
int run_delalloc_nocow(struct inode
*inode
,
1178 struct page
*locked_page
,
1179 u64 start
, u64 end
, int *page_started
, int force
,
1180 unsigned long *nr_written
)
1182 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
1183 struct btrfs_trans_handle
*trans
;
1184 struct extent_buffer
*leaf
;
1185 struct btrfs_path
*path
;
1186 struct btrfs_file_extent_item
*fi
;
1187 struct btrfs_key found_key
;
1202 u64 ino
= btrfs_ino(inode
);
1204 path
= btrfs_alloc_path();
1206 extent_clear_unlock_delalloc(inode
, start
, end
, locked_page
,
1207 EXTENT_LOCKED
| EXTENT_DELALLOC
|
1208 EXTENT_DO_ACCOUNTING
|
1209 EXTENT_DEFRAG
, PAGE_UNLOCK
|
1211 PAGE_SET_WRITEBACK
|
1212 PAGE_END_WRITEBACK
);
1216 nolock
= btrfs_is_free_space_inode(inode
);
1219 trans
= btrfs_join_transaction_nolock(root
);
1221 trans
= btrfs_join_transaction(root
);
1223 if (IS_ERR(trans
)) {
1224 extent_clear_unlock_delalloc(inode
, start
, end
, locked_page
,
1225 EXTENT_LOCKED
| EXTENT_DELALLOC
|
1226 EXTENT_DO_ACCOUNTING
|
1227 EXTENT_DEFRAG
, PAGE_UNLOCK
|
1229 PAGE_SET_WRITEBACK
|
1230 PAGE_END_WRITEBACK
);
1231 btrfs_free_path(path
);
1232 return PTR_ERR(trans
);
1235 trans
->block_rsv
= &root
->fs_info
->delalloc_block_rsv
;
1237 cow_start
= (u64
)-1;
1240 ret
= btrfs_lookup_file_extent(trans
, root
, path
, ino
,
1243 btrfs_abort_transaction(trans
, root
, ret
);
1246 if (ret
> 0 && path
->slots
[0] > 0 && check_prev
) {
1247 leaf
= path
->nodes
[0];
1248 btrfs_item_key_to_cpu(leaf
, &found_key
,
1249 path
->slots
[0] - 1);
1250 if (found_key
.objectid
== ino
&&
1251 found_key
.type
== BTRFS_EXTENT_DATA_KEY
)
1256 leaf
= path
->nodes
[0];
1257 if (path
->slots
[0] >= btrfs_header_nritems(leaf
)) {
1258 ret
= btrfs_next_leaf(root
, path
);
1260 btrfs_abort_transaction(trans
, root
, ret
);
1265 leaf
= path
->nodes
[0];
1271 btrfs_item_key_to_cpu(leaf
, &found_key
, path
->slots
[0]);
1273 if (found_key
.objectid
> ino
||
1274 found_key
.type
> BTRFS_EXTENT_DATA_KEY
||
1275 found_key
.offset
> end
)
1278 if (found_key
.offset
> cur_offset
) {
1279 extent_end
= found_key
.offset
;
1284 fi
= btrfs_item_ptr(leaf
, path
->slots
[0],
1285 struct btrfs_file_extent_item
);
1286 extent_type
= btrfs_file_extent_type(leaf
, fi
);
1288 ram_bytes
= btrfs_file_extent_ram_bytes(leaf
, fi
);
1289 if (extent_type
== BTRFS_FILE_EXTENT_REG
||
1290 extent_type
== BTRFS_FILE_EXTENT_PREALLOC
) {
1291 disk_bytenr
= btrfs_file_extent_disk_bytenr(leaf
, fi
);
1292 extent_offset
= btrfs_file_extent_offset(leaf
, fi
);
1293 extent_end
= found_key
.offset
+
1294 btrfs_file_extent_num_bytes(leaf
, fi
);
1296 btrfs_file_extent_disk_num_bytes(leaf
, fi
);
1297 if (extent_end
<= start
) {
1301 if (disk_bytenr
== 0)
1303 if (btrfs_file_extent_compression(leaf
, fi
) ||
1304 btrfs_file_extent_encryption(leaf
, fi
) ||
1305 btrfs_file_extent_other_encoding(leaf
, fi
))
1307 if (extent_type
== BTRFS_FILE_EXTENT_REG
&& !force
)
1309 if (btrfs_extent_readonly(root
, disk_bytenr
))
1311 if (btrfs_cross_ref_exist(trans
, root
, ino
,
1313 extent_offset
, disk_bytenr
))
1315 disk_bytenr
+= extent_offset
;
1316 disk_bytenr
+= cur_offset
- found_key
.offset
;
1317 num_bytes
= min(end
+ 1, extent_end
) - cur_offset
;
1319 * force cow if csum exists in the range.
1320 * this ensure that csum for a given extent are
1321 * either valid or do not exist.
1323 if (csum_exist_in_range(root
, disk_bytenr
, num_bytes
))
1326 } else if (extent_type
== BTRFS_FILE_EXTENT_INLINE
) {
1327 extent_end
= found_key
.offset
+
1328 btrfs_file_extent_inline_len(leaf
, fi
);
1329 extent_end
= ALIGN(extent_end
, root
->sectorsize
);
1334 if (extent_end
<= start
) {
1339 if (cow_start
== (u64
)-1)
1340 cow_start
= cur_offset
;
1341 cur_offset
= extent_end
;
1342 if (cur_offset
> end
)
1348 btrfs_release_path(path
);
1349 if (cow_start
!= (u64
)-1) {
1350 ret
= __cow_file_range(trans
, inode
, root
, locked_page
,
1351 cow_start
, found_key
.offset
- 1,
1352 page_started
, nr_written
, 1);
1354 btrfs_abort_transaction(trans
, root
, ret
);
1357 cow_start
= (u64
)-1;
1360 if (extent_type
== BTRFS_FILE_EXTENT_PREALLOC
) {
1361 struct extent_map
*em
;
1362 struct extent_map_tree
*em_tree
;
1363 em_tree
= &BTRFS_I(inode
)->extent_tree
;
1364 em
= alloc_extent_map();
1365 BUG_ON(!em
); /* -ENOMEM */
1366 em
->start
= cur_offset
;
1367 em
->orig_start
= found_key
.offset
- extent_offset
;
1368 em
->len
= num_bytes
;
1369 em
->block_len
= num_bytes
;
1370 em
->block_start
= disk_bytenr
;
1371 em
->orig_block_len
= disk_num_bytes
;
1372 em
->ram_bytes
= ram_bytes
;
1373 em
->bdev
= root
->fs_info
->fs_devices
->latest_bdev
;
1374 em
->mod_start
= em
->start
;
1375 em
->mod_len
= em
->len
;
1376 set_bit(EXTENT_FLAG_PINNED
, &em
->flags
);
1377 set_bit(EXTENT_FLAG_FILLING
, &em
->flags
);
1378 em
->generation
= -1;
1380 write_lock(&em_tree
->lock
);
1381 ret
= add_extent_mapping(em_tree
, em
, 1);
1382 write_unlock(&em_tree
->lock
);
1383 if (ret
!= -EEXIST
) {
1384 free_extent_map(em
);
1387 btrfs_drop_extent_cache(inode
, em
->start
,
1388 em
->start
+ em
->len
- 1, 0);
1390 type
= BTRFS_ORDERED_PREALLOC
;
1392 type
= BTRFS_ORDERED_NOCOW
;
1395 ret
= btrfs_add_ordered_extent(inode
, cur_offset
, disk_bytenr
,
1396 num_bytes
, num_bytes
, type
);
1397 BUG_ON(ret
); /* -ENOMEM */
1399 if (root
->root_key
.objectid
==
1400 BTRFS_DATA_RELOC_TREE_OBJECTID
) {
1401 ret
= btrfs_reloc_clone_csums(inode
, cur_offset
,
1404 btrfs_abort_transaction(trans
, root
, ret
);
1409 extent_clear_unlock_delalloc(inode
, cur_offset
,
1410 cur_offset
+ num_bytes
- 1,
1411 locked_page
, EXTENT_LOCKED
|
1412 EXTENT_DELALLOC
, PAGE_UNLOCK
|
1414 cur_offset
= extent_end
;
1415 if (cur_offset
> end
)
1418 btrfs_release_path(path
);
1420 if (cur_offset
<= end
&& cow_start
== (u64
)-1) {
1421 cow_start
= cur_offset
;
1425 if (cow_start
!= (u64
)-1) {
1426 ret
= __cow_file_range(trans
, inode
, root
, locked_page
,
1428 page_started
, nr_written
, 1);
1430 btrfs_abort_transaction(trans
, root
, ret
);
1436 err
= btrfs_end_transaction(trans
, root
);
1440 if (ret
&& cur_offset
< end
)
1441 extent_clear_unlock_delalloc(inode
, cur_offset
, end
,
1442 locked_page
, EXTENT_LOCKED
|
1443 EXTENT_DELALLOC
| EXTENT_DEFRAG
|
1444 EXTENT_DO_ACCOUNTING
, PAGE_UNLOCK
|
1446 PAGE_SET_WRITEBACK
|
1447 PAGE_END_WRITEBACK
);
1448 btrfs_free_path(path
);
1453 * extent_io.c call back to do delayed allocation processing
1455 static int run_delalloc_range(struct inode
*inode
, struct page
*locked_page
,
1456 u64 start
, u64 end
, int *page_started
,
1457 unsigned long *nr_written
)
1460 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
1462 if (BTRFS_I(inode
)->flags
& BTRFS_INODE_NODATACOW
) {
1463 ret
= run_delalloc_nocow(inode
, locked_page
, start
, end
,
1464 page_started
, 1, nr_written
);
1465 } else if (BTRFS_I(inode
)->flags
& BTRFS_INODE_PREALLOC
) {
1466 ret
= run_delalloc_nocow(inode
, locked_page
, start
, end
,
1467 page_started
, 0, nr_written
);
1468 } else if (!btrfs_test_opt(root
, COMPRESS
) &&
1469 !(BTRFS_I(inode
)->force_compress
) &&
1470 !(BTRFS_I(inode
)->flags
& BTRFS_INODE_COMPRESS
)) {
1471 ret
= cow_file_range(inode
, locked_page
, start
, end
,
1472 page_started
, nr_written
, 1);
1474 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT
,
1475 &BTRFS_I(inode
)->runtime_flags
);
1476 ret
= cow_file_range_async(inode
, locked_page
, start
, end
,
1477 page_started
, nr_written
);
1482 static void btrfs_split_extent_hook(struct inode
*inode
,
1483 struct extent_state
*orig
, u64 split
)
1485 /* not delalloc, ignore it */
1486 if (!(orig
->state
& EXTENT_DELALLOC
))
1489 spin_lock(&BTRFS_I(inode
)->lock
);
1490 BTRFS_I(inode
)->outstanding_extents
++;
1491 spin_unlock(&BTRFS_I(inode
)->lock
);
1495 * extent_io.c merge_extent_hook, used to track merged delayed allocation
1496 * extents so we can keep track of new extents that are just merged onto old
1497 * extents, such as when we are doing sequential writes, so we can properly
1498 * account for the metadata space we'll need.
1500 static void btrfs_merge_extent_hook(struct inode
*inode
,
1501 struct extent_state
*new,
1502 struct extent_state
*other
)
1504 /* not delalloc, ignore it */
1505 if (!(other
->state
& EXTENT_DELALLOC
))
1508 spin_lock(&BTRFS_I(inode
)->lock
);
1509 BTRFS_I(inode
)->outstanding_extents
--;
1510 spin_unlock(&BTRFS_I(inode
)->lock
);
1513 static void btrfs_add_delalloc_inodes(struct btrfs_root
*root
,
1514 struct inode
*inode
)
1516 spin_lock(&root
->delalloc_lock
);
1517 if (list_empty(&BTRFS_I(inode
)->delalloc_inodes
)) {
1518 list_add_tail(&BTRFS_I(inode
)->delalloc_inodes
,
1519 &root
->delalloc_inodes
);
1520 set_bit(BTRFS_INODE_IN_DELALLOC_LIST
,
1521 &BTRFS_I(inode
)->runtime_flags
);
1522 root
->nr_delalloc_inodes
++;
1523 if (root
->nr_delalloc_inodes
== 1) {
1524 spin_lock(&root
->fs_info
->delalloc_root_lock
);
1525 BUG_ON(!list_empty(&root
->delalloc_root
));
1526 list_add_tail(&root
->delalloc_root
,
1527 &root
->fs_info
->delalloc_roots
);
1528 spin_unlock(&root
->fs_info
->delalloc_root_lock
);
1531 spin_unlock(&root
->delalloc_lock
);
1534 static void btrfs_del_delalloc_inode(struct btrfs_root
*root
,
1535 struct inode
*inode
)
1537 spin_lock(&root
->delalloc_lock
);
1538 if (!list_empty(&BTRFS_I(inode
)->delalloc_inodes
)) {
1539 list_del_init(&BTRFS_I(inode
)->delalloc_inodes
);
1540 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST
,
1541 &BTRFS_I(inode
)->runtime_flags
);
1542 root
->nr_delalloc_inodes
--;
1543 if (!root
->nr_delalloc_inodes
) {
1544 spin_lock(&root
->fs_info
->delalloc_root_lock
);
1545 BUG_ON(list_empty(&root
->delalloc_root
));
1546 list_del_init(&root
->delalloc_root
);
1547 spin_unlock(&root
->fs_info
->delalloc_root_lock
);
1550 spin_unlock(&root
->delalloc_lock
);
1554 * extent_io.c set_bit_hook, used to track delayed allocation
1555 * bytes in this file, and to maintain the list of inodes that
1556 * have pending delalloc work to be done.
1558 static void btrfs_set_bit_hook(struct inode
*inode
,
1559 struct extent_state
*state
, unsigned long *bits
)
1563 * set_bit and clear bit hooks normally require _irqsave/restore
1564 * but in this case, we are only testing for the DELALLOC
1565 * bit, which is only set or cleared with irqs on
1567 if (!(state
->state
& EXTENT_DELALLOC
) && (*bits
& EXTENT_DELALLOC
)) {
1568 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
1569 u64 len
= state
->end
+ 1 - state
->start
;
1570 bool do_list
= !btrfs_is_free_space_inode(inode
);
1572 if (*bits
& EXTENT_FIRST_DELALLOC
) {
1573 *bits
&= ~EXTENT_FIRST_DELALLOC
;
1575 spin_lock(&BTRFS_I(inode
)->lock
);
1576 BTRFS_I(inode
)->outstanding_extents
++;
1577 spin_unlock(&BTRFS_I(inode
)->lock
);
1580 __percpu_counter_add(&root
->fs_info
->delalloc_bytes
, len
,
1581 root
->fs_info
->delalloc_batch
);
1582 spin_lock(&BTRFS_I(inode
)->lock
);
1583 BTRFS_I(inode
)->delalloc_bytes
+= len
;
1584 if (do_list
&& !test_bit(BTRFS_INODE_IN_DELALLOC_LIST
,
1585 &BTRFS_I(inode
)->runtime_flags
))
1586 btrfs_add_delalloc_inodes(root
, inode
);
1587 spin_unlock(&BTRFS_I(inode
)->lock
);
1592 * extent_io.c clear_bit_hook, see set_bit_hook for why
1594 static void btrfs_clear_bit_hook(struct inode
*inode
,
1595 struct extent_state
*state
,
1596 unsigned long *bits
)
1599 * set_bit and clear bit hooks normally require _irqsave/restore
1600 * but in this case, we are only testing for the DELALLOC
1601 * bit, which is only set or cleared with irqs on
1603 if ((state
->state
& EXTENT_DELALLOC
) && (*bits
& EXTENT_DELALLOC
)) {
1604 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
1605 u64 len
= state
->end
+ 1 - state
->start
;
1606 bool do_list
= !btrfs_is_free_space_inode(inode
);
1608 if (*bits
& EXTENT_FIRST_DELALLOC
) {
1609 *bits
&= ~EXTENT_FIRST_DELALLOC
;
1610 } else if (!(*bits
& EXTENT_DO_ACCOUNTING
)) {
1611 spin_lock(&BTRFS_I(inode
)->lock
);
1612 BTRFS_I(inode
)->outstanding_extents
--;
1613 spin_unlock(&BTRFS_I(inode
)->lock
);
1616 if (*bits
& EXTENT_DO_ACCOUNTING
)
1617 btrfs_delalloc_release_metadata(inode
, len
);
1619 if (root
->root_key
.objectid
!= BTRFS_DATA_RELOC_TREE_OBJECTID
1620 && do_list
&& !(state
->state
& EXTENT_NORESERVE
))
1621 btrfs_free_reserved_data_space(inode
, len
);
1623 __percpu_counter_add(&root
->fs_info
->delalloc_bytes
, -len
,
1624 root
->fs_info
->delalloc_batch
);
1625 spin_lock(&BTRFS_I(inode
)->lock
);
1626 BTRFS_I(inode
)->delalloc_bytes
-= len
;
1627 if (do_list
&& BTRFS_I(inode
)->delalloc_bytes
== 0 &&
1628 test_bit(BTRFS_INODE_IN_DELALLOC_LIST
,
1629 &BTRFS_I(inode
)->runtime_flags
))
1630 btrfs_del_delalloc_inode(root
, inode
);
1631 spin_unlock(&BTRFS_I(inode
)->lock
);
1636 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1637 * we don't create bios that span stripes or chunks
1639 int btrfs_merge_bio_hook(int rw
, struct page
*page
, unsigned long offset
,
1640 size_t size
, struct bio
*bio
,
1641 unsigned long bio_flags
)
1643 struct btrfs_root
*root
= BTRFS_I(page
->mapping
->host
)->root
;
1644 u64 logical
= (u64
)bio
->bi_sector
<< 9;
1649 if (bio_flags
& EXTENT_BIO_COMPRESSED
)
1652 length
= bio
->bi_size
;
1653 map_length
= length
;
1654 ret
= btrfs_map_block(root
->fs_info
, rw
, logical
,
1655 &map_length
, NULL
, 0);
1656 /* Will always return 0 with map_multi == NULL */
1658 if (map_length
< length
+ size
)
1664 * in order to insert checksums into the metadata in large chunks,
1665 * we wait until bio submission time. All the pages in the bio are
1666 * checksummed and sums are attached onto the ordered extent record.
1668 * At IO completion time the cums attached on the ordered extent record
1669 * are inserted into the btree
1671 static int __btrfs_submit_bio_start(struct inode
*inode
, int rw
,
1672 struct bio
*bio
, int mirror_num
,
1673 unsigned long bio_flags
,
1676 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
1679 ret
= btrfs_csum_one_bio(root
, inode
, bio
, 0, 0);
1680 BUG_ON(ret
); /* -ENOMEM */
1685 * in order to insert checksums into the metadata in large chunks,
1686 * we wait until bio submission time. All the pages in the bio are
1687 * checksummed and sums are attached onto the ordered extent record.
1689 * At IO completion time the cums attached on the ordered extent record
1690 * are inserted into the btree
1692 static int __btrfs_submit_bio_done(struct inode
*inode
, int rw
, struct bio
*bio
,
1693 int mirror_num
, unsigned long bio_flags
,
1696 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
1699 ret
= btrfs_map_bio(root
, rw
, bio
, mirror_num
, 1);
1701 bio_endio(bio
, ret
);
1706 * extent_io.c submission hook. This does the right thing for csum calculation
1707 * on write, or reading the csums from the tree before a read
1709 static int btrfs_submit_bio_hook(struct inode
*inode
, int rw
, struct bio
*bio
,
1710 int mirror_num
, unsigned long bio_flags
,
1713 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
1717 int async
= !atomic_read(&BTRFS_I(inode
)->sync_writers
);
1719 skip_sum
= BTRFS_I(inode
)->flags
& BTRFS_INODE_NODATASUM
;
1721 if (btrfs_is_free_space_inode(inode
))
1724 if (!(rw
& REQ_WRITE
)) {
1725 ret
= btrfs_bio_wq_end_io(root
->fs_info
, bio
, metadata
);
1729 if (bio_flags
& EXTENT_BIO_COMPRESSED
) {
1730 ret
= btrfs_submit_compressed_read(inode
, bio
,
1734 } else if (!skip_sum
) {
1735 ret
= btrfs_lookup_bio_sums(root
, inode
, bio
, NULL
);
1740 } else if (async
&& !skip_sum
) {
1741 /* csum items have already been cloned */
1742 if (root
->root_key
.objectid
== BTRFS_DATA_RELOC_TREE_OBJECTID
)
1744 /* we're doing a write, do the async checksumming */
1745 ret
= btrfs_wq_submit_bio(BTRFS_I(inode
)->root
->fs_info
,
1746 inode
, rw
, bio
, mirror_num
,
1747 bio_flags
, bio_offset
,
1748 __btrfs_submit_bio_start
,
1749 __btrfs_submit_bio_done
);
1751 } else if (!skip_sum
) {
1752 ret
= btrfs_csum_one_bio(root
, inode
, bio
, 0, 0);
1758 ret
= btrfs_map_bio(root
, rw
, bio
, mirror_num
, 0);
1762 bio_endio(bio
, ret
);
1767 * given a list of ordered sums record them in the inode. This happens
1768 * at IO completion time based on sums calculated at bio submission time.
1770 static noinline
int add_pending_csums(struct btrfs_trans_handle
*trans
,
1771 struct inode
*inode
, u64 file_offset
,
1772 struct list_head
*list
)
1774 struct btrfs_ordered_sum
*sum
;
1776 list_for_each_entry(sum
, list
, list
) {
1777 trans
->adding_csums
= 1;
1778 btrfs_csum_file_blocks(trans
,
1779 BTRFS_I(inode
)->root
->fs_info
->csum_root
, sum
);
1780 trans
->adding_csums
= 0;
1785 int btrfs_set_extent_delalloc(struct inode
*inode
, u64 start
, u64 end
,
1786 struct extent_state
**cached_state
)
1788 WARN_ON((end
& (PAGE_CACHE_SIZE
- 1)) == 0);
1789 return set_extent_delalloc(&BTRFS_I(inode
)->io_tree
, start
, end
,
1790 cached_state
, GFP_NOFS
);
1793 /* see btrfs_writepage_start_hook for details on why this is required */
1794 struct btrfs_writepage_fixup
{
1796 struct btrfs_work work
;
1799 static void btrfs_writepage_fixup_worker(struct btrfs_work
*work
)
1801 struct btrfs_writepage_fixup
*fixup
;
1802 struct btrfs_ordered_extent
*ordered
;
1803 struct extent_state
*cached_state
= NULL
;
1805 struct inode
*inode
;
1810 fixup
= container_of(work
, struct btrfs_writepage_fixup
, work
);
1814 if (!page
->mapping
|| !PageDirty(page
) || !PageChecked(page
)) {
1815 ClearPageChecked(page
);
1819 inode
= page
->mapping
->host
;
1820 page_start
= page_offset(page
);
1821 page_end
= page_offset(page
) + PAGE_CACHE_SIZE
- 1;
1823 lock_extent_bits(&BTRFS_I(inode
)->io_tree
, page_start
, page_end
, 0,
1826 /* already ordered? We're done */
1827 if (PagePrivate2(page
))
1830 ordered
= btrfs_lookup_ordered_extent(inode
, page_start
);
1832 unlock_extent_cached(&BTRFS_I(inode
)->io_tree
, page_start
,
1833 page_end
, &cached_state
, GFP_NOFS
);
1835 btrfs_start_ordered_extent(inode
, ordered
, 1);
1836 btrfs_put_ordered_extent(ordered
);
1840 ret
= btrfs_delalloc_reserve_space(inode
, PAGE_CACHE_SIZE
);
1842 mapping_set_error(page
->mapping
, ret
);
1843 end_extent_writepage(page
, ret
, page_start
, page_end
);
1844 ClearPageChecked(page
);
1848 btrfs_set_extent_delalloc(inode
, page_start
, page_end
, &cached_state
);
1849 ClearPageChecked(page
);
1850 set_page_dirty(page
);
1852 unlock_extent_cached(&BTRFS_I(inode
)->io_tree
, page_start
, page_end
,
1853 &cached_state
, GFP_NOFS
);
1856 page_cache_release(page
);
1861 * There are a few paths in the higher layers of the kernel that directly
1862 * set the page dirty bit without asking the filesystem if it is a
1863 * good idea. This causes problems because we want to make sure COW
1864 * properly happens and the data=ordered rules are followed.
1866 * In our case any range that doesn't have the ORDERED bit set
1867 * hasn't been properly setup for IO. We kick off an async process
1868 * to fix it up. The async helper will wait for ordered extents, set
1869 * the delalloc bit and make it safe to write the page.
1871 static int btrfs_writepage_start_hook(struct page
*page
, u64 start
, u64 end
)
1873 struct inode
*inode
= page
->mapping
->host
;
1874 struct btrfs_writepage_fixup
*fixup
;
1875 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
1877 /* this page is properly in the ordered list */
1878 if (TestClearPagePrivate2(page
))
1881 if (PageChecked(page
))
1884 fixup
= kzalloc(sizeof(*fixup
), GFP_NOFS
);
1888 SetPageChecked(page
);
1889 page_cache_get(page
);
1890 fixup
->work
.func
= btrfs_writepage_fixup_worker
;
1892 btrfs_queue_worker(&root
->fs_info
->fixup_workers
, &fixup
->work
);
1896 static int insert_reserved_file_extent(struct btrfs_trans_handle
*trans
,
1897 struct inode
*inode
, u64 file_pos
,
1898 u64 disk_bytenr
, u64 disk_num_bytes
,
1899 u64 num_bytes
, u64 ram_bytes
,
1900 u8 compression
, u8 encryption
,
1901 u16 other_encoding
, int extent_type
)
1903 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
1904 struct btrfs_file_extent_item
*fi
;
1905 struct btrfs_path
*path
;
1906 struct extent_buffer
*leaf
;
1907 struct btrfs_key ins
;
1910 path
= btrfs_alloc_path();
1914 path
->leave_spinning
= 1;
1917 * we may be replacing one extent in the tree with another.
1918 * The new extent is pinned in the extent map, and we don't want
1919 * to drop it from the cache until it is completely in the btree.
1921 * So, tell btrfs_drop_extents to leave this extent in the cache.
1922 * the caller is expected to unpin it and allow it to be merged
1925 ret
= btrfs_drop_extents(trans
, root
, inode
, file_pos
,
1926 file_pos
+ num_bytes
, 0);
1930 ins
.objectid
= btrfs_ino(inode
);
1931 ins
.offset
= file_pos
;
1932 ins
.type
= BTRFS_EXTENT_DATA_KEY
;
1933 ret
= btrfs_insert_empty_item(trans
, root
, path
, &ins
, sizeof(*fi
));
1936 leaf
= path
->nodes
[0];
1937 fi
= btrfs_item_ptr(leaf
, path
->slots
[0],
1938 struct btrfs_file_extent_item
);
1939 btrfs_set_file_extent_generation(leaf
, fi
, trans
->transid
);
1940 btrfs_set_file_extent_type(leaf
, fi
, extent_type
);
1941 btrfs_set_file_extent_disk_bytenr(leaf
, fi
, disk_bytenr
);
1942 btrfs_set_file_extent_disk_num_bytes(leaf
, fi
, disk_num_bytes
);
1943 btrfs_set_file_extent_offset(leaf
, fi
, 0);
1944 btrfs_set_file_extent_num_bytes(leaf
, fi
, num_bytes
);
1945 btrfs_set_file_extent_ram_bytes(leaf
, fi
, ram_bytes
);
1946 btrfs_set_file_extent_compression(leaf
, fi
, compression
);
1947 btrfs_set_file_extent_encryption(leaf
, fi
, encryption
);
1948 btrfs_set_file_extent_other_encoding(leaf
, fi
, other_encoding
);
1950 btrfs_mark_buffer_dirty(leaf
);
1951 btrfs_release_path(path
);
1953 inode_add_bytes(inode
, num_bytes
);
1955 ins
.objectid
= disk_bytenr
;
1956 ins
.offset
= disk_num_bytes
;
1957 ins
.type
= BTRFS_EXTENT_ITEM_KEY
;
1958 ret
= btrfs_alloc_reserved_file_extent(trans
, root
,
1959 root
->root_key
.objectid
,
1960 btrfs_ino(inode
), file_pos
, &ins
);
1962 btrfs_free_path(path
);
1967 /* snapshot-aware defrag */
1968 struct sa_defrag_extent_backref
{
1969 struct rb_node node
;
1970 struct old_sa_defrag_extent
*old
;
1979 struct old_sa_defrag_extent
{
1980 struct list_head list
;
1981 struct new_sa_defrag_extent
*new;
1990 struct new_sa_defrag_extent
{
1991 struct rb_root root
;
1992 struct list_head head
;
1993 struct btrfs_path
*path
;
1994 struct inode
*inode
;
2002 static int backref_comp(struct sa_defrag_extent_backref
*b1
,
2003 struct sa_defrag_extent_backref
*b2
)
2005 if (b1
->root_id
< b2
->root_id
)
2007 else if (b1
->root_id
> b2
->root_id
)
2010 if (b1
->inum
< b2
->inum
)
2012 else if (b1
->inum
> b2
->inum
)
2015 if (b1
->file_pos
< b2
->file_pos
)
2017 else if (b1
->file_pos
> b2
->file_pos
)
2021 * [------------------------------] ===> (a range of space)
2022 * |<--->| |<---->| =============> (fs/file tree A)
2023 * |<---------------------------->| ===> (fs/file tree B)
2025 * A range of space can refer to two file extents in one tree while
2026 * refer to only one file extent in another tree.
2028 * So we may process a disk offset more than one time(two extents in A)
2029 * and locate at the same extent(one extent in B), then insert two same
2030 * backrefs(both refer to the extent in B).
2035 static void backref_insert(struct rb_root
*root
,
2036 struct sa_defrag_extent_backref
*backref
)
2038 struct rb_node
**p
= &root
->rb_node
;
2039 struct rb_node
*parent
= NULL
;
2040 struct sa_defrag_extent_backref
*entry
;
2045 entry
= rb_entry(parent
, struct sa_defrag_extent_backref
, node
);
2047 ret
= backref_comp(backref
, entry
);
2051 p
= &(*p
)->rb_right
;
2054 rb_link_node(&backref
->node
, parent
, p
);
2055 rb_insert_color(&backref
->node
, root
);
2059 * Note the backref might has changed, and in this case we just return 0.
2061 static noinline
int record_one_backref(u64 inum
, u64 offset
, u64 root_id
,
2064 struct btrfs_file_extent_item
*extent
;
2065 struct btrfs_fs_info
*fs_info
;
2066 struct old_sa_defrag_extent
*old
= ctx
;
2067 struct new_sa_defrag_extent
*new = old
->new;
2068 struct btrfs_path
*path
= new->path
;
2069 struct btrfs_key key
;
2070 struct btrfs_root
*root
;
2071 struct sa_defrag_extent_backref
*backref
;
2072 struct extent_buffer
*leaf
;
2073 struct inode
*inode
= new->inode
;
2079 if (BTRFS_I(inode
)->root
->root_key
.objectid
== root_id
&&
2080 inum
== btrfs_ino(inode
))
2083 key
.objectid
= root_id
;
2084 key
.type
= BTRFS_ROOT_ITEM_KEY
;
2085 key
.offset
= (u64
)-1;
2087 fs_info
= BTRFS_I(inode
)->root
->fs_info
;
2088 root
= btrfs_read_fs_root_no_name(fs_info
, &key
);
2090 if (PTR_ERR(root
) == -ENOENT
)
2093 pr_debug("inum=%llu, offset=%llu, root_id=%llu\n",
2094 inum
, offset
, root_id
);
2095 return PTR_ERR(root
);
2098 key
.objectid
= inum
;
2099 key
.type
= BTRFS_EXTENT_DATA_KEY
;
2100 if (offset
> (u64
)-1 << 32)
2103 key
.offset
= offset
;
2105 ret
= btrfs_search_slot(NULL
, root
, &key
, path
, 0, 0);
2115 leaf
= path
->nodes
[0];
2116 slot
= path
->slots
[0];
2118 if (slot
>= btrfs_header_nritems(leaf
)) {
2119 ret
= btrfs_next_leaf(root
, path
);
2122 } else if (ret
> 0) {
2131 btrfs_item_key_to_cpu(leaf
, &key
, slot
);
2133 if (key
.objectid
> inum
)
2136 if (key
.objectid
< inum
|| key
.type
!= BTRFS_EXTENT_DATA_KEY
)
2139 extent
= btrfs_item_ptr(leaf
, slot
,
2140 struct btrfs_file_extent_item
);
2142 if (btrfs_file_extent_disk_bytenr(leaf
, extent
) != old
->bytenr
)
2146 * 'offset' refers to the exact key.offset,
2147 * NOT the 'offset' field in btrfs_extent_data_ref, ie.
2148 * (key.offset - extent_offset).
2150 if (key
.offset
!= offset
)
2153 extent_offset
= btrfs_file_extent_offset(leaf
, extent
);
2154 num_bytes
= btrfs_file_extent_num_bytes(leaf
, extent
);
2156 if (extent_offset
>= old
->extent_offset
+ old
->offset
+
2157 old
->len
|| extent_offset
+ num_bytes
<=
2158 old
->extent_offset
+ old
->offset
)
2163 backref
= kmalloc(sizeof(*backref
), GFP_NOFS
);
2169 backref
->root_id
= root_id
;
2170 backref
->inum
= inum
;
2171 backref
->file_pos
= offset
;
2172 backref
->num_bytes
= num_bytes
;
2173 backref
->extent_offset
= extent_offset
;
2174 backref
->generation
= btrfs_file_extent_generation(leaf
, extent
);
2176 backref_insert(&new->root
, backref
);
2179 btrfs_release_path(path
);
2184 static noinline
bool record_extent_backrefs(struct btrfs_path
*path
,
2185 struct new_sa_defrag_extent
*new)
2187 struct btrfs_fs_info
*fs_info
= BTRFS_I(new->inode
)->root
->fs_info
;
2188 struct old_sa_defrag_extent
*old
, *tmp
;
2193 list_for_each_entry_safe(old
, tmp
, &new->head
, list
) {
2194 ret
= iterate_inodes_from_logical(old
->bytenr
+
2195 old
->extent_offset
, fs_info
,
2196 path
, record_one_backref
,
2198 BUG_ON(ret
< 0 && ret
!= -ENOENT
);
2200 /* no backref to be processed for this extent */
2202 list_del(&old
->list
);
2207 if (list_empty(&new->head
))
2213 static int relink_is_mergable(struct extent_buffer
*leaf
,
2214 struct btrfs_file_extent_item
*fi
,
2215 struct new_sa_defrag_extent
*new)
2217 if (btrfs_file_extent_disk_bytenr(leaf
, fi
) != new->bytenr
)
2220 if (btrfs_file_extent_type(leaf
, fi
) != BTRFS_FILE_EXTENT_REG
)
2223 if (btrfs_file_extent_compression(leaf
, fi
) != new->compress_type
)
2226 if (btrfs_file_extent_encryption(leaf
, fi
) ||
2227 btrfs_file_extent_other_encoding(leaf
, fi
))
2234 * Note the backref might has changed, and in this case we just return 0.
2236 static noinline
int relink_extent_backref(struct btrfs_path
*path
,
2237 struct sa_defrag_extent_backref
*prev
,
2238 struct sa_defrag_extent_backref
*backref
)
2240 struct btrfs_file_extent_item
*extent
;
2241 struct btrfs_file_extent_item
*item
;
2242 struct btrfs_ordered_extent
*ordered
;
2243 struct btrfs_trans_handle
*trans
;
2244 struct btrfs_fs_info
*fs_info
;
2245 struct btrfs_root
*root
;
2246 struct btrfs_key key
;
2247 struct extent_buffer
*leaf
;
2248 struct old_sa_defrag_extent
*old
= backref
->old
;
2249 struct new_sa_defrag_extent
*new = old
->new;
2250 struct inode
*src_inode
= new->inode
;
2251 struct inode
*inode
;
2252 struct extent_state
*cached
= NULL
;
2261 if (prev
&& prev
->root_id
== backref
->root_id
&&
2262 prev
->inum
== backref
->inum
&&
2263 prev
->file_pos
+ prev
->num_bytes
== backref
->file_pos
)
2266 /* step 1: get root */
2267 key
.objectid
= backref
->root_id
;
2268 key
.type
= BTRFS_ROOT_ITEM_KEY
;
2269 key
.offset
= (u64
)-1;
2271 fs_info
= BTRFS_I(src_inode
)->root
->fs_info
;
2272 index
= srcu_read_lock(&fs_info
->subvol_srcu
);
2274 root
= btrfs_read_fs_root_no_name(fs_info
, &key
);
2276 srcu_read_unlock(&fs_info
->subvol_srcu
, index
);
2277 if (PTR_ERR(root
) == -ENOENT
)
2279 return PTR_ERR(root
);
2282 /* step 2: get inode */
2283 key
.objectid
= backref
->inum
;
2284 key
.type
= BTRFS_INODE_ITEM_KEY
;
2287 inode
= btrfs_iget(fs_info
->sb
, &key
, root
, NULL
);
2288 if (IS_ERR(inode
)) {
2289 srcu_read_unlock(&fs_info
->subvol_srcu
, index
);
2293 srcu_read_unlock(&fs_info
->subvol_srcu
, index
);
2295 /* step 3: relink backref */
2296 lock_start
= backref
->file_pos
;
2297 lock_end
= backref
->file_pos
+ backref
->num_bytes
- 1;
2298 lock_extent_bits(&BTRFS_I(inode
)->io_tree
, lock_start
, lock_end
,
2301 ordered
= btrfs_lookup_first_ordered_extent(inode
, lock_end
);
2303 btrfs_put_ordered_extent(ordered
);
2307 trans
= btrfs_join_transaction(root
);
2308 if (IS_ERR(trans
)) {
2309 ret
= PTR_ERR(trans
);
2313 key
.objectid
= backref
->inum
;
2314 key
.type
= BTRFS_EXTENT_DATA_KEY
;
2315 key
.offset
= backref
->file_pos
;
2317 ret
= btrfs_search_slot(NULL
, root
, &key
, path
, 0, 0);
2320 } else if (ret
> 0) {
2325 extent
= btrfs_item_ptr(path
->nodes
[0], path
->slots
[0],
2326 struct btrfs_file_extent_item
);
2328 if (btrfs_file_extent_generation(path
->nodes
[0], extent
) !=
2329 backref
->generation
)
2332 btrfs_release_path(path
);
2334 start
= backref
->file_pos
;
2335 if (backref
->extent_offset
< old
->extent_offset
+ old
->offset
)
2336 start
+= old
->extent_offset
+ old
->offset
-
2337 backref
->extent_offset
;
2339 len
= min(backref
->extent_offset
+ backref
->num_bytes
,
2340 old
->extent_offset
+ old
->offset
+ old
->len
);
2341 len
-= max(backref
->extent_offset
, old
->extent_offset
+ old
->offset
);
2343 ret
= btrfs_drop_extents(trans
, root
, inode
, start
,
2348 key
.objectid
= btrfs_ino(inode
);
2349 key
.type
= BTRFS_EXTENT_DATA_KEY
;
2352 path
->leave_spinning
= 1;
2354 struct btrfs_file_extent_item
*fi
;
2356 struct btrfs_key found_key
;
2358 ret
= btrfs_search_slot(trans
, root
, &key
, path
, 1, 1);
2363 leaf
= path
->nodes
[0];
2364 btrfs_item_key_to_cpu(leaf
, &found_key
, path
->slots
[0]);
2366 fi
= btrfs_item_ptr(leaf
, path
->slots
[0],
2367 struct btrfs_file_extent_item
);
2368 extent_len
= btrfs_file_extent_num_bytes(leaf
, fi
);
2370 if (extent_len
+ found_key
.offset
== start
&&
2371 relink_is_mergable(leaf
, fi
, new)) {
2372 btrfs_set_file_extent_num_bytes(leaf
, fi
,
2374 btrfs_mark_buffer_dirty(leaf
);
2375 inode_add_bytes(inode
, len
);
2381 btrfs_release_path(path
);
2386 ret
= btrfs_insert_empty_item(trans
, root
, path
, &key
,
2389 btrfs_abort_transaction(trans
, root
, ret
);
2393 leaf
= path
->nodes
[0];
2394 item
= btrfs_item_ptr(leaf
, path
->slots
[0],
2395 struct btrfs_file_extent_item
);
2396 btrfs_set_file_extent_disk_bytenr(leaf
, item
, new->bytenr
);
2397 btrfs_set_file_extent_disk_num_bytes(leaf
, item
, new->disk_len
);
2398 btrfs_set_file_extent_offset(leaf
, item
, start
- new->file_pos
);
2399 btrfs_set_file_extent_num_bytes(leaf
, item
, len
);
2400 btrfs_set_file_extent_ram_bytes(leaf
, item
, new->len
);
2401 btrfs_set_file_extent_generation(leaf
, item
, trans
->transid
);
2402 btrfs_set_file_extent_type(leaf
, item
, BTRFS_FILE_EXTENT_REG
);
2403 btrfs_set_file_extent_compression(leaf
, item
, new->compress_type
);
2404 btrfs_set_file_extent_encryption(leaf
, item
, 0);
2405 btrfs_set_file_extent_other_encoding(leaf
, item
, 0);
2407 btrfs_mark_buffer_dirty(leaf
);
2408 inode_add_bytes(inode
, len
);
2409 btrfs_release_path(path
);
2411 ret
= btrfs_inc_extent_ref(trans
, root
, new->bytenr
,
2413 backref
->root_id
, backref
->inum
,
2414 new->file_pos
, 0); /* start - extent_offset */
2416 btrfs_abort_transaction(trans
, root
, ret
);
2422 btrfs_release_path(path
);
2423 path
->leave_spinning
= 0;
2424 btrfs_end_transaction(trans
, root
);
2426 unlock_extent_cached(&BTRFS_I(inode
)->io_tree
, lock_start
, lock_end
,
2432 static void relink_file_extents(struct new_sa_defrag_extent
*new)
2434 struct btrfs_path
*path
;
2435 struct old_sa_defrag_extent
*old
, *tmp
;
2436 struct sa_defrag_extent_backref
*backref
;
2437 struct sa_defrag_extent_backref
*prev
= NULL
;
2438 struct inode
*inode
;
2439 struct btrfs_root
*root
;
2440 struct rb_node
*node
;
2444 root
= BTRFS_I(inode
)->root
;
2446 path
= btrfs_alloc_path();
2450 if (!record_extent_backrefs(path
, new)) {
2451 btrfs_free_path(path
);
2454 btrfs_release_path(path
);
2457 node
= rb_first(&new->root
);
2460 rb_erase(node
, &new->root
);
2462 backref
= rb_entry(node
, struct sa_defrag_extent_backref
, node
);
2464 ret
= relink_extent_backref(path
, prev
, backref
);
2477 btrfs_free_path(path
);
2479 list_for_each_entry_safe(old
, tmp
, &new->head
, list
) {
2480 list_del(&old
->list
);
2484 atomic_dec(&root
->fs_info
->defrag_running
);
2485 wake_up(&root
->fs_info
->transaction_wait
);
2490 static struct new_sa_defrag_extent
*
2491 record_old_file_extents(struct inode
*inode
,
2492 struct btrfs_ordered_extent
*ordered
)
2494 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
2495 struct btrfs_path
*path
;
2496 struct btrfs_key key
;
2497 struct old_sa_defrag_extent
*old
, *tmp
;
2498 struct new_sa_defrag_extent
*new;
2501 new = kmalloc(sizeof(*new), GFP_NOFS
);
2506 new->file_pos
= ordered
->file_offset
;
2507 new->len
= ordered
->len
;
2508 new->bytenr
= ordered
->start
;
2509 new->disk_len
= ordered
->disk_len
;
2510 new->compress_type
= ordered
->compress_type
;
2511 new->root
= RB_ROOT
;
2512 INIT_LIST_HEAD(&new->head
);
2514 path
= btrfs_alloc_path();
2518 key
.objectid
= btrfs_ino(inode
);
2519 key
.type
= BTRFS_EXTENT_DATA_KEY
;
2520 key
.offset
= new->file_pos
;
2522 ret
= btrfs_search_slot(NULL
, root
, &key
, path
, 0, 0);
2525 if (ret
> 0 && path
->slots
[0] > 0)
2528 /* find out all the old extents for the file range */
2530 struct btrfs_file_extent_item
*extent
;
2531 struct extent_buffer
*l
;
2540 slot
= path
->slots
[0];
2542 if (slot
>= btrfs_header_nritems(l
)) {
2543 ret
= btrfs_next_leaf(root
, path
);
2551 btrfs_item_key_to_cpu(l
, &key
, slot
);
2553 if (key
.objectid
!= btrfs_ino(inode
))
2555 if (key
.type
!= BTRFS_EXTENT_DATA_KEY
)
2557 if (key
.offset
>= new->file_pos
+ new->len
)
2560 extent
= btrfs_item_ptr(l
, slot
, struct btrfs_file_extent_item
);
2562 num_bytes
= btrfs_file_extent_num_bytes(l
, extent
);
2563 if (key
.offset
+ num_bytes
< new->file_pos
)
2566 disk_bytenr
= btrfs_file_extent_disk_bytenr(l
, extent
);
2570 extent_offset
= btrfs_file_extent_offset(l
, extent
);
2572 old
= kmalloc(sizeof(*old
), GFP_NOFS
);
2576 offset
= max(new->file_pos
, key
.offset
);
2577 end
= min(new->file_pos
+ new->len
, key
.offset
+ num_bytes
);
2579 old
->bytenr
= disk_bytenr
;
2580 old
->extent_offset
= extent_offset
;
2581 old
->offset
= offset
- key
.offset
;
2582 old
->len
= end
- offset
;
2585 list_add_tail(&old
->list
, &new->head
);
2591 btrfs_free_path(path
);
2592 atomic_inc(&root
->fs_info
->defrag_running
);
2597 list_for_each_entry_safe(old
, tmp
, &new->head
, list
) {
2598 list_del(&old
->list
);
2602 btrfs_free_path(path
);
2609 * helper function for btrfs_finish_ordered_io, this
2610 * just reads in some of the csum leaves to prime them into ram
2611 * before we start the transaction. It limits the amount of btree
2612 * reads required while inside the transaction.
2614 /* as ordered data IO finishes, this gets called so we can finish
2615 * an ordered extent if the range of bytes in the file it covers are
2618 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent
*ordered_extent
)
2620 struct inode
*inode
= ordered_extent
->inode
;
2621 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
2622 struct btrfs_trans_handle
*trans
= NULL
;
2623 struct extent_io_tree
*io_tree
= &BTRFS_I(inode
)->io_tree
;
2624 struct extent_state
*cached_state
= NULL
;
2625 struct new_sa_defrag_extent
*new = NULL
;
2626 int compress_type
= 0;
2630 nolock
= btrfs_is_free_space_inode(inode
);
2632 if (test_bit(BTRFS_ORDERED_IOERR
, &ordered_extent
->flags
)) {
2637 if (test_bit(BTRFS_ORDERED_NOCOW
, &ordered_extent
->flags
)) {
2638 BUG_ON(!list_empty(&ordered_extent
->list
)); /* Logic error */
2639 btrfs_ordered_update_i_size(inode
, 0, ordered_extent
);
2641 trans
= btrfs_join_transaction_nolock(root
);
2643 trans
= btrfs_join_transaction(root
);
2644 if (IS_ERR(trans
)) {
2645 ret
= PTR_ERR(trans
);
2649 trans
->block_rsv
= &root
->fs_info
->delalloc_block_rsv
;
2650 ret
= btrfs_update_inode_fallback(trans
, root
, inode
);
2651 if (ret
) /* -ENOMEM or corruption */
2652 btrfs_abort_transaction(trans
, root
, ret
);
2656 lock_extent_bits(io_tree
, ordered_extent
->file_offset
,
2657 ordered_extent
->file_offset
+ ordered_extent
->len
- 1,
2660 ret
= test_range_bit(io_tree
, ordered_extent
->file_offset
,
2661 ordered_extent
->file_offset
+ ordered_extent
->len
- 1,
2662 EXTENT_DEFRAG
, 1, cached_state
);
2664 u64 last_snapshot
= btrfs_root_last_snapshot(&root
->root_item
);
2665 if (last_snapshot
>= BTRFS_I(inode
)->generation
)
2666 /* the inode is shared */
2667 new = record_old_file_extents(inode
, ordered_extent
);
2669 clear_extent_bit(io_tree
, ordered_extent
->file_offset
,
2670 ordered_extent
->file_offset
+ ordered_extent
->len
- 1,
2671 EXTENT_DEFRAG
, 0, 0, &cached_state
, GFP_NOFS
);
2675 trans
= btrfs_join_transaction_nolock(root
);
2677 trans
= btrfs_join_transaction(root
);
2678 if (IS_ERR(trans
)) {
2679 ret
= PTR_ERR(trans
);
2683 trans
->block_rsv
= &root
->fs_info
->delalloc_block_rsv
;
2685 if (test_bit(BTRFS_ORDERED_COMPRESSED
, &ordered_extent
->flags
))
2686 compress_type
= ordered_extent
->compress_type
;
2687 if (test_bit(BTRFS_ORDERED_PREALLOC
, &ordered_extent
->flags
)) {
2688 BUG_ON(compress_type
);
2689 ret
= btrfs_mark_extent_written(trans
, inode
,
2690 ordered_extent
->file_offset
,
2691 ordered_extent
->file_offset
+
2692 ordered_extent
->len
);
2694 BUG_ON(root
== root
->fs_info
->tree_root
);
2695 ret
= insert_reserved_file_extent(trans
, inode
,
2696 ordered_extent
->file_offset
,
2697 ordered_extent
->start
,
2698 ordered_extent
->disk_len
,
2699 ordered_extent
->len
,
2700 ordered_extent
->len
,
2701 compress_type
, 0, 0,
2702 BTRFS_FILE_EXTENT_REG
);
2704 unpin_extent_cache(&BTRFS_I(inode
)->extent_tree
,
2705 ordered_extent
->file_offset
, ordered_extent
->len
,
2708 btrfs_abort_transaction(trans
, root
, ret
);
2712 add_pending_csums(trans
, inode
, ordered_extent
->file_offset
,
2713 &ordered_extent
->list
);
2715 btrfs_ordered_update_i_size(inode
, 0, ordered_extent
);
2716 ret
= btrfs_update_inode_fallback(trans
, root
, inode
);
2717 if (ret
) { /* -ENOMEM or corruption */
2718 btrfs_abort_transaction(trans
, root
, ret
);
2723 unlock_extent_cached(io_tree
, ordered_extent
->file_offset
,
2724 ordered_extent
->file_offset
+
2725 ordered_extent
->len
- 1, &cached_state
, GFP_NOFS
);
2727 if (root
!= root
->fs_info
->tree_root
)
2728 btrfs_delalloc_release_metadata(inode
, ordered_extent
->len
);
2730 btrfs_end_transaction(trans
, root
);
2733 clear_extent_uptodate(io_tree
, ordered_extent
->file_offset
,
2734 ordered_extent
->file_offset
+
2735 ordered_extent
->len
- 1, NULL
, GFP_NOFS
);
2738 * If the ordered extent had an IOERR or something else went
2739 * wrong we need to return the space for this ordered extent
2740 * back to the allocator.
2742 if (!test_bit(BTRFS_ORDERED_NOCOW
, &ordered_extent
->flags
) &&
2743 !test_bit(BTRFS_ORDERED_PREALLOC
, &ordered_extent
->flags
))
2744 btrfs_free_reserved_extent(root
, ordered_extent
->start
,
2745 ordered_extent
->disk_len
);
2750 * This needs to be done to make sure anybody waiting knows we are done
2751 * updating everything for this ordered extent.
2753 btrfs_remove_ordered_extent(inode
, ordered_extent
);
2755 /* for snapshot-aware defrag */
2757 relink_file_extents(new);
2760 btrfs_put_ordered_extent(ordered_extent
);
2761 /* once for the tree */
2762 btrfs_put_ordered_extent(ordered_extent
);
2767 static void finish_ordered_fn(struct btrfs_work
*work
)
2769 struct btrfs_ordered_extent
*ordered_extent
;
2770 ordered_extent
= container_of(work
, struct btrfs_ordered_extent
, work
);
2771 btrfs_finish_ordered_io(ordered_extent
);
2774 static int btrfs_writepage_end_io_hook(struct page
*page
, u64 start
, u64 end
,
2775 struct extent_state
*state
, int uptodate
)
2777 struct inode
*inode
= page
->mapping
->host
;
2778 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
2779 struct btrfs_ordered_extent
*ordered_extent
= NULL
;
2780 struct btrfs_workers
*workers
;
2782 trace_btrfs_writepage_end_io_hook(page
, start
, end
, uptodate
);
2784 ClearPagePrivate2(page
);
2785 if (!btrfs_dec_test_ordered_pending(inode
, &ordered_extent
, start
,
2786 end
- start
+ 1, uptodate
))
2789 ordered_extent
->work
.func
= finish_ordered_fn
;
2790 ordered_extent
->work
.flags
= 0;
2792 if (btrfs_is_free_space_inode(inode
))
2793 workers
= &root
->fs_info
->endio_freespace_worker
;
2795 workers
= &root
->fs_info
->endio_write_workers
;
2796 btrfs_queue_worker(workers
, &ordered_extent
->work
);
2802 * when reads are done, we need to check csums to verify the data is correct
2803 * if there's a match, we allow the bio to finish. If not, the code in
2804 * extent_io.c will try to find good copies for us.
2806 static int btrfs_readpage_end_io_hook(struct btrfs_io_bio
*io_bio
,
2807 u64 phy_offset
, struct page
*page
,
2808 u64 start
, u64 end
, int mirror
)
2810 size_t offset
= start
- page_offset(page
);
2811 struct inode
*inode
= page
->mapping
->host
;
2812 struct extent_io_tree
*io_tree
= &BTRFS_I(inode
)->io_tree
;
2814 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
2817 static DEFINE_RATELIMIT_STATE(_rs
, DEFAULT_RATELIMIT_INTERVAL
,
2818 DEFAULT_RATELIMIT_BURST
);
2820 if (PageChecked(page
)) {
2821 ClearPageChecked(page
);
2825 if (BTRFS_I(inode
)->flags
& BTRFS_INODE_NODATASUM
)
2828 if (root
->root_key
.objectid
== BTRFS_DATA_RELOC_TREE_OBJECTID
&&
2829 test_range_bit(io_tree
, start
, end
, EXTENT_NODATASUM
, 1, NULL
)) {
2830 clear_extent_bits(io_tree
, start
, end
, EXTENT_NODATASUM
,
2835 phy_offset
>>= inode
->i_sb
->s_blocksize_bits
;
2836 csum_expected
= *(((u32
*)io_bio
->csum
) + phy_offset
);
2838 kaddr
= kmap_atomic(page
);
2839 csum
= btrfs_csum_data(kaddr
+ offset
, csum
, end
- start
+ 1);
2840 btrfs_csum_final(csum
, (char *)&csum
);
2841 if (csum
!= csum_expected
)
2844 kunmap_atomic(kaddr
);
2849 if (__ratelimit(&_rs
))
2850 btrfs_info(root
->fs_info
, "csum failed ino %llu off %llu csum %u expected csum %u",
2851 (unsigned long long)btrfs_ino(page
->mapping
->host
),
2852 (unsigned long long)start
, csum
, csum_expected
);
2853 memset(kaddr
+ offset
, 1, end
- start
+ 1);
2854 flush_dcache_page(page
);
2855 kunmap_atomic(kaddr
);
2856 if (csum_expected
== 0)
2861 struct delayed_iput
{
2862 struct list_head list
;
2863 struct inode
*inode
;
2866 /* JDM: If this is fs-wide, why can't we add a pointer to
2867 * btrfs_inode instead and avoid the allocation? */
2868 void btrfs_add_delayed_iput(struct inode
*inode
)
2870 struct btrfs_fs_info
*fs_info
= BTRFS_I(inode
)->root
->fs_info
;
2871 struct delayed_iput
*delayed
;
2873 if (atomic_add_unless(&inode
->i_count
, -1, 1))
2876 delayed
= kmalloc(sizeof(*delayed
), GFP_NOFS
| __GFP_NOFAIL
);
2877 delayed
->inode
= inode
;
2879 spin_lock(&fs_info
->delayed_iput_lock
);
2880 list_add_tail(&delayed
->list
, &fs_info
->delayed_iputs
);
2881 spin_unlock(&fs_info
->delayed_iput_lock
);
2884 void btrfs_run_delayed_iputs(struct btrfs_root
*root
)
2887 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
2888 struct delayed_iput
*delayed
;
2891 spin_lock(&fs_info
->delayed_iput_lock
);
2892 empty
= list_empty(&fs_info
->delayed_iputs
);
2893 spin_unlock(&fs_info
->delayed_iput_lock
);
2897 spin_lock(&fs_info
->delayed_iput_lock
);
2898 list_splice_init(&fs_info
->delayed_iputs
, &list
);
2899 spin_unlock(&fs_info
->delayed_iput_lock
);
2901 while (!list_empty(&list
)) {
2902 delayed
= list_entry(list
.next
, struct delayed_iput
, list
);
2903 list_del(&delayed
->list
);
2904 iput(delayed
->inode
);
2910 * This is called in transaction commit time. If there are no orphan
2911 * files in the subvolume, it removes orphan item and frees block_rsv
2914 void btrfs_orphan_commit_root(struct btrfs_trans_handle
*trans
,
2915 struct btrfs_root
*root
)
2917 struct btrfs_block_rsv
*block_rsv
;
2920 if (atomic_read(&root
->orphan_inodes
) ||
2921 root
->orphan_cleanup_state
!= ORPHAN_CLEANUP_DONE
)
2924 spin_lock(&root
->orphan_lock
);
2925 if (atomic_read(&root
->orphan_inodes
)) {
2926 spin_unlock(&root
->orphan_lock
);
2930 if (root
->orphan_cleanup_state
!= ORPHAN_CLEANUP_DONE
) {
2931 spin_unlock(&root
->orphan_lock
);
2935 block_rsv
= root
->orphan_block_rsv
;
2936 root
->orphan_block_rsv
= NULL
;
2937 spin_unlock(&root
->orphan_lock
);
2939 if (root
->orphan_item_inserted
&&
2940 btrfs_root_refs(&root
->root_item
) > 0) {
2941 ret
= btrfs_del_orphan_item(trans
, root
->fs_info
->tree_root
,
2942 root
->root_key
.objectid
);
2944 btrfs_abort_transaction(trans
, root
, ret
);
2946 root
->orphan_item_inserted
= 0;
2950 WARN_ON(block_rsv
->size
> 0);
2951 btrfs_free_block_rsv(root
, block_rsv
);
2956 * This creates an orphan entry for the given inode in case something goes
2957 * wrong in the middle of an unlink/truncate.
2959 * NOTE: caller of this function should reserve 5 units of metadata for
2962 int btrfs_orphan_add(struct btrfs_trans_handle
*trans
, struct inode
*inode
)
2964 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
2965 struct btrfs_block_rsv
*block_rsv
= NULL
;
2970 if (!root
->orphan_block_rsv
) {
2971 block_rsv
= btrfs_alloc_block_rsv(root
, BTRFS_BLOCK_RSV_TEMP
);
2976 spin_lock(&root
->orphan_lock
);
2977 if (!root
->orphan_block_rsv
) {
2978 root
->orphan_block_rsv
= block_rsv
;
2979 } else if (block_rsv
) {
2980 btrfs_free_block_rsv(root
, block_rsv
);
2984 if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM
,
2985 &BTRFS_I(inode
)->runtime_flags
)) {
2988 * For proper ENOSPC handling, we should do orphan
2989 * cleanup when mounting. But this introduces backward
2990 * compatibility issue.
2992 if (!xchg(&root
->orphan_item_inserted
, 1))
2998 atomic_inc(&root
->orphan_inodes
);
3001 if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED
,
3002 &BTRFS_I(inode
)->runtime_flags
))
3004 spin_unlock(&root
->orphan_lock
);
3006 /* grab metadata reservation from transaction handle */
3008 ret
= btrfs_orphan_reserve_metadata(trans
, inode
);
3009 BUG_ON(ret
); /* -ENOSPC in reservation; Logic error? JDM */
3012 /* insert an orphan item to track this unlinked/truncated file */
3014 ret
= btrfs_insert_orphan_item(trans
, root
, btrfs_ino(inode
));
3016 clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM
,
3017 &BTRFS_I(inode
)->runtime_flags
);
3019 clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED
,
3020 &BTRFS_I(inode
)->runtime_flags
);
3021 btrfs_orphan_release_metadata(inode
);
3023 if (ret
!= -EEXIST
) {
3024 btrfs_abort_transaction(trans
, root
, ret
);
3031 /* insert an orphan item to track subvolume contains orphan files */
3033 ret
= btrfs_insert_orphan_item(trans
, root
->fs_info
->tree_root
,
3034 root
->root_key
.objectid
);
3035 if (ret
&& ret
!= -EEXIST
) {
3036 btrfs_abort_transaction(trans
, root
, ret
);
3044 * We have done the truncate/delete so we can go ahead and remove the orphan
3045 * item for this particular inode.
3047 static int btrfs_orphan_del(struct btrfs_trans_handle
*trans
,
3048 struct inode
*inode
)
3050 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
3051 int delete_item
= 0;
3052 int release_rsv
= 0;
3055 spin_lock(&root
->orphan_lock
);
3056 if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM
,
3057 &BTRFS_I(inode
)->runtime_flags
))
3060 if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED
,
3061 &BTRFS_I(inode
)->runtime_flags
))
3063 spin_unlock(&root
->orphan_lock
);
3065 if (trans
&& delete_item
)
3066 ret
= btrfs_del_orphan_item(trans
, root
, btrfs_ino(inode
));
3069 btrfs_orphan_release_metadata(inode
);
3070 atomic_dec(&root
->orphan_inodes
);
3077 * this cleans up any orphans that may be left on the list from the last use
3080 int btrfs_orphan_cleanup(struct btrfs_root
*root
)
3082 struct btrfs_path
*path
;
3083 struct extent_buffer
*leaf
;
3084 struct btrfs_key key
, found_key
;
3085 struct btrfs_trans_handle
*trans
;
3086 struct inode
*inode
;
3087 u64 last_objectid
= 0;
3088 int ret
= 0, nr_unlink
= 0, nr_truncate
= 0;
3090 if (cmpxchg(&root
->orphan_cleanup_state
, 0, ORPHAN_CLEANUP_STARTED
))
3093 path
= btrfs_alloc_path();
3100 key
.objectid
= BTRFS_ORPHAN_OBJECTID
;
3101 btrfs_set_key_type(&key
, BTRFS_ORPHAN_ITEM_KEY
);
3102 key
.offset
= (u64
)-1;
3105 ret
= btrfs_search_slot(NULL
, root
, &key
, path
, 0, 0);
3110 * if ret == 0 means we found what we were searching for, which
3111 * is weird, but possible, so only screw with path if we didn't
3112 * find the key and see if we have stuff that matches
3116 if (path
->slots
[0] == 0)
3121 /* pull out the item */
3122 leaf
= path
->nodes
[0];
3123 btrfs_item_key_to_cpu(leaf
, &found_key
, path
->slots
[0]);
3125 /* make sure the item matches what we want */
3126 if (found_key
.objectid
!= BTRFS_ORPHAN_OBJECTID
)
3128 if (btrfs_key_type(&found_key
) != BTRFS_ORPHAN_ITEM_KEY
)
3131 /* release the path since we're done with it */
3132 btrfs_release_path(path
);
3135 * this is where we are basically btrfs_lookup, without the
3136 * crossing root thing. we store the inode number in the
3137 * offset of the orphan item.
3140 if (found_key
.offset
== last_objectid
) {
3141 btrfs_err(root
->fs_info
,
3142 "Error removing orphan entry, stopping orphan cleanup");
3147 last_objectid
= found_key
.offset
;
3149 found_key
.objectid
= found_key
.offset
;
3150 found_key
.type
= BTRFS_INODE_ITEM_KEY
;
3151 found_key
.offset
= 0;
3152 inode
= btrfs_iget(root
->fs_info
->sb
, &found_key
, root
, NULL
);
3153 ret
= PTR_RET(inode
);
3154 if (ret
&& ret
!= -ESTALE
)
3157 if (ret
== -ESTALE
&& root
== root
->fs_info
->tree_root
) {
3158 struct btrfs_root
*dead_root
;
3159 struct btrfs_fs_info
*fs_info
= root
->fs_info
;
3160 int is_dead_root
= 0;
3163 * this is an orphan in the tree root. Currently these
3164 * could come from 2 sources:
3165 * a) a snapshot deletion in progress
3166 * b) a free space cache inode
3167 * We need to distinguish those two, as the snapshot
3168 * orphan must not get deleted.
3169 * find_dead_roots already ran before us, so if this
3170 * is a snapshot deletion, we should find the root
3171 * in the dead_roots list
3173 spin_lock(&fs_info
->trans_lock
);
3174 list_for_each_entry(dead_root
, &fs_info
->dead_roots
,
3176 if (dead_root
->root_key
.objectid
==
3177 found_key
.objectid
) {
3182 spin_unlock(&fs_info
->trans_lock
);
3184 /* prevent this orphan from being found again */
3185 key
.offset
= found_key
.objectid
- 1;
3190 * Inode is already gone but the orphan item is still there,
3191 * kill the orphan item.
3193 if (ret
== -ESTALE
) {
3194 trans
= btrfs_start_transaction(root
, 1);
3195 if (IS_ERR(trans
)) {
3196 ret
= PTR_ERR(trans
);
3199 btrfs_debug(root
->fs_info
, "auto deleting %Lu",
3200 found_key
.objectid
);
3201 ret
= btrfs_del_orphan_item(trans
, root
,
3202 found_key
.objectid
);
3203 btrfs_end_transaction(trans
, root
);
3210 * add this inode to the orphan list so btrfs_orphan_del does
3211 * the proper thing when we hit it
3213 set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM
,
3214 &BTRFS_I(inode
)->runtime_flags
);
3215 atomic_inc(&root
->orphan_inodes
);
3217 /* if we have links, this was a truncate, lets do that */
3218 if (inode
->i_nlink
) {
3219 if (!S_ISREG(inode
->i_mode
)) {
3226 /* 1 for the orphan item deletion. */
3227 trans
= btrfs_start_transaction(root
, 1);
3228 if (IS_ERR(trans
)) {
3230 ret
= PTR_ERR(trans
);
3233 ret
= btrfs_orphan_add(trans
, inode
);
3234 btrfs_end_transaction(trans
, root
);
3240 ret
= btrfs_truncate(inode
);
3242 btrfs_orphan_del(NULL
, inode
);
3247 /* this will do delete_inode and everything for us */
3252 /* release the path since we're done with it */
3253 btrfs_release_path(path
);
3255 root
->orphan_cleanup_state
= ORPHAN_CLEANUP_DONE
;
3257 if (root
->orphan_block_rsv
)
3258 btrfs_block_rsv_release(root
, root
->orphan_block_rsv
,
3261 if (root
->orphan_block_rsv
|| root
->orphan_item_inserted
) {
3262 trans
= btrfs_join_transaction(root
);
3264 btrfs_end_transaction(trans
, root
);
3268 btrfs_debug(root
->fs_info
, "unlinked %d orphans", nr_unlink
);
3270 btrfs_debug(root
->fs_info
, "truncated %d orphans", nr_truncate
);
3274 btrfs_crit(root
->fs_info
,
3275 "could not do orphan cleanup %d", ret
);
3276 btrfs_free_path(path
);
3281 * very simple check to peek ahead in the leaf looking for xattrs. If we
3282 * don't find any xattrs, we know there can't be any acls.
3284 * slot is the slot the inode is in, objectid is the objectid of the inode
3286 static noinline
int acls_after_inode_item(struct extent_buffer
*leaf
,
3287 int slot
, u64 objectid
)
3289 u32 nritems
= btrfs_header_nritems(leaf
);
3290 struct btrfs_key found_key
;
3291 static u64 xattr_access
= 0;
3292 static u64 xattr_default
= 0;
3295 if (!xattr_access
) {
3296 xattr_access
= btrfs_name_hash(POSIX_ACL_XATTR_ACCESS
,
3297 strlen(POSIX_ACL_XATTR_ACCESS
));
3298 xattr_default
= btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT
,
3299 strlen(POSIX_ACL_XATTR_DEFAULT
));
3303 while (slot
< nritems
) {
3304 btrfs_item_key_to_cpu(leaf
, &found_key
, slot
);
3306 /* we found a different objectid, there must not be acls */
3307 if (found_key
.objectid
!= objectid
)
3310 /* we found an xattr, assume we've got an acl */
3311 if (found_key
.type
== BTRFS_XATTR_ITEM_KEY
) {
3312 if (found_key
.offset
== xattr_access
||
3313 found_key
.offset
== xattr_default
)
3318 * we found a key greater than an xattr key, there can't
3319 * be any acls later on
3321 if (found_key
.type
> BTRFS_XATTR_ITEM_KEY
)
3328 * it goes inode, inode backrefs, xattrs, extents,
3329 * so if there are a ton of hard links to an inode there can
3330 * be a lot of backrefs. Don't waste time searching too hard,
3331 * this is just an optimization
3336 /* we hit the end of the leaf before we found an xattr or
3337 * something larger than an xattr. We have to assume the inode
3344 * read an inode from the btree into the in-memory inode
3346 static void btrfs_read_locked_inode(struct inode
*inode
)
3348 struct btrfs_path
*path
;
3349 struct extent_buffer
*leaf
;
3350 struct btrfs_inode_item
*inode_item
;
3351 struct btrfs_timespec
*tspec
;
3352 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
3353 struct btrfs_key location
;
3357 bool filled
= false;
3359 ret
= btrfs_fill_inode(inode
, &rdev
);
3363 path
= btrfs_alloc_path();
3367 path
->leave_spinning
= 1;
3368 memcpy(&location
, &BTRFS_I(inode
)->location
, sizeof(location
));
3370 ret
= btrfs_lookup_inode(NULL
, root
, path
, &location
, 0);
3374 leaf
= path
->nodes
[0];
3379 inode_item
= btrfs_item_ptr(leaf
, path
->slots
[0],
3380 struct btrfs_inode_item
);
3381 inode
->i_mode
= btrfs_inode_mode(leaf
, inode_item
);
3382 set_nlink(inode
, btrfs_inode_nlink(leaf
, inode_item
));
3383 i_uid_write(inode
, btrfs_inode_uid(leaf
, inode_item
));
3384 i_gid_write(inode
, btrfs_inode_gid(leaf
, inode_item
));
3385 btrfs_i_size_write(inode
, btrfs_inode_size(leaf
, inode_item
));
3387 tspec
= btrfs_inode_atime(inode_item
);
3388 inode
->i_atime
.tv_sec
= btrfs_timespec_sec(leaf
, tspec
);
3389 inode
->i_atime
.tv_nsec
= btrfs_timespec_nsec(leaf
, tspec
);
3391 tspec
= btrfs_inode_mtime(inode_item
);
3392 inode
->i_mtime
.tv_sec
= btrfs_timespec_sec(leaf
, tspec
);
3393 inode
->i_mtime
.tv_nsec
= btrfs_timespec_nsec(leaf
, tspec
);
3395 tspec
= btrfs_inode_ctime(inode_item
);
3396 inode
->i_ctime
.tv_sec
= btrfs_timespec_sec(leaf
, tspec
);
3397 inode
->i_ctime
.tv_nsec
= btrfs_timespec_nsec(leaf
, tspec
);
3399 inode_set_bytes(inode
, btrfs_inode_nbytes(leaf
, inode_item
));
3400 BTRFS_I(inode
)->generation
= btrfs_inode_generation(leaf
, inode_item
);
3401 BTRFS_I(inode
)->last_trans
= btrfs_inode_transid(leaf
, inode_item
);
3404 * If we were modified in the current generation and evicted from memory
3405 * and then re-read we need to do a full sync since we don't have any
3406 * idea about which extents were modified before we were evicted from
3409 if (BTRFS_I(inode
)->last_trans
== root
->fs_info
->generation
)
3410 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC
,
3411 &BTRFS_I(inode
)->runtime_flags
);
3413 inode
->i_version
= btrfs_inode_sequence(leaf
, inode_item
);
3414 inode
->i_generation
= BTRFS_I(inode
)->generation
;
3416 rdev
= btrfs_inode_rdev(leaf
, inode_item
);
3418 BTRFS_I(inode
)->index_cnt
= (u64
)-1;
3419 BTRFS_I(inode
)->flags
= btrfs_inode_flags(leaf
, inode_item
);
3422 * try to precache a NULL acl entry for files that don't have
3423 * any xattrs or acls
3425 maybe_acls
= acls_after_inode_item(leaf
, path
->slots
[0],
3428 cache_no_acl(inode
);
3430 btrfs_free_path(path
);
3432 switch (inode
->i_mode
& S_IFMT
) {
3434 inode
->i_mapping
->a_ops
= &btrfs_aops
;
3435 inode
->i_mapping
->backing_dev_info
= &root
->fs_info
->bdi
;
3436 BTRFS_I(inode
)->io_tree
.ops
= &btrfs_extent_io_ops
;
3437 inode
->i_fop
= &btrfs_file_operations
;
3438 inode
->i_op
= &btrfs_file_inode_operations
;
3441 inode
->i_fop
= &btrfs_dir_file_operations
;
3442 if (root
== root
->fs_info
->tree_root
)
3443 inode
->i_op
= &btrfs_dir_ro_inode_operations
;
3445 inode
->i_op
= &btrfs_dir_inode_operations
;
3448 inode
->i_op
= &btrfs_symlink_inode_operations
;
3449 inode
->i_mapping
->a_ops
= &btrfs_symlink_aops
;
3450 inode
->i_mapping
->backing_dev_info
= &root
->fs_info
->bdi
;
3453 inode
->i_op
= &btrfs_special_inode_operations
;
3454 init_special_inode(inode
, inode
->i_mode
, rdev
);
3458 btrfs_update_iflags(inode
);
3462 btrfs_free_path(path
);
3463 make_bad_inode(inode
);
3467 * given a leaf and an inode, copy the inode fields into the leaf
3469 static void fill_inode_item(struct btrfs_trans_handle
*trans
,
3470 struct extent_buffer
*leaf
,
3471 struct btrfs_inode_item
*item
,
3472 struct inode
*inode
)
3474 struct btrfs_map_token token
;
3476 btrfs_init_map_token(&token
);
3478 btrfs_set_token_inode_uid(leaf
, item
, i_uid_read(inode
), &token
);
3479 btrfs_set_token_inode_gid(leaf
, item
, i_gid_read(inode
), &token
);
3480 btrfs_set_token_inode_size(leaf
, item
, BTRFS_I(inode
)->disk_i_size
,
3482 btrfs_set_token_inode_mode(leaf
, item
, inode
->i_mode
, &token
);
3483 btrfs_set_token_inode_nlink(leaf
, item
, inode
->i_nlink
, &token
);
3485 btrfs_set_token_timespec_sec(leaf
, btrfs_inode_atime(item
),
3486 inode
->i_atime
.tv_sec
, &token
);
3487 btrfs_set_token_timespec_nsec(leaf
, btrfs_inode_atime(item
),
3488 inode
->i_atime
.tv_nsec
, &token
);
3490 btrfs_set_token_timespec_sec(leaf
, btrfs_inode_mtime(item
),
3491 inode
->i_mtime
.tv_sec
, &token
);
3492 btrfs_set_token_timespec_nsec(leaf
, btrfs_inode_mtime(item
),
3493 inode
->i_mtime
.tv_nsec
, &token
);
3495 btrfs_set_token_timespec_sec(leaf
, btrfs_inode_ctime(item
),
3496 inode
->i_ctime
.tv_sec
, &token
);
3497 btrfs_set_token_timespec_nsec(leaf
, btrfs_inode_ctime(item
),
3498 inode
->i_ctime
.tv_nsec
, &token
);
3500 btrfs_set_token_inode_nbytes(leaf
, item
, inode_get_bytes(inode
),
3502 btrfs_set_token_inode_generation(leaf
, item
, BTRFS_I(inode
)->generation
,
3504 btrfs_set_token_inode_sequence(leaf
, item
, inode
->i_version
, &token
);
3505 btrfs_set_token_inode_transid(leaf
, item
, trans
->transid
, &token
);
3506 btrfs_set_token_inode_rdev(leaf
, item
, inode
->i_rdev
, &token
);
3507 btrfs_set_token_inode_flags(leaf
, item
, BTRFS_I(inode
)->flags
, &token
);
3508 btrfs_set_token_inode_block_group(leaf
, item
, 0, &token
);
3512 * copy everything in the in-memory inode into the btree.
3514 static noinline
int btrfs_update_inode_item(struct btrfs_trans_handle
*trans
,
3515 struct btrfs_root
*root
, struct inode
*inode
)
3517 struct btrfs_inode_item
*inode_item
;
3518 struct btrfs_path
*path
;
3519 struct extent_buffer
*leaf
;
3522 path
= btrfs_alloc_path();
3526 path
->leave_spinning
= 1;
3527 ret
= btrfs_lookup_inode(trans
, root
, path
, &BTRFS_I(inode
)->location
,
3535 btrfs_unlock_up_safe(path
, 1);
3536 leaf
= path
->nodes
[0];
3537 inode_item
= btrfs_item_ptr(leaf
, path
->slots
[0],
3538 struct btrfs_inode_item
);
3540 fill_inode_item(trans
, leaf
, inode_item
, inode
);
3541 btrfs_mark_buffer_dirty(leaf
);
3542 btrfs_set_inode_last_trans(trans
, inode
);
3545 btrfs_free_path(path
);
3550 * copy everything in the in-memory inode into the btree.
3552 noinline
int btrfs_update_inode(struct btrfs_trans_handle
*trans
,
3553 struct btrfs_root
*root
, struct inode
*inode
)
3558 * If the inode is a free space inode, we can deadlock during commit
3559 * if we put it into the delayed code.
3561 * The data relocation inode should also be directly updated
3564 if (!btrfs_is_free_space_inode(inode
)
3565 && root
->root_key
.objectid
!= BTRFS_DATA_RELOC_TREE_OBJECTID
) {
3566 btrfs_update_root_times(trans
, root
);
3568 ret
= btrfs_delayed_update_inode(trans
, root
, inode
);
3570 btrfs_set_inode_last_trans(trans
, inode
);
3574 return btrfs_update_inode_item(trans
, root
, inode
);
3577 noinline
int btrfs_update_inode_fallback(struct btrfs_trans_handle
*trans
,
3578 struct btrfs_root
*root
,
3579 struct inode
*inode
)
3583 ret
= btrfs_update_inode(trans
, root
, inode
);
3585 return btrfs_update_inode_item(trans
, root
, inode
);
3590 * unlink helper that gets used here in inode.c and in the tree logging
3591 * recovery code. It remove a link in a directory with a given name, and
3592 * also drops the back refs in the inode to the directory
3594 static int __btrfs_unlink_inode(struct btrfs_trans_handle
*trans
,
3595 struct btrfs_root
*root
,
3596 struct inode
*dir
, struct inode
*inode
,
3597 const char *name
, int name_len
)
3599 struct btrfs_path
*path
;
3601 struct extent_buffer
*leaf
;
3602 struct btrfs_dir_item
*di
;
3603 struct btrfs_key key
;
3605 u64 ino
= btrfs_ino(inode
);
3606 u64 dir_ino
= btrfs_ino(dir
);
3608 path
= btrfs_alloc_path();
3614 path
->leave_spinning
= 1;
3615 di
= btrfs_lookup_dir_item(trans
, root
, path
, dir_ino
,
3616 name
, name_len
, -1);
3625 leaf
= path
->nodes
[0];
3626 btrfs_dir_item_key_to_cpu(leaf
, di
, &key
);
3627 ret
= btrfs_delete_one_dir_name(trans
, root
, path
, di
);
3630 btrfs_release_path(path
);
3632 ret
= btrfs_del_inode_ref(trans
, root
, name
, name_len
, ino
,
3635 btrfs_info(root
->fs_info
,
3636 "failed to delete reference to %.*s, inode %llu parent %llu",
3638 (unsigned long long)ino
, (unsigned long long)dir_ino
);
3639 btrfs_abort_transaction(trans
, root
, ret
);
3643 ret
= btrfs_delete_delayed_dir_index(trans
, root
, dir
, index
);
3645 btrfs_abort_transaction(trans
, root
, ret
);
3649 ret
= btrfs_del_inode_ref_in_log(trans
, root
, name
, name_len
,
3651 if (ret
!= 0 && ret
!= -ENOENT
) {
3652 btrfs_abort_transaction(trans
, root
, ret
);
3656 ret
= btrfs_del_dir_entries_in_log(trans
, root
, name
, name_len
,
3661 btrfs_abort_transaction(trans
, root
, ret
);
3663 btrfs_free_path(path
);
3667 btrfs_i_size_write(dir
, dir
->i_size
- name_len
* 2);
3668 inode_inc_iversion(inode
);
3669 inode_inc_iversion(dir
);
3670 inode
->i_ctime
= dir
->i_mtime
= dir
->i_ctime
= CURRENT_TIME
;
3671 ret
= btrfs_update_inode(trans
, root
, dir
);
/*
 * Remove a directory entry and drop one link count on the target inode,
 * writing the updated inode item back to the tree on success.
 */
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root,
		       struct inode *dir, struct inode *inode,
		       const char *name, int name_len)
{
	int ret;

	ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
	if (!ret) {
		btrfs_drop_nlink(inode);
		ret = btrfs_update_inode(trans, root, inode);
	}
	return ret;
}
3691 * helper to start transaction for unlink and rmdir.
3693 * unlink and rmdir are special in btrfs, they do not always free space, so
3694 * if we cannot make our reservations the normal way try and see if there is
3695 * plenty of slack room in the global reserve to migrate, otherwise we cannot
3696 * allow the unlink to occur.
3698 static struct btrfs_trans_handle
*__unlink_start_trans(struct inode
*dir
)
3700 struct btrfs_trans_handle
*trans
;
3701 struct btrfs_root
*root
= BTRFS_I(dir
)->root
;
3705 * 1 for the possible orphan item
3706 * 1 for the dir item
3707 * 1 for the dir index
3708 * 1 for the inode ref
3711 trans
= btrfs_start_transaction(root
, 5);
3712 if (!IS_ERR(trans
) || PTR_ERR(trans
) != -ENOSPC
)
3715 if (PTR_ERR(trans
) == -ENOSPC
) {
3716 u64 num_bytes
= btrfs_calc_trans_metadata_size(root
, 5);
3718 trans
= btrfs_start_transaction(root
, 0);
3721 ret
= btrfs_cond_migrate_bytes(root
->fs_info
,
3722 &root
->fs_info
->trans_block_rsv
,
3725 btrfs_end_transaction(trans
, root
);
3726 return ERR_PTR(ret
);
3728 trans
->block_rsv
= &root
->fs_info
->trans_block_rsv
;
3729 trans
->bytes_reserved
= num_bytes
;
3734 static int btrfs_unlink(struct inode
*dir
, struct dentry
*dentry
)
3736 struct btrfs_root
*root
= BTRFS_I(dir
)->root
;
3737 struct btrfs_trans_handle
*trans
;
3738 struct inode
*inode
= dentry
->d_inode
;
3741 trans
= __unlink_start_trans(dir
);
3743 return PTR_ERR(trans
);
3745 btrfs_record_unlink_dir(trans
, dir
, dentry
->d_inode
, 0);
3747 ret
= btrfs_unlink_inode(trans
, root
, dir
, dentry
->d_inode
,
3748 dentry
->d_name
.name
, dentry
->d_name
.len
);
3752 if (inode
->i_nlink
== 0) {
3753 ret
= btrfs_orphan_add(trans
, inode
);
3759 btrfs_end_transaction(trans
, root
);
3760 btrfs_btree_balance_dirty(root
);
3764 int btrfs_unlink_subvol(struct btrfs_trans_handle
*trans
,
3765 struct btrfs_root
*root
,
3766 struct inode
*dir
, u64 objectid
,
3767 const char *name
, int name_len
)
3769 struct btrfs_path
*path
;
3770 struct extent_buffer
*leaf
;
3771 struct btrfs_dir_item
*di
;
3772 struct btrfs_key key
;
3775 u64 dir_ino
= btrfs_ino(dir
);
3777 path
= btrfs_alloc_path();
3781 di
= btrfs_lookup_dir_item(trans
, root
, path
, dir_ino
,
3782 name
, name_len
, -1);
3783 if (IS_ERR_OR_NULL(di
)) {
3791 leaf
= path
->nodes
[0];
3792 btrfs_dir_item_key_to_cpu(leaf
, di
, &key
);
3793 WARN_ON(key
.type
!= BTRFS_ROOT_ITEM_KEY
|| key
.objectid
!= objectid
);
3794 ret
= btrfs_delete_one_dir_name(trans
, root
, path
, di
);
3796 btrfs_abort_transaction(trans
, root
, ret
);
3799 btrfs_release_path(path
);
3801 ret
= btrfs_del_root_ref(trans
, root
->fs_info
->tree_root
,
3802 objectid
, root
->root_key
.objectid
,
3803 dir_ino
, &index
, name
, name_len
);
3805 if (ret
!= -ENOENT
) {
3806 btrfs_abort_transaction(trans
, root
, ret
);
3809 di
= btrfs_search_dir_index_item(root
, path
, dir_ino
,
3811 if (IS_ERR_OR_NULL(di
)) {
3816 btrfs_abort_transaction(trans
, root
, ret
);
3820 leaf
= path
->nodes
[0];
3821 btrfs_item_key_to_cpu(leaf
, &key
, path
->slots
[0]);
3822 btrfs_release_path(path
);
3825 btrfs_release_path(path
);
3827 ret
= btrfs_delete_delayed_dir_index(trans
, root
, dir
, index
);
3829 btrfs_abort_transaction(trans
, root
, ret
);
3833 btrfs_i_size_write(dir
, dir
->i_size
- name_len
* 2);
3834 inode_inc_iversion(dir
);
3835 dir
->i_mtime
= dir
->i_ctime
= CURRENT_TIME
;
3836 ret
= btrfs_update_inode_fallback(trans
, root
, dir
);
3838 btrfs_abort_transaction(trans
, root
, ret
);
3840 btrfs_free_path(path
);
3844 static int btrfs_rmdir(struct inode
*dir
, struct dentry
*dentry
)
3846 struct inode
*inode
= dentry
->d_inode
;
3848 struct btrfs_root
*root
= BTRFS_I(dir
)->root
;
3849 struct btrfs_trans_handle
*trans
;
3851 if (inode
->i_size
> BTRFS_EMPTY_DIR_SIZE
)
3853 if (btrfs_ino(inode
) == BTRFS_FIRST_FREE_OBJECTID
)
3856 trans
= __unlink_start_trans(dir
);
3858 return PTR_ERR(trans
);
3860 if (unlikely(btrfs_ino(inode
) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID
)) {
3861 err
= btrfs_unlink_subvol(trans
, root
, dir
,
3862 BTRFS_I(inode
)->location
.objectid
,
3863 dentry
->d_name
.name
,
3864 dentry
->d_name
.len
);
3868 err
= btrfs_orphan_add(trans
, inode
);
3872 /* now the directory is empty */
3873 err
= btrfs_unlink_inode(trans
, root
, dir
, dentry
->d_inode
,
3874 dentry
->d_name
.name
, dentry
->d_name
.len
);
3876 btrfs_i_size_write(inode
, 0);
3878 btrfs_end_transaction(trans
, root
);
3879 btrfs_btree_balance_dirty(root
);
3885 * this can truncate away extent items, csum items and directory items.
3886 * It starts at a high offset and removes keys until it can't find
3887 * any higher than new_size
3889 * csum items that cross the new i_size are truncated to the new size
3892 * min_type is the minimum key type to truncate down to. If set to 0, this
3893 * will kill all the items on this inode, including the INODE_ITEM_KEY.
3895 int btrfs_truncate_inode_items(struct btrfs_trans_handle
*trans
,
3896 struct btrfs_root
*root
,
3897 struct inode
*inode
,
3898 u64 new_size
, u32 min_type
)
3900 struct btrfs_path
*path
;
3901 struct extent_buffer
*leaf
;
3902 struct btrfs_file_extent_item
*fi
;
3903 struct btrfs_key key
;
3904 struct btrfs_key found_key
;
3905 u64 extent_start
= 0;
3906 u64 extent_num_bytes
= 0;
3907 u64 extent_offset
= 0;
3909 u32 found_type
= (u8
)-1;
3912 int pending_del_nr
= 0;
3913 int pending_del_slot
= 0;
3914 int extent_type
= -1;
3917 u64 ino
= btrfs_ino(inode
);
3919 BUG_ON(new_size
> 0 && min_type
!= BTRFS_EXTENT_DATA_KEY
);
3921 path
= btrfs_alloc_path();
3927 * We want to drop from the next block forward in case this new size is
3928 * not block aligned since we will be keeping the last block of the
3929 * extent just the way it is.
3931 if (root
->ref_cows
|| root
== root
->fs_info
->tree_root
)
3932 btrfs_drop_extent_cache(inode
, ALIGN(new_size
,
3933 root
->sectorsize
), (u64
)-1, 0);
3936 * This function is also used to drop the items in the log tree before
3937 * we relog the inode, so if root != BTRFS_I(inode)->root, it means
3938 * it is used to drop the loged items. So we shouldn't kill the delayed
3941 if (min_type
== 0 && root
== BTRFS_I(inode
)->root
)
3942 btrfs_kill_delayed_inode_items(inode
);
3945 key
.offset
= (u64
)-1;
3949 path
->leave_spinning
= 1;
3950 ret
= btrfs_search_slot(trans
, root
, &key
, path
, -1, 1);
3957 /* there are no items in the tree for us to truncate, we're
3960 if (path
->slots
[0] == 0)
3967 leaf
= path
->nodes
[0];
3968 btrfs_item_key_to_cpu(leaf
, &found_key
, path
->slots
[0]);
3969 found_type
= btrfs_key_type(&found_key
);
3971 if (found_key
.objectid
!= ino
)
3974 if (found_type
< min_type
)
3977 item_end
= found_key
.offset
;
3978 if (found_type
== BTRFS_EXTENT_DATA_KEY
) {
3979 fi
= btrfs_item_ptr(leaf
, path
->slots
[0],
3980 struct btrfs_file_extent_item
);
3981 extent_type
= btrfs_file_extent_type(leaf
, fi
);
3982 if (extent_type
!= BTRFS_FILE_EXTENT_INLINE
) {
3984 btrfs_file_extent_num_bytes(leaf
, fi
);
3985 } else if (extent_type
== BTRFS_FILE_EXTENT_INLINE
) {
3986 item_end
+= btrfs_file_extent_inline_len(leaf
,
3991 if (found_type
> min_type
) {
3994 if (item_end
< new_size
)
3996 if (found_key
.offset
>= new_size
)
4002 /* FIXME, shrink the extent if the ref count is only 1 */
4003 if (found_type
!= BTRFS_EXTENT_DATA_KEY
)
4006 if (extent_type
!= BTRFS_FILE_EXTENT_INLINE
) {
4008 extent_start
= btrfs_file_extent_disk_bytenr(leaf
, fi
);
4010 u64 orig_num_bytes
=
4011 btrfs_file_extent_num_bytes(leaf
, fi
);
4012 extent_num_bytes
= ALIGN(new_size
-
4015 btrfs_set_file_extent_num_bytes(leaf
, fi
,
4017 num_dec
= (orig_num_bytes
-
4019 if (root
->ref_cows
&& extent_start
!= 0)
4020 inode_sub_bytes(inode
, num_dec
);
4021 btrfs_mark_buffer_dirty(leaf
);
4024 btrfs_file_extent_disk_num_bytes(leaf
,
4026 extent_offset
= found_key
.offset
-
4027 btrfs_file_extent_offset(leaf
, fi
);
4029 /* FIXME blocksize != 4096 */
4030 num_dec
= btrfs_file_extent_num_bytes(leaf
, fi
);
4031 if (extent_start
!= 0) {
4034 inode_sub_bytes(inode
, num_dec
);
4037 } else if (extent_type
== BTRFS_FILE_EXTENT_INLINE
) {
4039 * we can't truncate inline items that have had
4043 btrfs_file_extent_compression(leaf
, fi
) == 0 &&
4044 btrfs_file_extent_encryption(leaf
, fi
) == 0 &&
4045 btrfs_file_extent_other_encoding(leaf
, fi
) == 0) {
4046 u32 size
= new_size
- found_key
.offset
;
4048 if (root
->ref_cows
) {
4049 inode_sub_bytes(inode
, item_end
+ 1 -
4053 btrfs_file_extent_calc_inline_size(size
);
4054 btrfs_truncate_item(root
, path
, size
, 1);
4055 } else if (root
->ref_cows
) {
4056 inode_sub_bytes(inode
, item_end
+ 1 -
4062 if (!pending_del_nr
) {
4063 /* no pending yet, add ourselves */
4064 pending_del_slot
= path
->slots
[0];
4066 } else if (pending_del_nr
&&
4067 path
->slots
[0] + 1 == pending_del_slot
) {
4068 /* hop on the pending chunk */
4070 pending_del_slot
= path
->slots
[0];
4077 if (found_extent
&& (root
->ref_cows
||
4078 root
== root
->fs_info
->tree_root
)) {
4079 btrfs_set_path_blocking(path
);
4080 ret
= btrfs_free_extent(trans
, root
, extent_start
,
4081 extent_num_bytes
, 0,
4082 btrfs_header_owner(leaf
),
4083 ino
, extent_offset
, 0);
4087 if (found_type
== BTRFS_INODE_ITEM_KEY
)
4090 if (path
->slots
[0] == 0 ||
4091 path
->slots
[0] != pending_del_slot
) {
4092 if (pending_del_nr
) {
4093 ret
= btrfs_del_items(trans
, root
, path
,
4097 btrfs_abort_transaction(trans
,
4103 btrfs_release_path(path
);
4110 if (pending_del_nr
) {
4111 ret
= btrfs_del_items(trans
, root
, path
, pending_del_slot
,
4114 btrfs_abort_transaction(trans
, root
, ret
);
4117 btrfs_free_path(path
);
4122 * btrfs_truncate_page - read, zero a chunk and write a page
4123 * @inode - inode that we're zeroing
4124 * @from - the offset to start zeroing
4125 * @len - the length to zero, 0 to zero the entire range respective to the
4127 * @front - zero up to the offset instead of from the offset on
4129 * This will find the page for the "from" offset and cow the page and zero the
4130 * part we want to zero. This is used with truncate and hole punching.
4132 int btrfs_truncate_page(struct inode
*inode
, loff_t from
, loff_t len
,
4135 struct address_space
*mapping
= inode
->i_mapping
;
4136 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
4137 struct extent_io_tree
*io_tree
= &BTRFS_I(inode
)->io_tree
;
4138 struct btrfs_ordered_extent
*ordered
;
4139 struct extent_state
*cached_state
= NULL
;
4141 u32 blocksize
= root
->sectorsize
;
4142 pgoff_t index
= from
>> PAGE_CACHE_SHIFT
;
4143 unsigned offset
= from
& (PAGE_CACHE_SIZE
-1);
4145 gfp_t mask
= btrfs_alloc_write_mask(mapping
);
4150 if ((offset
& (blocksize
- 1)) == 0 &&
4151 (!len
|| ((len
& (blocksize
- 1)) == 0)))
4153 ret
= btrfs_delalloc_reserve_space(inode
, PAGE_CACHE_SIZE
);
4158 page
= find_or_create_page(mapping
, index
, mask
);
4160 btrfs_delalloc_release_space(inode
, PAGE_CACHE_SIZE
);
4165 page_start
= page_offset(page
);
4166 page_end
= page_start
+ PAGE_CACHE_SIZE
- 1;
4168 if (!PageUptodate(page
)) {
4169 ret
= btrfs_readpage(NULL
, page
);
4171 if (page
->mapping
!= mapping
) {
4173 page_cache_release(page
);
4176 if (!PageUptodate(page
)) {
4181 wait_on_page_writeback(page
);
4183 lock_extent_bits(io_tree
, page_start
, page_end
, 0, &cached_state
);
4184 set_page_extent_mapped(page
);
4186 ordered
= btrfs_lookup_ordered_extent(inode
, page_start
);
4188 unlock_extent_cached(io_tree
, page_start
, page_end
,
4189 &cached_state
, GFP_NOFS
);
4191 page_cache_release(page
);
4192 btrfs_start_ordered_extent(inode
, ordered
, 1);
4193 btrfs_put_ordered_extent(ordered
);
4197 clear_extent_bit(&BTRFS_I(inode
)->io_tree
, page_start
, page_end
,
4198 EXTENT_DIRTY
| EXTENT_DELALLOC
|
4199 EXTENT_DO_ACCOUNTING
| EXTENT_DEFRAG
,
4200 0, 0, &cached_state
, GFP_NOFS
);
4202 ret
= btrfs_set_extent_delalloc(inode
, page_start
, page_end
,
4205 unlock_extent_cached(io_tree
, page_start
, page_end
,
4206 &cached_state
, GFP_NOFS
);
4210 if (offset
!= PAGE_CACHE_SIZE
) {
4212 len
= PAGE_CACHE_SIZE
- offset
;
4215 memset(kaddr
, 0, offset
);
4217 memset(kaddr
+ offset
, 0, len
);
4218 flush_dcache_page(page
);
4221 ClearPageChecked(page
);
4222 set_page_dirty(page
);
4223 unlock_extent_cached(io_tree
, page_start
, page_end
, &cached_state
,
4228 btrfs_delalloc_release_space(inode
, PAGE_CACHE_SIZE
);
4230 page_cache_release(page
);
4236 * This function puts in dummy file extents for the area we're creating a hole
4237 * for. So if we are truncating this file to a larger size we need to insert
4238 * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for
4239 * the range between oldsize and size
4241 int btrfs_cont_expand(struct inode
*inode
, loff_t oldsize
, loff_t size
)
4243 struct btrfs_trans_handle
*trans
;
4244 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
4245 struct extent_io_tree
*io_tree
= &BTRFS_I(inode
)->io_tree
;
4246 struct extent_map
*em
= NULL
;
4247 struct extent_state
*cached_state
= NULL
;
4248 struct extent_map_tree
*em_tree
= &BTRFS_I(inode
)->extent_tree
;
4249 u64 hole_start
= ALIGN(oldsize
, root
->sectorsize
);
4250 u64 block_end
= ALIGN(size
, root
->sectorsize
);
4257 * If our size started in the middle of a page we need to zero out the
4258 * rest of the page before we expand the i_size, otherwise we could
4259 * expose stale data.
4261 err
= btrfs_truncate_page(inode
, oldsize
, 0, 0);
4265 if (size
<= hole_start
)
4269 struct btrfs_ordered_extent
*ordered
;
4270 btrfs_wait_ordered_range(inode
, hole_start
,
4271 block_end
- hole_start
);
4272 lock_extent_bits(io_tree
, hole_start
, block_end
- 1, 0,
4274 ordered
= btrfs_lookup_ordered_extent(inode
, hole_start
);
4277 unlock_extent_cached(io_tree
, hole_start
, block_end
- 1,
4278 &cached_state
, GFP_NOFS
);
4279 btrfs_put_ordered_extent(ordered
);
4282 cur_offset
= hole_start
;
4284 em
= btrfs_get_extent(inode
, NULL
, 0, cur_offset
,
4285 block_end
- cur_offset
, 0);
4291 last_byte
= min(extent_map_end(em
), block_end
);
4292 last_byte
= ALIGN(last_byte
, root
->sectorsize
);
4293 if (!test_bit(EXTENT_FLAG_PREALLOC
, &em
->flags
)) {
4294 struct extent_map
*hole_em
;
4295 hole_size
= last_byte
- cur_offset
;
4297 trans
= btrfs_start_transaction(root
, 3);
4298 if (IS_ERR(trans
)) {
4299 err
= PTR_ERR(trans
);
4303 err
= btrfs_drop_extents(trans
, root
, inode
,
4305 cur_offset
+ hole_size
, 1);
4307 btrfs_abort_transaction(trans
, root
, err
);
4308 btrfs_end_transaction(trans
, root
);
4312 err
= btrfs_insert_file_extent(trans
, root
,
4313 btrfs_ino(inode
), cur_offset
, 0,
4314 0, hole_size
, 0, hole_size
,
4317 btrfs_abort_transaction(trans
, root
, err
);
4318 btrfs_end_transaction(trans
, root
);
4322 btrfs_drop_extent_cache(inode
, cur_offset
,
4323 cur_offset
+ hole_size
- 1, 0);
4324 hole_em
= alloc_extent_map();
4326 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC
,
4327 &BTRFS_I(inode
)->runtime_flags
);
4330 hole_em
->start
= cur_offset
;
4331 hole_em
->len
= hole_size
;
4332 hole_em
->orig_start
= cur_offset
;
4334 hole_em
->block_start
= EXTENT_MAP_HOLE
;
4335 hole_em
->block_len
= 0;
4336 hole_em
->orig_block_len
= 0;
4337 hole_em
->ram_bytes
= hole_size
;
4338 hole_em
->bdev
= root
->fs_info
->fs_devices
->latest_bdev
;
4339 hole_em
->compress_type
= BTRFS_COMPRESS_NONE
;
4340 hole_em
->generation
= trans
->transid
;
4343 write_lock(&em_tree
->lock
);
4344 err
= add_extent_mapping(em_tree
, hole_em
, 1);
4345 write_unlock(&em_tree
->lock
);
4348 btrfs_drop_extent_cache(inode
, cur_offset
,
4352 free_extent_map(hole_em
);
4354 btrfs_update_inode(trans
, root
, inode
);
4355 btrfs_end_transaction(trans
, root
);
4357 free_extent_map(em
);
4359 cur_offset
= last_byte
;
4360 if (cur_offset
>= block_end
)
4364 free_extent_map(em
);
4365 unlock_extent_cached(io_tree
, hole_start
, block_end
- 1, &cached_state
,
4370 static int btrfs_setsize(struct inode
*inode
, struct iattr
*attr
)
4372 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
4373 struct btrfs_trans_handle
*trans
;
4374 loff_t oldsize
= i_size_read(inode
);
4375 loff_t newsize
= attr
->ia_size
;
4376 int mask
= attr
->ia_valid
;
4380 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
4381 * special case where we need to update the times despite not having
4382 * these flags set. For all other operations the VFS set these flags
4383 * explicitly if it wants a timestamp update.
4385 if (newsize
!= oldsize
&& (!(mask
& (ATTR_CTIME
| ATTR_MTIME
))))
4386 inode
->i_ctime
= inode
->i_mtime
= current_fs_time(inode
->i_sb
);
4388 if (newsize
> oldsize
) {
4389 truncate_pagecache(inode
, oldsize
, newsize
);
4390 ret
= btrfs_cont_expand(inode
, oldsize
, newsize
);
4394 trans
= btrfs_start_transaction(root
, 1);
4396 return PTR_ERR(trans
);
4398 i_size_write(inode
, newsize
);
4399 btrfs_ordered_update_i_size(inode
, i_size_read(inode
), NULL
);
4400 ret
= btrfs_update_inode(trans
, root
, inode
);
4401 btrfs_end_transaction(trans
, root
);
4405 * We're truncating a file that used to have good data down to
4406 * zero. Make sure it gets into the ordered flush list so that
4407 * any new writes get down to disk quickly.
4410 set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE
,
4411 &BTRFS_I(inode
)->runtime_flags
);
4414 * 1 for the orphan item we're going to add
4415 * 1 for the orphan item deletion.
4417 trans
= btrfs_start_transaction(root
, 2);
4419 return PTR_ERR(trans
);
4422 * We need to do this in case we fail at _any_ point during the
4423 * actual truncate. Once we do the truncate_setsize we could
4424 * invalidate pages which forces any outstanding ordered io to
4425 * be instantly completed which will give us extents that need
4426 * to be truncated. If we fail to get an orphan inode down we
4427 * could have left over extents that were never meant to live,
4428 * so we need to garuntee from this point on that everything
4429 * will be consistent.
4431 ret
= btrfs_orphan_add(trans
, inode
);
4432 btrfs_end_transaction(trans
, root
);
4436 /* we don't support swapfiles, so vmtruncate shouldn't fail */
4437 truncate_setsize(inode
, newsize
);
4439 /* Disable nonlocked read DIO to avoid the end less truncate */
4440 btrfs_inode_block_unlocked_dio(inode
);
4441 inode_dio_wait(inode
);
4442 btrfs_inode_resume_unlocked_dio(inode
);
4444 ret
= btrfs_truncate(inode
);
4445 if (ret
&& inode
->i_nlink
)
4446 btrfs_orphan_del(NULL
, inode
);
4452 static int btrfs_setattr(struct dentry
*dentry
, struct iattr
*attr
)
4454 struct inode
*inode
= dentry
->d_inode
;
4455 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
4458 if (btrfs_root_readonly(root
))
4461 err
= inode_change_ok(inode
, attr
);
4465 if (S_ISREG(inode
->i_mode
) && (attr
->ia_valid
& ATTR_SIZE
)) {
4466 err
= btrfs_setsize(inode
, attr
);
4471 if (attr
->ia_valid
) {
4472 setattr_copy(inode
, attr
);
4473 inode_inc_iversion(inode
);
4474 err
= btrfs_dirty_inode(inode
);
4476 if (!err
&& attr
->ia_valid
& ATTR_MODE
)
4477 err
= btrfs_acl_chmod(inode
);
4483 void btrfs_evict_inode(struct inode
*inode
)
4485 struct btrfs_trans_handle
*trans
;
4486 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
4487 struct btrfs_block_rsv
*rsv
, *global_rsv
;
4488 u64 min_size
= btrfs_calc_trunc_metadata_size(root
, 1);
4491 trace_btrfs_inode_evict(inode
);
4493 truncate_inode_pages(&inode
->i_data
, 0);
4494 if (inode
->i_nlink
&& (btrfs_root_refs(&root
->root_item
) != 0 ||
4495 btrfs_is_free_space_inode(inode
)))
4498 if (is_bad_inode(inode
)) {
4499 btrfs_orphan_del(NULL
, inode
);
4502 /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
4503 btrfs_wait_ordered_range(inode
, 0, (u64
)-1);
4505 if (root
->fs_info
->log_root_recovering
) {
4506 BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM
,
4507 &BTRFS_I(inode
)->runtime_flags
));
4511 if (inode
->i_nlink
> 0) {
4512 BUG_ON(btrfs_root_refs(&root
->root_item
) != 0);
4516 ret
= btrfs_commit_inode_delayed_inode(inode
);
4518 btrfs_orphan_del(NULL
, inode
);
4522 rsv
= btrfs_alloc_block_rsv(root
, BTRFS_BLOCK_RSV_TEMP
);
4524 btrfs_orphan_del(NULL
, inode
);
4527 rsv
->size
= min_size
;
4529 global_rsv
= &root
->fs_info
->global_block_rsv
;
4531 btrfs_i_size_write(inode
, 0);
4534 * This is a bit simpler than btrfs_truncate since we've already
4535 * reserved our space for our orphan item in the unlink, so we just
4536 * need to reserve some slack space in case we add bytes and update
4537 * inode item when doing the truncate.
4540 ret
= btrfs_block_rsv_refill(root
, rsv
, min_size
,
4541 BTRFS_RESERVE_FLUSH_LIMIT
);
4544 * Try and steal from the global reserve since we will
4545 * likely not use this space anyway, we want to try as
4546 * hard as possible to get this to work.
4549 ret
= btrfs_block_rsv_migrate(global_rsv
, rsv
, min_size
);
4552 btrfs_warn(root
->fs_info
,
4553 "Could not get space for a delete, will truncate on mount %d",
4555 btrfs_orphan_del(NULL
, inode
);
4556 btrfs_free_block_rsv(root
, rsv
);
4560 trans
= btrfs_join_transaction(root
);
4561 if (IS_ERR(trans
)) {
4562 btrfs_orphan_del(NULL
, inode
);
4563 btrfs_free_block_rsv(root
, rsv
);
4567 trans
->block_rsv
= rsv
;
4569 ret
= btrfs_truncate_inode_items(trans
, root
, inode
, 0, 0);
4573 trans
->block_rsv
= &root
->fs_info
->trans_block_rsv
;
4574 btrfs_end_transaction(trans
, root
);
4576 btrfs_btree_balance_dirty(root
);
4579 btrfs_free_block_rsv(root
, rsv
);
4582 * Errors here aren't a big deal, it just means we leave orphan items
4583 * in the tree. They will be cleaned up on the next mount.
4586 trans
->block_rsv
= root
->orphan_block_rsv
;
4587 btrfs_orphan_del(trans
, inode
);
4589 btrfs_orphan_del(NULL
, inode
);
4592 trans
->block_rsv
= &root
->fs_info
->trans_block_rsv
;
4593 if (!(root
== root
->fs_info
->tree_root
||
4594 root
->root_key
.objectid
== BTRFS_TREE_RELOC_OBJECTID
))
4595 btrfs_return_ino(root
, btrfs_ino(inode
));
4597 btrfs_end_transaction(trans
, root
);
4598 btrfs_btree_balance_dirty(root
);
4600 btrfs_remove_delayed_node(inode
);
4606 * this returns the key found in the dir entry in the location pointer.
4607 * If no dir entries were found, location->objectid is 0.
4609 static int btrfs_inode_by_name(struct inode
*dir
, struct dentry
*dentry
,
4610 struct btrfs_key
*location
)
4612 const char *name
= dentry
->d_name
.name
;
4613 int namelen
= dentry
->d_name
.len
;
4614 struct btrfs_dir_item
*di
;
4615 struct btrfs_path
*path
;
4616 struct btrfs_root
*root
= BTRFS_I(dir
)->root
;
4619 path
= btrfs_alloc_path();
4623 di
= btrfs_lookup_dir_item(NULL
, root
, path
, btrfs_ino(dir
), name
,
4628 if (IS_ERR_OR_NULL(di
))
4631 btrfs_dir_item_key_to_cpu(path
->nodes
[0], di
, location
);
4633 btrfs_free_path(path
);
4636 location
->objectid
= 0;
4641 * when we hit a tree root in a directory, the btrfs part of the inode
4642 * needs to be changed to reflect the root directory of the tree root. This
4643 * is kind of like crossing a mount point.
4645 static int fixup_tree_root_location(struct btrfs_root
*root
,
4647 struct dentry
*dentry
,
4648 struct btrfs_key
*location
,
4649 struct btrfs_root
**sub_root
)
4651 struct btrfs_path
*path
;
4652 struct btrfs_root
*new_root
;
4653 struct btrfs_root_ref
*ref
;
4654 struct extent_buffer
*leaf
;
4658 path
= btrfs_alloc_path();
4665 ret
= btrfs_find_root_ref(root
->fs_info
->tree_root
, path
,
4666 BTRFS_I(dir
)->root
->root_key
.objectid
,
4667 location
->objectid
);
4674 leaf
= path
->nodes
[0];
4675 ref
= btrfs_item_ptr(leaf
, path
->slots
[0], struct btrfs_root_ref
);
4676 if (btrfs_root_ref_dirid(leaf
, ref
) != btrfs_ino(dir
) ||
4677 btrfs_root_ref_name_len(leaf
, ref
) != dentry
->d_name
.len
)
4680 ret
= memcmp_extent_buffer(leaf
, dentry
->d_name
.name
,
4681 (unsigned long)(ref
+ 1),
4682 dentry
->d_name
.len
);
4686 btrfs_release_path(path
);
4688 new_root
= btrfs_read_fs_root_no_name(root
->fs_info
, location
);
4689 if (IS_ERR(new_root
)) {
4690 err
= PTR_ERR(new_root
);
4694 *sub_root
= new_root
;
4695 location
->objectid
= btrfs_root_dirid(&new_root
->root_item
);
4696 location
->type
= BTRFS_INODE_ITEM_KEY
;
4697 location
->offset
= 0;
4700 btrfs_free_path(path
);
4704 static void inode_tree_add(struct inode
*inode
)
4706 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
4707 struct btrfs_inode
*entry
;
4709 struct rb_node
*parent
;
4710 u64 ino
= btrfs_ino(inode
);
4712 if (inode_unhashed(inode
))
4716 spin_lock(&root
->inode_lock
);
4717 p
= &root
->inode_tree
.rb_node
;
4720 entry
= rb_entry(parent
, struct btrfs_inode
, rb_node
);
4722 if (ino
< btrfs_ino(&entry
->vfs_inode
))
4723 p
= &parent
->rb_left
;
4724 else if (ino
> btrfs_ino(&entry
->vfs_inode
))
4725 p
= &parent
->rb_right
;
4727 WARN_ON(!(entry
->vfs_inode
.i_state
&
4728 (I_WILL_FREE
| I_FREEING
)));
4729 rb_erase(parent
, &root
->inode_tree
);
4730 RB_CLEAR_NODE(parent
);
4731 spin_unlock(&root
->inode_lock
);
4735 rb_link_node(&BTRFS_I(inode
)->rb_node
, parent
, p
);
4736 rb_insert_color(&BTRFS_I(inode
)->rb_node
, &root
->inode_tree
);
4737 spin_unlock(&root
->inode_lock
);
4740 static void inode_tree_del(struct inode
*inode
)
4742 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
4745 spin_lock(&root
->inode_lock
);
4746 if (!RB_EMPTY_NODE(&BTRFS_I(inode
)->rb_node
)) {
4747 rb_erase(&BTRFS_I(inode
)->rb_node
, &root
->inode_tree
);
4748 RB_CLEAR_NODE(&BTRFS_I(inode
)->rb_node
);
4749 empty
= RB_EMPTY_ROOT(&root
->inode_tree
);
4751 spin_unlock(&root
->inode_lock
);
4754 * Free space cache has inodes in the tree root, but the tree root has a
4755 * root_refs of 0, so this could end up dropping the tree root as a
4756 * snapshot, so we need the extra !root->fs_info->tree_root check to
4757 * make sure we don't drop it.
4759 if (empty
&& btrfs_root_refs(&root
->root_item
) == 0 &&
4760 root
!= root
->fs_info
->tree_root
) {
4761 synchronize_srcu(&root
->fs_info
->subvol_srcu
);
4762 spin_lock(&root
->inode_lock
);
4763 empty
= RB_EMPTY_ROOT(&root
->inode_tree
);
4764 spin_unlock(&root
->inode_lock
);
4766 btrfs_add_dead_root(root
);
4770 void btrfs_invalidate_inodes(struct btrfs_root
*root
)
4772 struct rb_node
*node
;
4773 struct rb_node
*prev
;
4774 struct btrfs_inode
*entry
;
4775 struct inode
*inode
;
4778 WARN_ON(btrfs_root_refs(&root
->root_item
) != 0);
4780 spin_lock(&root
->inode_lock
);
4782 node
= root
->inode_tree
.rb_node
;
4786 entry
= rb_entry(node
, struct btrfs_inode
, rb_node
);
4788 if (objectid
< btrfs_ino(&entry
->vfs_inode
))
4789 node
= node
->rb_left
;
4790 else if (objectid
> btrfs_ino(&entry
->vfs_inode
))
4791 node
= node
->rb_right
;
4797 entry
= rb_entry(prev
, struct btrfs_inode
, rb_node
);
4798 if (objectid
<= btrfs_ino(&entry
->vfs_inode
)) {
4802 prev
= rb_next(prev
);
4806 entry
= rb_entry(node
, struct btrfs_inode
, rb_node
);
4807 objectid
= btrfs_ino(&entry
->vfs_inode
) + 1;
4808 inode
= igrab(&entry
->vfs_inode
);
4810 spin_unlock(&root
->inode_lock
);
4811 if (atomic_read(&inode
->i_count
) > 1)
4812 d_prune_aliases(inode
);
4814 * btrfs_drop_inode will have it removed from
4815 * the inode cache when its usage count
4820 spin_lock(&root
->inode_lock
);
4824 if (cond_resched_lock(&root
->inode_lock
))
4827 node
= rb_next(node
);
4829 spin_unlock(&root
->inode_lock
);
4832 static int btrfs_init_locked_inode(struct inode
*inode
, void *p
)
4834 struct btrfs_iget_args
*args
= p
;
4835 inode
->i_ino
= args
->ino
;
4836 BTRFS_I(inode
)->root
= args
->root
;
4840 static int btrfs_find_actor(struct inode
*inode
, void *opaque
)
4842 struct btrfs_iget_args
*args
= opaque
;
4843 return args
->ino
== btrfs_ino(inode
) &&
4844 args
->root
== BTRFS_I(inode
)->root
;
4847 static struct inode
*btrfs_iget_locked(struct super_block
*s
,
4849 struct btrfs_root
*root
)
4851 struct inode
*inode
;
4852 struct btrfs_iget_args args
;
4853 args
.ino
= objectid
;
4856 inode
= iget5_locked(s
, objectid
, btrfs_find_actor
,
4857 btrfs_init_locked_inode
,
4862 /* Get an inode object given its location and corresponding root.
4863 * Returns in *is_new if the inode was read from disk
4865 struct inode
*btrfs_iget(struct super_block
*s
, struct btrfs_key
*location
,
4866 struct btrfs_root
*root
, int *new)
4868 struct inode
*inode
;
4870 inode
= btrfs_iget_locked(s
, location
->objectid
, root
);
4872 return ERR_PTR(-ENOMEM
);
4874 if (inode
->i_state
& I_NEW
) {
4875 BTRFS_I(inode
)->root
= root
;
4876 memcpy(&BTRFS_I(inode
)->location
, location
, sizeof(*location
));
4877 btrfs_read_locked_inode(inode
);
4878 if (!is_bad_inode(inode
)) {
4879 inode_tree_add(inode
);
4880 unlock_new_inode(inode
);
4884 unlock_new_inode(inode
);
4886 inode
= ERR_PTR(-ESTALE
);
4893 static struct inode
*new_simple_dir(struct super_block
*s
,
4894 struct btrfs_key
*key
,
4895 struct btrfs_root
*root
)
4897 struct inode
*inode
= new_inode(s
);
4900 return ERR_PTR(-ENOMEM
);
4902 BTRFS_I(inode
)->root
= root
;
4903 memcpy(&BTRFS_I(inode
)->location
, key
, sizeof(*key
));
4904 set_bit(BTRFS_INODE_DUMMY
, &BTRFS_I(inode
)->runtime_flags
);
4906 inode
->i_ino
= BTRFS_EMPTY_SUBVOL_DIR_OBJECTID
;
4907 inode
->i_op
= &btrfs_dir_ro_inode_operations
;
4908 inode
->i_fop
= &simple_dir_operations
;
4909 inode
->i_mode
= S_IFDIR
| S_IRUGO
| S_IWUSR
| S_IXUGO
;
4910 inode
->i_mtime
= inode
->i_atime
= inode
->i_ctime
= CURRENT_TIME
;
4915 struct inode
*btrfs_lookup_dentry(struct inode
*dir
, struct dentry
*dentry
)
4917 struct inode
*inode
;
4918 struct btrfs_root
*root
= BTRFS_I(dir
)->root
;
4919 struct btrfs_root
*sub_root
= root
;
4920 struct btrfs_key location
;
4924 if (dentry
->d_name
.len
> BTRFS_NAME_LEN
)
4925 return ERR_PTR(-ENAMETOOLONG
);
4927 ret
= btrfs_inode_by_name(dir
, dentry
, &location
);
4929 return ERR_PTR(ret
);
4931 if (location
.objectid
== 0)
4934 if (location
.type
== BTRFS_INODE_ITEM_KEY
) {
4935 inode
= btrfs_iget(dir
->i_sb
, &location
, root
, NULL
);
4939 BUG_ON(location
.type
!= BTRFS_ROOT_ITEM_KEY
);
4941 index
= srcu_read_lock(&root
->fs_info
->subvol_srcu
);
4942 ret
= fixup_tree_root_location(root
, dir
, dentry
,
4943 &location
, &sub_root
);
4946 inode
= ERR_PTR(ret
);
4948 inode
= new_simple_dir(dir
->i_sb
, &location
, sub_root
);
4950 inode
= btrfs_iget(dir
->i_sb
, &location
, sub_root
, NULL
);
4952 srcu_read_unlock(&root
->fs_info
->subvol_srcu
, index
);
4954 if (!IS_ERR(inode
) && root
!= sub_root
) {
4955 down_read(&root
->fs_info
->cleanup_work_sem
);
4956 if (!(inode
->i_sb
->s_flags
& MS_RDONLY
))
4957 ret
= btrfs_orphan_cleanup(sub_root
);
4958 up_read(&root
->fs_info
->cleanup_work_sem
);
4961 inode
= ERR_PTR(ret
);
4968 static int btrfs_dentry_delete(const struct dentry
*dentry
)
4970 struct btrfs_root
*root
;
4971 struct inode
*inode
= dentry
->d_inode
;
4973 if (!inode
&& !IS_ROOT(dentry
))
4974 inode
= dentry
->d_parent
->d_inode
;
4977 root
= BTRFS_I(inode
)->root
;
4978 if (btrfs_root_refs(&root
->root_item
) == 0)
4981 if (btrfs_ino(inode
) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID
)
4987 static void btrfs_dentry_release(struct dentry
*dentry
)
4989 if (dentry
->d_fsdata
)
4990 kfree(dentry
->d_fsdata
);
/*
 * VFS ->lookup(): resolve the name via btrfs_lookup_dentry() and
 * splice the result into the dcache.
 * NOTE(review): the trailing lookup-flags parameter was elided in the
 * excerpt; it matches the VFS ->lookup() prototype and is unused here.
 */
static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
				   unsigned int flags)
{
	struct dentry *ret;

	ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry);
	return ret;
}
5002 unsigned char btrfs_filetype_table
[] = {
5003 DT_UNKNOWN
, DT_REG
, DT_DIR
, DT_CHR
, DT_BLK
, DT_FIFO
, DT_SOCK
, DT_LNK
5006 static int btrfs_real_readdir(struct file
*file
, struct dir_context
*ctx
)
5008 struct inode
*inode
= file_inode(file
);
5009 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
5010 struct btrfs_item
*item
;
5011 struct btrfs_dir_item
*di
;
5012 struct btrfs_key key
;
5013 struct btrfs_key found_key
;
5014 struct btrfs_path
*path
;
5015 struct list_head ins_list
;
5016 struct list_head del_list
;
5018 struct extent_buffer
*leaf
;
5020 unsigned char d_type
;
5025 int key_type
= BTRFS_DIR_INDEX_KEY
;
5029 int is_curr
= 0; /* ctx->pos points to the current index? */
5031 /* FIXME, use a real flag for deciding about the key type */
5032 if (root
->fs_info
->tree_root
== root
)
5033 key_type
= BTRFS_DIR_ITEM_KEY
;
5035 if (!dir_emit_dots(file
, ctx
))
5038 path
= btrfs_alloc_path();
5044 if (key_type
== BTRFS_DIR_INDEX_KEY
) {
5045 INIT_LIST_HEAD(&ins_list
);
5046 INIT_LIST_HEAD(&del_list
);
5047 btrfs_get_delayed_items(inode
, &ins_list
, &del_list
);
5050 btrfs_set_key_type(&key
, key_type
);
5051 key
.offset
= ctx
->pos
;
5052 key
.objectid
= btrfs_ino(inode
);
5054 ret
= btrfs_search_slot(NULL
, root
, &key
, path
, 0, 0);
5059 leaf
= path
->nodes
[0];
5060 slot
= path
->slots
[0];
5061 if (slot
>= btrfs_header_nritems(leaf
)) {
5062 ret
= btrfs_next_leaf(root
, path
);
5070 item
= btrfs_item_nr(leaf
, slot
);
5071 btrfs_item_key_to_cpu(leaf
, &found_key
, slot
);
5073 if (found_key
.objectid
!= key
.objectid
)
5075 if (btrfs_key_type(&found_key
) != key_type
)
5077 if (found_key
.offset
< ctx
->pos
)
5079 if (key_type
== BTRFS_DIR_INDEX_KEY
&&
5080 btrfs_should_delete_dir_index(&del_list
,
5084 ctx
->pos
= found_key
.offset
;
5087 di
= btrfs_item_ptr(leaf
, slot
, struct btrfs_dir_item
);
5089 di_total
= btrfs_item_size(leaf
, item
);
5091 while (di_cur
< di_total
) {
5092 struct btrfs_key location
;
5094 if (verify_dir_item(root
, leaf
, di
))
5097 name_len
= btrfs_dir_name_len(leaf
, di
);
5098 if (name_len
<= sizeof(tmp_name
)) {
5099 name_ptr
= tmp_name
;
5101 name_ptr
= kmalloc(name_len
, GFP_NOFS
);
5107 read_extent_buffer(leaf
, name_ptr
,
5108 (unsigned long)(di
+ 1), name_len
);
5110 d_type
= btrfs_filetype_table
[btrfs_dir_type(leaf
, di
)];
5111 btrfs_dir_item_key_to_cpu(leaf
, di
, &location
);
5114 /* is this a reference to our own snapshot? If so
5117 * In contrast to old kernels, we insert the snapshot's
5118 * dir item and dir index after it has been created, so
5119 * we won't find a reference to our own snapshot. We
5120 * still keep the following code for backward
5123 if (location
.type
== BTRFS_ROOT_ITEM_KEY
&&
5124 location
.objectid
== root
->root_key
.objectid
) {
5128 over
= !dir_emit(ctx
, name_ptr
, name_len
,
5129 location
.objectid
, d_type
);
5132 if (name_ptr
!= tmp_name
)
5137 di_len
= btrfs_dir_name_len(leaf
, di
) +
5138 btrfs_dir_data_len(leaf
, di
) + sizeof(*di
);
5140 di
= (struct btrfs_dir_item
*)((char *)di
+ di_len
);
5146 if (key_type
== BTRFS_DIR_INDEX_KEY
) {
5149 ret
= btrfs_readdir_delayed_dir_index(ctx
, &ins_list
);
5154 /* Reached end of directory/root. Bump pos past the last item. */
5158 * Stop new entries from being returned after we return the last
5161 * New directory entries are assigned a strictly increasing
5162 * offset. This means that new entries created during readdir
5163 * are *guaranteed* to be seen in the future by that readdir.
5164 * This has broken buggy programs which operate on names as
5165 * they're returned by readdir. Until we re-use freed offsets
5166 * we have this hack to stop new entries from being returned
5167 * under the assumption that they'll never reach this huge
5170 * This is being careful not to overflow 32bit loff_t unless the
5171 * last entry requires it because doing so has broken 32bit apps
5174 if (key_type
== BTRFS_DIR_INDEX_KEY
) {
5175 if (ctx
->pos
>= INT_MAX
)
5176 ctx
->pos
= LLONG_MAX
;
5183 if (key_type
== BTRFS_DIR_INDEX_KEY
)
5184 btrfs_put_delayed_items(&ins_list
, &del_list
);
5185 btrfs_free_path(path
);
5189 int btrfs_write_inode(struct inode
*inode
, struct writeback_control
*wbc
)
5191 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
5192 struct btrfs_trans_handle
*trans
;
5194 bool nolock
= false;
5196 if (test_bit(BTRFS_INODE_DUMMY
, &BTRFS_I(inode
)->runtime_flags
))
5199 if (btrfs_fs_closing(root
->fs_info
) && btrfs_is_free_space_inode(inode
))
5202 if (wbc
->sync_mode
== WB_SYNC_ALL
) {
5204 trans
= btrfs_join_transaction_nolock(root
);
5206 trans
= btrfs_join_transaction(root
);
5208 return PTR_ERR(trans
);
5209 ret
= btrfs_commit_transaction(trans
, root
);
5215 * This is somewhat expensive, updating the tree every time the
5216 * inode changes. But, it is most likely to find the inode in cache.
5217 * FIXME, needs more benchmarking...there are no reasons other than performance
5218 * to keep or drop this code.
5220 static int btrfs_dirty_inode(struct inode
*inode
)
5222 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
5223 struct btrfs_trans_handle
*trans
;
5226 if (test_bit(BTRFS_INODE_DUMMY
, &BTRFS_I(inode
)->runtime_flags
))
5229 trans
= btrfs_join_transaction(root
);
5231 return PTR_ERR(trans
);
5233 ret
= btrfs_update_inode(trans
, root
, inode
);
5234 if (ret
&& ret
== -ENOSPC
) {
5235 /* whoops, lets try again with the full transaction */
5236 btrfs_end_transaction(trans
, root
);
5237 trans
= btrfs_start_transaction(root
, 1);
5239 return PTR_ERR(trans
);
5241 ret
= btrfs_update_inode(trans
, root
, inode
);
5243 btrfs_end_transaction(trans
, root
);
5244 if (BTRFS_I(inode
)->delayed_node
)
5245 btrfs_balance_delayed_items(root
);
5251 * This is a copy of file_update_time. We need this so we can return error on
5252 * ENOSPC for updating the inode in the case of file write and mmap writes.
5254 static int btrfs_update_time(struct inode
*inode
, struct timespec
*now
,
5257 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
5259 if (btrfs_root_readonly(root
))
5262 if (flags
& S_VERSION
)
5263 inode_inc_iversion(inode
);
5264 if (flags
& S_CTIME
)
5265 inode
->i_ctime
= *now
;
5266 if (flags
& S_MTIME
)
5267 inode
->i_mtime
= *now
;
5268 if (flags
& S_ATIME
)
5269 inode
->i_atime
= *now
;
5270 return btrfs_dirty_inode(inode
);
5274 * find the highest existing sequence number in a directory
5275 * and then set the in-memory index_cnt variable to reflect
5276 * free sequence numbers
5278 static int btrfs_set_inode_index_count(struct inode
*inode
)
5280 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
5281 struct btrfs_key key
, found_key
;
5282 struct btrfs_path
*path
;
5283 struct extent_buffer
*leaf
;
5286 key
.objectid
= btrfs_ino(inode
);
5287 btrfs_set_key_type(&key
, BTRFS_DIR_INDEX_KEY
);
5288 key
.offset
= (u64
)-1;
5290 path
= btrfs_alloc_path();
5294 ret
= btrfs_search_slot(NULL
, root
, &key
, path
, 0, 0);
5297 /* FIXME: we should be able to handle this */
5303 * MAGIC NUMBER EXPLANATION:
5304 * since we search a directory based on f_pos we have to start at 2
5305 * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
5306 * else has to start at 2
5308 if (path
->slots
[0] == 0) {
5309 BTRFS_I(inode
)->index_cnt
= 2;
5315 leaf
= path
->nodes
[0];
5316 btrfs_item_key_to_cpu(leaf
, &found_key
, path
->slots
[0]);
5318 if (found_key
.objectid
!= btrfs_ino(inode
) ||
5319 btrfs_key_type(&found_key
) != BTRFS_DIR_INDEX_KEY
) {
5320 BTRFS_I(inode
)->index_cnt
= 2;
5324 BTRFS_I(inode
)->index_cnt
= found_key
.offset
+ 1;
5326 btrfs_free_path(path
);
5331 * helper to find a free sequence number in a given directory. This current
5332 * code is very simple, later versions will do smarter things in the btree
5334 int btrfs_set_inode_index(struct inode
*dir
, u64
*index
)
5338 if (BTRFS_I(dir
)->index_cnt
== (u64
)-1) {
5339 ret
= btrfs_inode_delayed_dir_index_count(dir
);
5341 ret
= btrfs_set_inode_index_count(dir
);
5347 *index
= BTRFS_I(dir
)->index_cnt
;
5348 BTRFS_I(dir
)->index_cnt
++;
5353 static struct inode
*btrfs_new_inode(struct btrfs_trans_handle
*trans
,
5354 struct btrfs_root
*root
,
5356 const char *name
, int name_len
,
5357 u64 ref_objectid
, u64 objectid
,
5358 umode_t mode
, u64
*index
)
5360 struct inode
*inode
;
5361 struct btrfs_inode_item
*inode_item
;
5362 struct btrfs_key
*location
;
5363 struct btrfs_path
*path
;
5364 struct btrfs_inode_ref
*ref
;
5365 struct btrfs_key key
[2];
5371 path
= btrfs_alloc_path();
5373 return ERR_PTR(-ENOMEM
);
5375 inode
= new_inode(root
->fs_info
->sb
);
5377 btrfs_free_path(path
);
5378 return ERR_PTR(-ENOMEM
);
5382 * we have to initialize this early, so we can reclaim the inode
5383 * number if we fail afterwards in this function.
5385 inode
->i_ino
= objectid
;
5388 trace_btrfs_inode_request(dir
);
5390 ret
= btrfs_set_inode_index(dir
, index
);
5392 btrfs_free_path(path
);
5394 return ERR_PTR(ret
);
5398 * index_cnt is ignored for everything but a dir,
5399 * btrfs_get_inode_index_count has an explanation for the magic
5402 BTRFS_I(inode
)->index_cnt
= 2;
5403 BTRFS_I(inode
)->root
= root
;
5404 BTRFS_I(inode
)->generation
= trans
->transid
;
5405 inode
->i_generation
= BTRFS_I(inode
)->generation
;
5408 * We could have gotten an inode number from somebody who was fsynced
5409 * and then removed in this same transaction, so let's just set full
5410 * sync since it will be a full sync anyway and this will blow away the
5411 * old info in the log.
5413 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC
, &BTRFS_I(inode
)->runtime_flags
);
5420 key
[0].objectid
= objectid
;
5421 btrfs_set_key_type(&key
[0], BTRFS_INODE_ITEM_KEY
);
5425 * Start new inodes with an inode_ref. This is slightly more
5426 * efficient for small numbers of hard links since they will
5427 * be packed into one item. Extended refs will kick in if we
5428 * add more hard links than can fit in the ref item.
5430 key
[1].objectid
= objectid
;
5431 btrfs_set_key_type(&key
[1], BTRFS_INODE_REF_KEY
);
5432 key
[1].offset
= ref_objectid
;
5434 sizes
[0] = sizeof(struct btrfs_inode_item
);
5435 sizes
[1] = name_len
+ sizeof(*ref
);
5437 path
->leave_spinning
= 1;
5438 ret
= btrfs_insert_empty_items(trans
, root
, path
, key
, sizes
, 2);
5442 inode_init_owner(inode
, dir
, mode
);
5443 inode_set_bytes(inode
, 0);
5444 inode
->i_mtime
= inode
->i_atime
= inode
->i_ctime
= CURRENT_TIME
;
5445 inode_item
= btrfs_item_ptr(path
->nodes
[0], path
->slots
[0],
5446 struct btrfs_inode_item
);
5447 memset_extent_buffer(path
->nodes
[0], 0, (unsigned long)inode_item
,
5448 sizeof(*inode_item
));
5449 fill_inode_item(trans
, path
->nodes
[0], inode_item
, inode
);
5451 ref
= btrfs_item_ptr(path
->nodes
[0], path
->slots
[0] + 1,
5452 struct btrfs_inode_ref
);
5453 btrfs_set_inode_ref_name_len(path
->nodes
[0], ref
, name_len
);
5454 btrfs_set_inode_ref_index(path
->nodes
[0], ref
, *index
);
5455 ptr
= (unsigned long)(ref
+ 1);
5456 write_extent_buffer(path
->nodes
[0], name
, ptr
, name_len
);
5458 btrfs_mark_buffer_dirty(path
->nodes
[0]);
5459 btrfs_free_path(path
);
5461 location
= &BTRFS_I(inode
)->location
;
5462 location
->objectid
= objectid
;
5463 location
->offset
= 0;
5464 btrfs_set_key_type(location
, BTRFS_INODE_ITEM_KEY
);
5466 btrfs_inherit_iflags(inode
, dir
);
5468 if (S_ISREG(mode
)) {
5469 if (btrfs_test_opt(root
, NODATASUM
))
5470 BTRFS_I(inode
)->flags
|= BTRFS_INODE_NODATASUM
;
5471 if (btrfs_test_opt(root
, NODATACOW
))
5472 BTRFS_I(inode
)->flags
|= BTRFS_INODE_NODATACOW
|
5473 BTRFS_INODE_NODATASUM
;
5476 insert_inode_hash(inode
);
5477 inode_tree_add(inode
);
5479 trace_btrfs_inode_new(inode
);
5480 btrfs_set_inode_last_trans(trans
, inode
);
5482 btrfs_update_root_times(trans
, root
);
5487 BTRFS_I(dir
)->index_cnt
--;
5488 btrfs_free_path(path
);
5490 return ERR_PTR(ret
);
5493 static inline u8
btrfs_inode_type(struct inode
*inode
)
5495 return btrfs_type_by_mode
[(inode
->i_mode
& S_IFMT
) >> S_SHIFT
];
5499 * utility function to add 'inode' into 'parent_inode' with
5500 * a give name and a given sequence number.
5501 * if 'add_backref' is true, also insert a backref from the
5502 * inode to the parent directory.
5504 int btrfs_add_link(struct btrfs_trans_handle
*trans
,
5505 struct inode
*parent_inode
, struct inode
*inode
,
5506 const char *name
, int name_len
, int add_backref
, u64 index
)
5509 struct btrfs_key key
;
5510 struct btrfs_root
*root
= BTRFS_I(parent_inode
)->root
;
5511 u64 ino
= btrfs_ino(inode
);
5512 u64 parent_ino
= btrfs_ino(parent_inode
);
5514 if (unlikely(ino
== BTRFS_FIRST_FREE_OBJECTID
)) {
5515 memcpy(&key
, &BTRFS_I(inode
)->root
->root_key
, sizeof(key
));
5518 btrfs_set_key_type(&key
, BTRFS_INODE_ITEM_KEY
);
5522 if (unlikely(ino
== BTRFS_FIRST_FREE_OBJECTID
)) {
5523 ret
= btrfs_add_root_ref(trans
, root
->fs_info
->tree_root
,
5524 key
.objectid
, root
->root_key
.objectid
,
5525 parent_ino
, index
, name
, name_len
);
5526 } else if (add_backref
) {
5527 ret
= btrfs_insert_inode_ref(trans
, root
, name
, name_len
, ino
,
5531 /* Nothing to clean up yet */
5535 ret
= btrfs_insert_dir_item(trans
, root
, name
, name_len
,
5537 btrfs_inode_type(inode
), index
);
5538 if (ret
== -EEXIST
|| ret
== -EOVERFLOW
)
5541 btrfs_abort_transaction(trans
, root
, ret
);
5545 btrfs_i_size_write(parent_inode
, parent_inode
->i_size
+
5547 inode_inc_iversion(parent_inode
);
5548 parent_inode
->i_mtime
= parent_inode
->i_ctime
= CURRENT_TIME
;
5549 ret
= btrfs_update_inode(trans
, root
, parent_inode
);
5551 btrfs_abort_transaction(trans
, root
, ret
);
5555 if (unlikely(ino
== BTRFS_FIRST_FREE_OBJECTID
)) {
5558 err
= btrfs_del_root_ref(trans
, root
->fs_info
->tree_root
,
5559 key
.objectid
, root
->root_key
.objectid
,
5560 parent_ino
, &local_index
, name
, name_len
);
5562 } else if (add_backref
) {
5566 err
= btrfs_del_inode_ref(trans
, root
, name
, name_len
,
5567 ino
, parent_ino
, &local_index
);
5572 static int btrfs_add_nondir(struct btrfs_trans_handle
*trans
,
5573 struct inode
*dir
, struct dentry
*dentry
,
5574 struct inode
*inode
, int backref
, u64 index
)
5576 int err
= btrfs_add_link(trans
, dir
, inode
,
5577 dentry
->d_name
.name
, dentry
->d_name
.len
,
5584 static int btrfs_mknod(struct inode
*dir
, struct dentry
*dentry
,
5585 umode_t mode
, dev_t rdev
)
5587 struct btrfs_trans_handle
*trans
;
5588 struct btrfs_root
*root
= BTRFS_I(dir
)->root
;
5589 struct inode
*inode
= NULL
;
5595 if (!new_valid_dev(rdev
))
5599 * 2 for inode item and ref
5601 * 1 for xattr if selinux is on
5603 trans
= btrfs_start_transaction(root
, 5);
5605 return PTR_ERR(trans
);
5607 err
= btrfs_find_free_ino(root
, &objectid
);
5611 inode
= btrfs_new_inode(trans
, root
, dir
, dentry
->d_name
.name
,
5612 dentry
->d_name
.len
, btrfs_ino(dir
), objectid
,
5614 if (IS_ERR(inode
)) {
5615 err
= PTR_ERR(inode
);
5619 err
= btrfs_init_inode_security(trans
, inode
, dir
, &dentry
->d_name
);
5626 * If the active LSM wants to access the inode during
5627 * d_instantiate it needs these. Smack checks to see
5628 * if the filesystem supports xattrs by looking at the
5632 inode
->i_op
= &btrfs_special_inode_operations
;
5633 err
= btrfs_add_nondir(trans
, dir
, dentry
, inode
, 0, index
);
5637 init_special_inode(inode
, inode
->i_mode
, rdev
);
5638 btrfs_update_inode(trans
, root
, inode
);
5639 d_instantiate(dentry
, inode
);
5642 btrfs_end_transaction(trans
, root
);
5643 btrfs_btree_balance_dirty(root
);
5645 inode_dec_link_count(inode
);
5651 static int btrfs_create(struct inode
*dir
, struct dentry
*dentry
,
5652 umode_t mode
, bool excl
)
5654 struct btrfs_trans_handle
*trans
;
5655 struct btrfs_root
*root
= BTRFS_I(dir
)->root
;
5656 struct inode
*inode
= NULL
;
5657 int drop_inode_on_err
= 0;
5663 * 2 for inode item and ref
5665 * 1 for xattr if selinux is on
5667 trans
= btrfs_start_transaction(root
, 5);
5669 return PTR_ERR(trans
);
5671 err
= btrfs_find_free_ino(root
, &objectid
);
5675 inode
= btrfs_new_inode(trans
, root
, dir
, dentry
->d_name
.name
,
5676 dentry
->d_name
.len
, btrfs_ino(dir
), objectid
,
5678 if (IS_ERR(inode
)) {
5679 err
= PTR_ERR(inode
);
5682 drop_inode_on_err
= 1;
5684 err
= btrfs_init_inode_security(trans
, inode
, dir
, &dentry
->d_name
);
5688 err
= btrfs_update_inode(trans
, root
, inode
);
5693 * If the active LSM wants to access the inode during
5694 * d_instantiate it needs these. Smack checks to see
5695 * if the filesystem supports xattrs by looking at the
5698 inode
->i_fop
= &btrfs_file_operations
;
5699 inode
->i_op
= &btrfs_file_inode_operations
;
5701 err
= btrfs_add_nondir(trans
, dir
, dentry
, inode
, 0, index
);
5705 inode
->i_mapping
->a_ops
= &btrfs_aops
;
5706 inode
->i_mapping
->backing_dev_info
= &root
->fs_info
->bdi
;
5707 BTRFS_I(inode
)->io_tree
.ops
= &btrfs_extent_io_ops
;
5708 d_instantiate(dentry
, inode
);
5711 btrfs_end_transaction(trans
, root
);
5712 if (err
&& drop_inode_on_err
) {
5713 inode_dec_link_count(inode
);
5716 btrfs_btree_balance_dirty(root
);
5720 static int btrfs_link(struct dentry
*old_dentry
, struct inode
*dir
,
5721 struct dentry
*dentry
)
5723 struct btrfs_trans_handle
*trans
;
5724 struct btrfs_root
*root
= BTRFS_I(dir
)->root
;
5725 struct inode
*inode
= old_dentry
->d_inode
;
5730 /* do not allow sys_link's with other subvols of the same device */
5731 if (root
->objectid
!= BTRFS_I(inode
)->root
->objectid
)
5734 if (inode
->i_nlink
>= BTRFS_LINK_MAX
)
5737 err
= btrfs_set_inode_index(dir
, &index
);
5742 * 2 items for inode and inode ref
5743 * 2 items for dir items
5744 * 1 item for parent inode
5746 trans
= btrfs_start_transaction(root
, 5);
5747 if (IS_ERR(trans
)) {
5748 err
= PTR_ERR(trans
);
5752 btrfs_inc_nlink(inode
);
5753 inode_inc_iversion(inode
);
5754 inode
->i_ctime
= CURRENT_TIME
;
5756 set_bit(BTRFS_INODE_COPY_EVERYTHING
, &BTRFS_I(inode
)->runtime_flags
);
5758 err
= btrfs_add_nondir(trans
, dir
, dentry
, inode
, 1, index
);
5763 struct dentry
*parent
= dentry
->d_parent
;
5764 err
= btrfs_update_inode(trans
, root
, inode
);
5767 d_instantiate(dentry
, inode
);
5768 btrfs_log_new_name(trans
, inode
, NULL
, parent
);
5771 btrfs_end_transaction(trans
, root
);
5774 inode_dec_link_count(inode
);
5777 btrfs_btree_balance_dirty(root
);
5781 static int btrfs_mkdir(struct inode
*dir
, struct dentry
*dentry
, umode_t mode
)
5783 struct inode
*inode
= NULL
;
5784 struct btrfs_trans_handle
*trans
;
5785 struct btrfs_root
*root
= BTRFS_I(dir
)->root
;
5787 int drop_on_err
= 0;
5792 * 2 items for inode and ref
5793 * 2 items for dir items
5794 * 1 for xattr if selinux is on
5796 trans
= btrfs_start_transaction(root
, 5);
5798 return PTR_ERR(trans
);
5800 err
= btrfs_find_free_ino(root
, &objectid
);
5804 inode
= btrfs_new_inode(trans
, root
, dir
, dentry
->d_name
.name
,
5805 dentry
->d_name
.len
, btrfs_ino(dir
), objectid
,
5806 S_IFDIR
| mode
, &index
);
5807 if (IS_ERR(inode
)) {
5808 err
= PTR_ERR(inode
);
5814 err
= btrfs_init_inode_security(trans
, inode
, dir
, &dentry
->d_name
);
5818 inode
->i_op
= &btrfs_dir_inode_operations
;
5819 inode
->i_fop
= &btrfs_dir_file_operations
;
5821 btrfs_i_size_write(inode
, 0);
5822 err
= btrfs_update_inode(trans
, root
, inode
);
5826 err
= btrfs_add_link(trans
, dir
, inode
, dentry
->d_name
.name
,
5827 dentry
->d_name
.len
, 0, index
);
5831 d_instantiate(dentry
, inode
);
5835 btrfs_end_transaction(trans
, root
);
5838 btrfs_btree_balance_dirty(root
);
5842 /* helper for btfs_get_extent. Given an existing extent in the tree,
5843 * and an extent that you want to insert, deal with overlap and insert
5844 * the new extent into the tree.
5846 static int merge_extent_mapping(struct extent_map_tree
*em_tree
,
5847 struct extent_map
*existing
,
5848 struct extent_map
*em
,
5849 u64 map_start
, u64 map_len
)
5853 BUG_ON(map_start
< em
->start
|| map_start
>= extent_map_end(em
));
5854 start_diff
= map_start
- em
->start
;
5855 em
->start
= map_start
;
5857 if (em
->block_start
< EXTENT_MAP_LAST_BYTE
&&
5858 !test_bit(EXTENT_FLAG_COMPRESSED
, &em
->flags
)) {
5859 em
->block_start
+= start_diff
;
5860 em
->block_len
-= start_diff
;
5862 return add_extent_mapping(em_tree
, em
, 0);
5865 static noinline
int uncompress_inline(struct btrfs_path
*path
,
5866 struct inode
*inode
, struct page
*page
,
5867 size_t pg_offset
, u64 extent_offset
,
5868 struct btrfs_file_extent_item
*item
)
5871 struct extent_buffer
*leaf
= path
->nodes
[0];
5874 unsigned long inline_size
;
5878 WARN_ON(pg_offset
!= 0);
5879 compress_type
= btrfs_file_extent_compression(leaf
, item
);
5880 max_size
= btrfs_file_extent_ram_bytes(leaf
, item
);
5881 inline_size
= btrfs_file_extent_inline_item_len(leaf
,
5882 btrfs_item_nr(leaf
, path
->slots
[0]));
5883 tmp
= kmalloc(inline_size
, GFP_NOFS
);
5886 ptr
= btrfs_file_extent_inline_start(item
);
5888 read_extent_buffer(leaf
, tmp
, ptr
, inline_size
);
5890 max_size
= min_t(unsigned long, PAGE_CACHE_SIZE
, max_size
);
5891 ret
= btrfs_decompress(compress_type
, tmp
, page
,
5892 extent_offset
, inline_size
, max_size
);
5894 char *kaddr
= kmap_atomic(page
);
5895 unsigned long copy_size
= min_t(u64
,
5896 PAGE_CACHE_SIZE
- pg_offset
,
5897 max_size
- extent_offset
);
5898 memset(kaddr
+ pg_offset
, 0, copy_size
);
5899 kunmap_atomic(kaddr
);
5906 * a bit scary, this does extent mapping from logical file offset to the disk.
5907 * the ugly parts come from merging extents from the disk with the in-ram
5908 * representation. This gets more complex because of the data=ordered code,
5909 * where the in-ram extents might be locked pending data=ordered completion.
5911 * This also copies inline extents directly into the page.
5914 struct extent_map
*btrfs_get_extent(struct inode
*inode
, struct page
*page
,
5915 size_t pg_offset
, u64 start
, u64 len
,
5921 u64 extent_start
= 0;
5923 u64 objectid
= btrfs_ino(inode
);
5925 struct btrfs_path
*path
= NULL
;
5926 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
5927 struct btrfs_file_extent_item
*item
;
5928 struct extent_buffer
*leaf
;
5929 struct btrfs_key found_key
;
5930 struct extent_map
*em
= NULL
;
5931 struct extent_map_tree
*em_tree
= &BTRFS_I(inode
)->extent_tree
;
5932 struct extent_io_tree
*io_tree
= &BTRFS_I(inode
)->io_tree
;
5933 struct btrfs_trans_handle
*trans
= NULL
;
5937 read_lock(&em_tree
->lock
);
5938 em
= lookup_extent_mapping(em_tree
, start
, len
);
5940 em
->bdev
= root
->fs_info
->fs_devices
->latest_bdev
;
5941 read_unlock(&em_tree
->lock
);
5944 if (em
->start
> start
|| em
->start
+ em
->len
<= start
)
5945 free_extent_map(em
);
5946 else if (em
->block_start
== EXTENT_MAP_INLINE
&& page
)
5947 free_extent_map(em
);
5951 em
= alloc_extent_map();
5956 em
->bdev
= root
->fs_info
->fs_devices
->latest_bdev
;
5957 em
->start
= EXTENT_MAP_HOLE
;
5958 em
->orig_start
= EXTENT_MAP_HOLE
;
5960 em
->block_len
= (u64
)-1;
5963 path
= btrfs_alloc_path();
5969 * Chances are we'll be called again, so go ahead and do
5975 ret
= btrfs_lookup_file_extent(trans
, root
, path
,
5976 objectid
, start
, trans
!= NULL
);
5983 if (path
->slots
[0] == 0)
5988 leaf
= path
->nodes
[0];
5989 item
= btrfs_item_ptr(leaf
, path
->slots
[0],
5990 struct btrfs_file_extent_item
);
5991 /* are we inside the extent that was found? */
5992 btrfs_item_key_to_cpu(leaf
, &found_key
, path
->slots
[0]);
5993 found_type
= btrfs_key_type(&found_key
);
5994 if (found_key
.objectid
!= objectid
||
5995 found_type
!= BTRFS_EXTENT_DATA_KEY
) {
5999 found_type
= btrfs_file_extent_type(leaf
, item
);
6000 extent_start
= found_key
.offset
;
6001 compress_type
= btrfs_file_extent_compression(leaf
, item
);
6002 if (found_type
== BTRFS_FILE_EXTENT_REG
||
6003 found_type
== BTRFS_FILE_EXTENT_PREALLOC
) {
6004 extent_end
= extent_start
+
6005 btrfs_file_extent_num_bytes(leaf
, item
);
6006 } else if (found_type
== BTRFS_FILE_EXTENT_INLINE
) {
6008 size
= btrfs_file_extent_inline_len(leaf
, item
);
6009 extent_end
= ALIGN(extent_start
+ size
, root
->sectorsize
);
6012 if (start
>= extent_end
) {
6014 if (path
->slots
[0] >= btrfs_header_nritems(leaf
)) {
6015 ret
= btrfs_next_leaf(root
, path
);
6022 leaf
= path
->nodes
[0];
6024 btrfs_item_key_to_cpu(leaf
, &found_key
, path
->slots
[0]);
6025 if (found_key
.objectid
!= objectid
||
6026 found_key
.type
!= BTRFS_EXTENT_DATA_KEY
)
6028 if (start
+ len
<= found_key
.offset
)
6031 em
->orig_start
= start
;
6032 em
->len
= found_key
.offset
- start
;
6036 em
->ram_bytes
= btrfs_file_extent_ram_bytes(leaf
, item
);
6037 if (found_type
== BTRFS_FILE_EXTENT_REG
||
6038 found_type
== BTRFS_FILE_EXTENT_PREALLOC
) {
6039 em
->start
= extent_start
;
6040 em
->len
= extent_end
- extent_start
;
6041 em
->orig_start
= extent_start
-
6042 btrfs_file_extent_offset(leaf
, item
);
6043 em
->orig_block_len
= btrfs_file_extent_disk_num_bytes(leaf
,
6045 bytenr
= btrfs_file_extent_disk_bytenr(leaf
, item
);
6047 em
->block_start
= EXTENT_MAP_HOLE
;
6050 if (compress_type
!= BTRFS_COMPRESS_NONE
) {
6051 set_bit(EXTENT_FLAG_COMPRESSED
, &em
->flags
);
6052 em
->compress_type
= compress_type
;
6053 em
->block_start
= bytenr
;
6054 em
->block_len
= em
->orig_block_len
;
6056 bytenr
+= btrfs_file_extent_offset(leaf
, item
);
6057 em
->block_start
= bytenr
;
6058 em
->block_len
= em
->len
;
6059 if (found_type
== BTRFS_FILE_EXTENT_PREALLOC
)
6060 set_bit(EXTENT_FLAG_PREALLOC
, &em
->flags
);
6063 } else if (found_type
== BTRFS_FILE_EXTENT_INLINE
) {
6067 size_t extent_offset
;
6070 em
->block_start
= EXTENT_MAP_INLINE
;
6071 if (!page
|| create
) {
6072 em
->start
= extent_start
;
6073 em
->len
= extent_end
- extent_start
;
6077 size
= btrfs_file_extent_inline_len(leaf
, item
);
6078 extent_offset
= page_offset(page
) + pg_offset
- extent_start
;
6079 copy_size
= min_t(u64
, PAGE_CACHE_SIZE
- pg_offset
,
6080 size
- extent_offset
);
6081 em
->start
= extent_start
+ extent_offset
;
6082 em
->len
= ALIGN(copy_size
, root
->sectorsize
);
6083 em
->orig_block_len
= em
->len
;
6084 em
->orig_start
= em
->start
;
6085 if (compress_type
) {
6086 set_bit(EXTENT_FLAG_COMPRESSED
, &em
->flags
);
6087 em
->compress_type
= compress_type
;
6089 ptr
= btrfs_file_extent_inline_start(item
) + extent_offset
;
6090 if (create
== 0 && !PageUptodate(page
)) {
6091 if (btrfs_file_extent_compression(leaf
, item
) !=
6092 BTRFS_COMPRESS_NONE
) {
6093 ret
= uncompress_inline(path
, inode
, page
,
6095 extent_offset
, item
);
6096 BUG_ON(ret
); /* -ENOMEM */
6099 read_extent_buffer(leaf
, map
+ pg_offset
, ptr
,
6101 if (pg_offset
+ copy_size
< PAGE_CACHE_SIZE
) {
6102 memset(map
+ pg_offset
+ copy_size
, 0,
6103 PAGE_CACHE_SIZE
- pg_offset
-
6108 flush_dcache_page(page
);
6109 } else if (create
&& PageUptodate(page
)) {
6113 free_extent_map(em
);
6116 btrfs_release_path(path
);
6117 trans
= btrfs_join_transaction(root
);
6120 return ERR_CAST(trans
);
6124 write_extent_buffer(leaf
, map
+ pg_offset
, ptr
,
6127 btrfs_mark_buffer_dirty(leaf
);
6129 set_extent_uptodate(io_tree
, em
->start
,
6130 extent_map_end(em
) - 1, NULL
, GFP_NOFS
);
6133 WARN(1, KERN_ERR
"btrfs unknown found_type %d\n", found_type
);
6137 em
->orig_start
= start
;
6140 em
->block_start
= EXTENT_MAP_HOLE
;
6141 set_bit(EXTENT_FLAG_VACANCY
, &em
->flags
);
6143 btrfs_release_path(path
);
6144 if (em
->start
> start
|| extent_map_end(em
) <= start
) {
6145 btrfs_err(root
->fs_info
, "bad extent! em: [%llu %llu] passed [%llu %llu]",
6146 (unsigned long long)em
->start
,
6147 (unsigned long long)em
->len
,
6148 (unsigned long long)start
,
6149 (unsigned long long)len
);
6155 write_lock(&em_tree
->lock
);
6156 ret
= add_extent_mapping(em_tree
, em
, 0);
6157 /* it is possible that someone inserted the extent into the tree
6158 * while we had the lock dropped. It is also possible that
6159 * an overlapping map exists in the tree
6161 if (ret
== -EEXIST
) {
6162 struct extent_map
*existing
;
6166 existing
= lookup_extent_mapping(em_tree
, start
, len
);
6167 if (existing
&& (existing
->start
> start
||
6168 existing
->start
+ existing
->len
<= start
)) {
6169 free_extent_map(existing
);
6173 existing
= lookup_extent_mapping(em_tree
, em
->start
,
6176 err
= merge_extent_mapping(em_tree
, existing
,
6179 free_extent_map(existing
);
6181 free_extent_map(em
);
6186 free_extent_map(em
);
6190 free_extent_map(em
);
6195 write_unlock(&em_tree
->lock
);
6199 trace_btrfs_get_extent(root
, em
);
6202 btrfs_free_path(path
);
6204 ret
= btrfs_end_transaction(trans
, root
);
6209 free_extent_map(em
);
6210 return ERR_PTR(err
);
6212 BUG_ON(!em
); /* Error is always set */
6216 struct extent_map
*btrfs_get_extent_fiemap(struct inode
*inode
, struct page
*page
,
6217 size_t pg_offset
, u64 start
, u64 len
,
6220 struct extent_map
*em
;
6221 struct extent_map
*hole_em
= NULL
;
6222 u64 range_start
= start
;
6228 em
= btrfs_get_extent(inode
, page
, pg_offset
, start
, len
, create
);
6235 * - a pre-alloc extent,
6236 * there might actually be delalloc bytes behind it.
6238 if (em
->block_start
!= EXTENT_MAP_HOLE
&&
6239 !test_bit(EXTENT_FLAG_PREALLOC
, &em
->flags
))
6245 /* check to see if we've wrapped (len == -1 or similar) */
6254 /* ok, we didn't find anything, lets look for delalloc */
6255 found
= count_range_bits(&BTRFS_I(inode
)->io_tree
, &range_start
,
6256 end
, len
, EXTENT_DELALLOC
, 1);
6257 found_end
= range_start
+ found
;
6258 if (found_end
< range_start
)
6259 found_end
= (u64
)-1;
6262 * we didn't find anything useful, return
6263 * the original results from get_extent()
6265 if (range_start
> end
|| found_end
<= start
) {
6271 /* adjust the range_start to make sure it doesn't
6272 * go backwards from the start they passed in
6274 range_start
= max(start
,range_start
);
6275 found
= found_end
- range_start
;
6278 u64 hole_start
= start
;
6281 em
= alloc_extent_map();
6287 * when btrfs_get_extent can't find anything it
6288 * returns one huge hole
6290 * make sure what it found really fits our range, and
6291 * adjust to make sure it is based on the start from
6295 u64 calc_end
= extent_map_end(hole_em
);
6297 if (calc_end
<= start
|| (hole_em
->start
> end
)) {
6298 free_extent_map(hole_em
);
6301 hole_start
= max(hole_em
->start
, start
);
6302 hole_len
= calc_end
- hole_start
;
6306 if (hole_em
&& range_start
> hole_start
) {
6307 /* our hole starts before our delalloc, so we
6308 * have to return just the parts of the hole
6309 * that go until the delalloc starts
6311 em
->len
= min(hole_len
,
6312 range_start
- hole_start
);
6313 em
->start
= hole_start
;
6314 em
->orig_start
= hole_start
;
6316 * don't adjust block start at all,
6317 * it is fixed at EXTENT_MAP_HOLE
6319 em
->block_start
= hole_em
->block_start
;
6320 em
->block_len
= hole_len
;
6321 if (test_bit(EXTENT_FLAG_PREALLOC
, &hole_em
->flags
))
6322 set_bit(EXTENT_FLAG_PREALLOC
, &em
->flags
);
6324 em
->start
= range_start
;
6326 em
->orig_start
= range_start
;
6327 em
->block_start
= EXTENT_MAP_DELALLOC
;
6328 em
->block_len
= found
;
6330 } else if (hole_em
) {
6335 free_extent_map(hole_em
);
6337 free_extent_map(em
);
6338 return ERR_PTR(err
);
6343 static struct extent_map
*btrfs_new_extent_direct(struct inode
*inode
,
6346 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
6347 struct btrfs_trans_handle
*trans
;
6348 struct extent_map
*em
;
6349 struct btrfs_key ins
;
6353 trans
= btrfs_join_transaction(root
);
6355 return ERR_CAST(trans
);
6357 trans
->block_rsv
= &root
->fs_info
->delalloc_block_rsv
;
6359 alloc_hint
= get_extent_allocation_hint(inode
, start
, len
);
6360 ret
= btrfs_reserve_extent(trans
, root
, len
, root
->sectorsize
, 0,
6361 alloc_hint
, &ins
, 1);
6367 em
= create_pinned_em(inode
, start
, ins
.offset
, start
, ins
.objectid
,
6368 ins
.offset
, ins
.offset
, ins
.offset
, 0);
6372 ret
= btrfs_add_ordered_extent_dio(inode
, start
, ins
.objectid
,
6373 ins
.offset
, ins
.offset
, 0);
6375 btrfs_free_reserved_extent(root
, ins
.objectid
, ins
.offset
);
6379 btrfs_end_transaction(trans
, root
);
6384 * returns 1 when the nocow is safe, < 1 on error, 0 if the
6385 * block must be cow'd
6387 noinline
int can_nocow_extent(struct btrfs_trans_handle
*trans
,
6388 struct inode
*inode
, u64 offset
, u64
*len
,
6389 u64
*orig_start
, u64
*orig_block_len
,
6392 struct btrfs_path
*path
;
6394 struct extent_buffer
*leaf
;
6395 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
6396 struct btrfs_file_extent_item
*fi
;
6397 struct btrfs_key key
;
6404 bool nocow
= (BTRFS_I(inode
)->flags
& BTRFS_INODE_NODATACOW
);
6405 path
= btrfs_alloc_path();
6409 ret
= btrfs_lookup_file_extent(trans
, root
, path
, btrfs_ino(inode
),
6414 slot
= path
->slots
[0];
6417 /* can't find the item, must cow */
6424 leaf
= path
->nodes
[0];
6425 btrfs_item_key_to_cpu(leaf
, &key
, slot
);
6426 if (key
.objectid
!= btrfs_ino(inode
) ||
6427 key
.type
!= BTRFS_EXTENT_DATA_KEY
) {
6428 /* not our file or wrong item type, must cow */
6432 if (key
.offset
> offset
) {
6433 /* Wrong offset, must cow */
6437 fi
= btrfs_item_ptr(leaf
, slot
, struct btrfs_file_extent_item
);
6438 found_type
= btrfs_file_extent_type(leaf
, fi
);
6439 if (found_type
!= BTRFS_FILE_EXTENT_REG
&&
6440 found_type
!= BTRFS_FILE_EXTENT_PREALLOC
) {
6441 /* not a regular extent, must cow */
6445 if (!nocow
&& found_type
== BTRFS_FILE_EXTENT_REG
)
6448 disk_bytenr
= btrfs_file_extent_disk_bytenr(leaf
, fi
);
6449 if (disk_bytenr
== 0)
6452 if (btrfs_file_extent_compression(leaf
, fi
) ||
6453 btrfs_file_extent_encryption(leaf
, fi
) ||
6454 btrfs_file_extent_other_encoding(leaf
, fi
))
6457 backref_offset
= btrfs_file_extent_offset(leaf
, fi
);
6460 *orig_start
= key
.offset
- backref_offset
;
6461 *orig_block_len
= btrfs_file_extent_disk_num_bytes(leaf
, fi
);
6462 *ram_bytes
= btrfs_file_extent_ram_bytes(leaf
, fi
);
6465 extent_end
= key
.offset
+ btrfs_file_extent_num_bytes(leaf
, fi
);
6467 if (btrfs_extent_readonly(root
, disk_bytenr
))
6471 * look for other files referencing this extent, if we
6472 * find any we must cow
6474 if (btrfs_cross_ref_exist(trans
, root
, btrfs_ino(inode
),
6475 key
.offset
- backref_offset
, disk_bytenr
))
6479 * adjust disk_bytenr and num_bytes to cover just the bytes
6480 * in this extent we are about to write. If there
6481 * are any csums in that range we have to cow in order
6482 * to keep the csums correct
6484 disk_bytenr
+= backref_offset
;
6485 disk_bytenr
+= offset
- key
.offset
;
6486 num_bytes
= min(offset
+ *len
, extent_end
) - offset
;
6487 if (csum_exist_in_range(root
, disk_bytenr
, num_bytes
))
6490 * all of the above have passed, it is safe to overwrite this extent
6496 btrfs_free_path(path
);
6500 static int lock_extent_direct(struct inode
*inode
, u64 lockstart
, u64 lockend
,
6501 struct extent_state
**cached_state
, int writing
)
6503 struct btrfs_ordered_extent
*ordered
;
6507 lock_extent_bits(&BTRFS_I(inode
)->io_tree
, lockstart
, lockend
,
6510 * We're concerned with the entire range that we're going to be
6511 * doing DIO to, so we need to make sure theres no ordered
6512 * extents in this range.
6514 ordered
= btrfs_lookup_ordered_range(inode
, lockstart
,
6515 lockend
- lockstart
+ 1);
6518 * We need to make sure there are no buffered pages in this
6519 * range either, we could have raced between the invalidate in
6520 * generic_file_direct_write and locking the extent. The
6521 * invalidate needs to happen so that reads after a write do not
6524 if (!ordered
&& (!writing
||
6525 !test_range_bit(&BTRFS_I(inode
)->io_tree
,
6526 lockstart
, lockend
, EXTENT_UPTODATE
, 0,
6530 unlock_extent_cached(&BTRFS_I(inode
)->io_tree
, lockstart
, lockend
,
6531 cached_state
, GFP_NOFS
);
6534 btrfs_start_ordered_extent(inode
, ordered
, 1);
6535 btrfs_put_ordered_extent(ordered
);
6537 /* Screw you mmap */
6538 ret
= filemap_write_and_wait_range(inode
->i_mapping
,
6545 * If we found a page that couldn't be invalidated just
6546 * fall back to buffered.
6548 ret
= invalidate_inode_pages2_range(inode
->i_mapping
,
6549 lockstart
>> PAGE_CACHE_SHIFT
,
6550 lockend
>> PAGE_CACHE_SHIFT
);
6561 static struct extent_map
*create_pinned_em(struct inode
*inode
, u64 start
,
6562 u64 len
, u64 orig_start
,
6563 u64 block_start
, u64 block_len
,
6564 u64 orig_block_len
, u64 ram_bytes
,
6567 struct extent_map_tree
*em_tree
;
6568 struct extent_map
*em
;
6569 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
6572 em_tree
= &BTRFS_I(inode
)->extent_tree
;
6573 em
= alloc_extent_map();
6575 return ERR_PTR(-ENOMEM
);
6578 em
->orig_start
= orig_start
;
6579 em
->mod_start
= start
;
6582 em
->block_len
= block_len
;
6583 em
->block_start
= block_start
;
6584 em
->bdev
= root
->fs_info
->fs_devices
->latest_bdev
;
6585 em
->orig_block_len
= orig_block_len
;
6586 em
->ram_bytes
= ram_bytes
;
6587 em
->generation
= -1;
6588 set_bit(EXTENT_FLAG_PINNED
, &em
->flags
);
6589 if (type
== BTRFS_ORDERED_PREALLOC
)
6590 set_bit(EXTENT_FLAG_FILLING
, &em
->flags
);
6593 btrfs_drop_extent_cache(inode
, em
->start
,
6594 em
->start
+ em
->len
- 1, 0);
6595 write_lock(&em_tree
->lock
);
6596 ret
= add_extent_mapping(em_tree
, em
, 1);
6597 write_unlock(&em_tree
->lock
);
6598 } while (ret
== -EEXIST
);
6601 free_extent_map(em
);
6602 return ERR_PTR(ret
);
6609 static int btrfs_get_blocks_direct(struct inode
*inode
, sector_t iblock
,
6610 struct buffer_head
*bh_result
, int create
)
6612 struct extent_map
*em
;
6613 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
6614 struct extent_state
*cached_state
= NULL
;
6615 u64 start
= iblock
<< inode
->i_blkbits
;
6616 u64 lockstart
, lockend
;
6617 u64 len
= bh_result
->b_size
;
6618 struct btrfs_trans_handle
*trans
;
6619 int unlock_bits
= EXTENT_LOCKED
;
6623 unlock_bits
|= EXTENT_DELALLOC
| EXTENT_DIRTY
;
6625 len
= min_t(u64
, len
, root
->sectorsize
);
6628 lockend
= start
+ len
- 1;
6631 * If this errors out it's because we couldn't invalidate pagecache for
6632 * this range and we need to fallback to buffered.
6634 if (lock_extent_direct(inode
, lockstart
, lockend
, &cached_state
, create
))
6637 em
= btrfs_get_extent(inode
, NULL
, 0, start
, len
, 0);
6644 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
6645 * io. INLINE is special, and we could probably kludge it in here, but
6646 * it's still buffered so for safety lets just fall back to the generic
6649 * For COMPRESSED we _have_ to read the entire extent in so we can
6650 * decompress it, so there will be buffering required no matter what we
6651 * do, so go ahead and fallback to buffered.
6653 * We return -ENOTBLK because thats what makes DIO go ahead and go back
6654 * to buffered IO. Don't blame me, this is the price we pay for using
6657 if (test_bit(EXTENT_FLAG_COMPRESSED
, &em
->flags
) ||
6658 em
->block_start
== EXTENT_MAP_INLINE
) {
6659 free_extent_map(em
);
6664 /* Just a good old fashioned hole, return */
6665 if (!create
&& (em
->block_start
== EXTENT_MAP_HOLE
||
6666 test_bit(EXTENT_FLAG_PREALLOC
, &em
->flags
))) {
6667 free_extent_map(em
);
6672 * We don't allocate a new extent in the following cases
6674 * 1) The inode is marked as NODATACOW. In this case we'll just use the
6676 * 2) The extent is marked as PREALLOC. We're good to go here and can
6677 * just use the extent.
6681 len
= min(len
, em
->len
- (start
- em
->start
));
6682 lockstart
= start
+ len
;
6686 if (test_bit(EXTENT_FLAG_PREALLOC
, &em
->flags
) ||
6687 ((BTRFS_I(inode
)->flags
& BTRFS_INODE_NODATACOW
) &&
6688 em
->block_start
!= EXTENT_MAP_HOLE
)) {
6691 u64 block_start
, orig_start
, orig_block_len
, ram_bytes
;
6693 if (test_bit(EXTENT_FLAG_PREALLOC
, &em
->flags
))
6694 type
= BTRFS_ORDERED_PREALLOC
;
6696 type
= BTRFS_ORDERED_NOCOW
;
6697 len
= min(len
, em
->len
- (start
- em
->start
));
6698 block_start
= em
->block_start
+ (start
- em
->start
);
6701 * we're not going to log anything, but we do need
6702 * to make sure the current transaction stays open
6703 * while we look for nocow cross refs
6705 trans
= btrfs_join_transaction(root
);
6709 if (can_nocow_extent(trans
, inode
, start
, &len
, &orig_start
,
6710 &orig_block_len
, &ram_bytes
) == 1) {
6711 if (type
== BTRFS_ORDERED_PREALLOC
) {
6712 free_extent_map(em
);
6713 em
= create_pinned_em(inode
, start
, len
,
6719 btrfs_end_transaction(trans
, root
);
6724 ret
= btrfs_add_ordered_extent_dio(inode
, start
,
6725 block_start
, len
, len
, type
);
6726 btrfs_end_transaction(trans
, root
);
6728 free_extent_map(em
);
6733 btrfs_end_transaction(trans
, root
);
6737 * this will cow the extent, reset the len in case we changed
6740 len
= bh_result
->b_size
;
6741 free_extent_map(em
);
6742 em
= btrfs_new_extent_direct(inode
, start
, len
);
6747 len
= min(len
, em
->len
- (start
- em
->start
));
6749 bh_result
->b_blocknr
= (em
->block_start
+ (start
- em
->start
)) >>
6751 bh_result
->b_size
= len
;
6752 bh_result
->b_bdev
= em
->bdev
;
6753 set_buffer_mapped(bh_result
);
6755 if (!test_bit(EXTENT_FLAG_PREALLOC
, &em
->flags
))
6756 set_buffer_new(bh_result
);
6759 * Need to update the i_size under the extent lock so buffered
6760 * readers will get the updated i_size when we unlock.
6762 if (start
+ len
> i_size_read(inode
))
6763 i_size_write(inode
, start
+ len
);
6765 spin_lock(&BTRFS_I(inode
)->lock
);
6766 BTRFS_I(inode
)->outstanding_extents
++;
6767 spin_unlock(&BTRFS_I(inode
)->lock
);
6769 ret
= set_extent_bit(&BTRFS_I(inode
)->io_tree
, lockstart
,
6770 lockstart
+ len
- 1, EXTENT_DELALLOC
, NULL
,
6771 &cached_state
, GFP_NOFS
);
6776 * In the case of write we need to clear and unlock the entire range,
6777 * in the case of read we need to unlock only the end area that we
6778 * aren't using if there is any left over space.
6780 if (lockstart
< lockend
) {
6781 clear_extent_bit(&BTRFS_I(inode
)->io_tree
, lockstart
,
6782 lockend
, unlock_bits
, 1, 0,
6783 &cached_state
, GFP_NOFS
);
6785 free_extent_state(cached_state
);
6788 free_extent_map(em
);
6793 clear_extent_bit(&BTRFS_I(inode
)->io_tree
, lockstart
, lockend
,
6794 unlock_bits
, 1, 0, &cached_state
, GFP_NOFS
);
6798 static void btrfs_endio_direct_read(struct bio
*bio
, int err
)
6800 struct btrfs_dio_private
*dip
= bio
->bi_private
;
6801 struct bio_vec
*bvec_end
= bio
->bi_io_vec
+ bio
->bi_vcnt
- 1;
6802 struct bio_vec
*bvec
= bio
->bi_io_vec
;
6803 struct inode
*inode
= dip
->inode
;
6804 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
6805 struct bio
*dio_bio
;
6806 u32
*csums
= (u32
*)dip
->csum
;
6810 start
= dip
->logical_offset
;
6812 if (!(BTRFS_I(inode
)->flags
& BTRFS_INODE_NODATASUM
)) {
6813 struct page
*page
= bvec
->bv_page
;
6816 unsigned long flags
;
6818 local_irq_save(flags
);
6819 kaddr
= kmap_atomic(page
);
6820 csum
= btrfs_csum_data(kaddr
+ bvec
->bv_offset
,
6821 csum
, bvec
->bv_len
);
6822 btrfs_csum_final(csum
, (char *)&csum
);
6823 kunmap_atomic(kaddr
);
6824 local_irq_restore(flags
);
6826 flush_dcache_page(bvec
->bv_page
);
6827 if (csum
!= csums
[index
]) {
6828 btrfs_err(root
->fs_info
, "csum failed ino %llu off %llu csum %u expected csum %u",
6829 (unsigned long long)btrfs_ino(inode
),
6830 (unsigned long long)start
,
6831 csum
, csums
[index
]);
6836 start
+= bvec
->bv_len
;
6839 } while (bvec
<= bvec_end
);
6841 unlock_extent(&BTRFS_I(inode
)->io_tree
, dip
->logical_offset
,
6842 dip
->logical_offset
+ dip
->bytes
- 1);
6843 dio_bio
= dip
->dio_bio
;
6847 /* If we had a csum failure make sure to clear the uptodate flag */
6849 clear_bit(BIO_UPTODATE
, &dio_bio
->bi_flags
);
6850 dio_end_io(dio_bio
, err
);
6854 static void btrfs_endio_direct_write(struct bio
*bio
, int err
)
6856 struct btrfs_dio_private
*dip
= bio
->bi_private
;
6857 struct inode
*inode
= dip
->inode
;
6858 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
6859 struct btrfs_ordered_extent
*ordered
= NULL
;
6860 u64 ordered_offset
= dip
->logical_offset
;
6861 u64 ordered_bytes
= dip
->bytes
;
6862 struct bio
*dio_bio
;
6868 ret
= btrfs_dec_test_first_ordered_pending(inode
, &ordered
,
6870 ordered_bytes
, !err
);
6874 ordered
->work
.func
= finish_ordered_fn
;
6875 ordered
->work
.flags
= 0;
6876 btrfs_queue_worker(&root
->fs_info
->endio_write_workers
,
6880 * our bio might span multiple ordered extents. If we haven't
6881 * completed the accounting for the whole dio, go back and try again
6883 if (ordered_offset
< dip
->logical_offset
+ dip
->bytes
) {
6884 ordered_bytes
= dip
->logical_offset
+ dip
->bytes
-
6890 dio_bio
= dip
->dio_bio
;
6894 /* If we had an error make sure to clear the uptodate flag */
6896 clear_bit(BIO_UPTODATE
, &dio_bio
->bi_flags
);
6897 dio_end_io(dio_bio
, err
);
6901 static int __btrfs_submit_bio_start_direct_io(struct inode
*inode
, int rw
,
6902 struct bio
*bio
, int mirror_num
,
6903 unsigned long bio_flags
, u64 offset
)
6906 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
6907 ret
= btrfs_csum_one_bio(root
, inode
, bio
, offset
, 1);
6908 BUG_ON(ret
); /* -ENOMEM */
6912 static void btrfs_end_dio_bio(struct bio
*bio
, int err
)
6914 struct btrfs_dio_private
*dip
= bio
->bi_private
;
6917 printk(KERN_ERR
"btrfs direct IO failed ino %llu rw %lu "
6918 "sector %#Lx len %u err no %d\n",
6919 (unsigned long long)btrfs_ino(dip
->inode
), bio
->bi_rw
,
6920 (unsigned long long)bio
->bi_sector
, bio
->bi_size
, err
);
6924 * before atomic variable goto zero, we must make sure
6925 * dip->errors is perceived to be set.
6927 smp_mb__before_atomic_dec();
6930 /* if there are more bios still pending for this dio, just exit */
6931 if (!atomic_dec_and_test(&dip
->pending_bios
))
6935 bio_io_error(dip
->orig_bio
);
6937 set_bit(BIO_UPTODATE
, &dip
->dio_bio
->bi_flags
);
6938 bio_endio(dip
->orig_bio
, 0);
6944 static struct bio
*btrfs_dio_bio_alloc(struct block_device
*bdev
,
6945 u64 first_sector
, gfp_t gfp_flags
)
6947 int nr_vecs
= bio_get_nr_vecs(bdev
);
6948 return btrfs_bio_alloc(bdev
, first_sector
, nr_vecs
, gfp_flags
);
6951 static inline int __btrfs_submit_dio_bio(struct bio
*bio
, struct inode
*inode
,
6952 int rw
, u64 file_offset
, int skip_sum
,
6955 struct btrfs_dio_private
*dip
= bio
->bi_private
;
6956 int write
= rw
& REQ_WRITE
;
6957 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
6961 async_submit
= !atomic_read(&BTRFS_I(inode
)->sync_writers
);
6966 ret
= btrfs_bio_wq_end_io(root
->fs_info
, bio
, 0);
6974 if (write
&& async_submit
) {
6975 ret
= btrfs_wq_submit_bio(root
->fs_info
,
6976 inode
, rw
, bio
, 0, 0,
6978 __btrfs_submit_bio_start_direct_io
,
6979 __btrfs_submit_bio_done
);
6983 * If we aren't doing async submit, calculate the csum of the
6986 ret
= btrfs_csum_one_bio(root
, inode
, bio
, file_offset
, 1);
6989 } else if (!skip_sum
) {
6990 ret
= btrfs_lookup_bio_sums_dio(root
, inode
, dip
, bio
,
6997 ret
= btrfs_map_bio(root
, rw
, bio
, 0, async_submit
);
7003 static int btrfs_submit_direct_hook(int rw
, struct btrfs_dio_private
*dip
,
7006 struct inode
*inode
= dip
->inode
;
7007 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
7009 struct bio
*orig_bio
= dip
->orig_bio
;
7010 struct bio_vec
*bvec
= orig_bio
->bi_io_vec
;
7011 u64 start_sector
= orig_bio
->bi_sector
;
7012 u64 file_offset
= dip
->logical_offset
;
7017 int async_submit
= 0;
7019 map_length
= orig_bio
->bi_size
;
7020 ret
= btrfs_map_block(root
->fs_info
, rw
, start_sector
<< 9,
7021 &map_length
, NULL
, 0);
7027 if (map_length
>= orig_bio
->bi_size
) {
7032 /* async crcs make it difficult to collect full stripe writes. */
7033 if (btrfs_get_alloc_profile(root
, 1) &
7034 (BTRFS_BLOCK_GROUP_RAID5
| BTRFS_BLOCK_GROUP_RAID6
))
7039 bio
= btrfs_dio_bio_alloc(orig_bio
->bi_bdev
, start_sector
, GFP_NOFS
);
7042 bio
->bi_private
= dip
;
7043 bio
->bi_end_io
= btrfs_end_dio_bio
;
7044 atomic_inc(&dip
->pending_bios
);
7046 while (bvec
<= (orig_bio
->bi_io_vec
+ orig_bio
->bi_vcnt
- 1)) {
7047 if (unlikely(map_length
< submit_len
+ bvec
->bv_len
||
7048 bio_add_page(bio
, bvec
->bv_page
, bvec
->bv_len
,
7049 bvec
->bv_offset
) < bvec
->bv_len
)) {
7051 * inc the count before we submit the bio so
7052 * we know the end IO handler won't happen before
7053 * we inc the count. Otherwise, the dip might get freed
7054 * before we're done setting it up
7056 atomic_inc(&dip
->pending_bios
);
7057 ret
= __btrfs_submit_dio_bio(bio
, inode
, rw
,
7058 file_offset
, skip_sum
,
7062 atomic_dec(&dip
->pending_bios
);
7066 start_sector
+= submit_len
>> 9;
7067 file_offset
+= submit_len
;
7072 bio
= btrfs_dio_bio_alloc(orig_bio
->bi_bdev
,
7073 start_sector
, GFP_NOFS
);
7076 bio
->bi_private
= dip
;
7077 bio
->bi_end_io
= btrfs_end_dio_bio
;
7079 map_length
= orig_bio
->bi_size
;
7080 ret
= btrfs_map_block(root
->fs_info
, rw
,
7082 &map_length
, NULL
, 0);
7088 submit_len
+= bvec
->bv_len
;
7095 ret
= __btrfs_submit_dio_bio(bio
, inode
, rw
, file_offset
, skip_sum
,
7104 * before atomic variable goto zero, we must
7105 * make sure dip->errors is perceived to be set.
7107 smp_mb__before_atomic_dec();
7108 if (atomic_dec_and_test(&dip
->pending_bios
))
7109 bio_io_error(dip
->orig_bio
);
7111 /* bio_end_io() will handle error, so we needn't return it */
7115 static void btrfs_submit_direct(int rw
, struct bio
*dio_bio
,
7116 struct inode
*inode
, loff_t file_offset
)
7118 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
7119 struct btrfs_dio_private
*dip
;
7123 int write
= rw
& REQ_WRITE
;
7127 skip_sum
= BTRFS_I(inode
)->flags
& BTRFS_INODE_NODATASUM
;
7129 io_bio
= btrfs_bio_clone(dio_bio
, GFP_NOFS
);
7135 if (!skip_sum
&& !write
) {
7136 csum_size
= btrfs_super_csum_size(root
->fs_info
->super_copy
);
7137 sum_len
= dio_bio
->bi_size
>> inode
->i_sb
->s_blocksize_bits
;
7138 sum_len
*= csum_size
;
7143 dip
= kmalloc(sizeof(*dip
) + sum_len
, GFP_NOFS
);
7149 dip
->private = dio_bio
->bi_private
;
7151 dip
->logical_offset
= file_offset
;
7152 dip
->bytes
= dio_bio
->bi_size
;
7153 dip
->disk_bytenr
= (u64
)dio_bio
->bi_sector
<< 9;
7154 io_bio
->bi_private
= dip
;
7156 dip
->orig_bio
= io_bio
;
7157 dip
->dio_bio
= dio_bio
;
7158 atomic_set(&dip
->pending_bios
, 0);
7161 io_bio
->bi_end_io
= btrfs_endio_direct_write
;
7163 io_bio
->bi_end_io
= btrfs_endio_direct_read
;
7165 ret
= btrfs_submit_direct_hook(rw
, dip
, skip_sum
);
7174 * If this is a write, we need to clean up the reserved space and kill
7175 * the ordered extent.
7178 struct btrfs_ordered_extent
*ordered
;
7179 ordered
= btrfs_lookup_ordered_extent(inode
, file_offset
);
7180 if (!test_bit(BTRFS_ORDERED_PREALLOC
, &ordered
->flags
) &&
7181 !test_bit(BTRFS_ORDERED_NOCOW
, &ordered
->flags
))
7182 btrfs_free_reserved_extent(root
, ordered
->start
,
7184 btrfs_put_ordered_extent(ordered
);
7185 btrfs_put_ordered_extent(ordered
);
7187 bio_endio(dio_bio
, ret
);
7190 static ssize_t
check_direct_IO(struct btrfs_root
*root
, int rw
, struct kiocb
*iocb
,
7191 const struct iovec
*iov
, loff_t offset
,
7192 unsigned long nr_segs
)
7198 unsigned blocksize_mask
= root
->sectorsize
- 1;
7199 ssize_t retval
= -EINVAL
;
7200 loff_t end
= offset
;
7202 if (offset
& blocksize_mask
)
7205 /* Check the memory alignment. Blocks cannot straddle pages */
7206 for (seg
= 0; seg
< nr_segs
; seg
++) {
7207 addr
= (unsigned long)iov
[seg
].iov_base
;
7208 size
= iov
[seg
].iov_len
;
7210 if ((addr
& blocksize_mask
) || (size
& blocksize_mask
))
7213 /* If this is a write we don't need to check anymore */
7218 * Check to make sure we don't have duplicate iov_base's in this
7219 * iovec, if so return EINVAL, otherwise we'll get csum errors
7220 * when reading back.
7222 for (i
= seg
+ 1; i
< nr_segs
; i
++) {
7223 if (iov
[seg
].iov_base
== iov
[i
].iov_base
)
7232 static ssize_t
btrfs_direct_IO(int rw
, struct kiocb
*iocb
,
7233 const struct iovec
*iov
, loff_t offset
,
7234 unsigned long nr_segs
)
7236 struct file
*file
= iocb
->ki_filp
;
7237 struct inode
*inode
= file
->f_mapping
->host
;
7241 bool relock
= false;
7244 if (check_direct_IO(BTRFS_I(inode
)->root
, rw
, iocb
, iov
,
7248 atomic_inc(&inode
->i_dio_count
);
7249 smp_mb__after_atomic_inc();
7252 * The generic stuff only does filemap_write_and_wait_range, which isn't
7253 * enough if we've written compressed pages to this area, so we need to
7254 * call btrfs_wait_ordered_range to make absolutely sure that any
7255 * outstanding dirty pages are on disk.
7257 count
= iov_length(iov
, nr_segs
);
7258 btrfs_wait_ordered_range(inode
, offset
, count
);
7262 * If the write DIO is beyond the EOF, we need update
7263 * the isize, but it is protected by i_mutex. So we can
7264 * not unlock the i_mutex at this case.
7266 if (offset
+ count
<= inode
->i_size
) {
7267 mutex_unlock(&inode
->i_mutex
);
7270 ret
= btrfs_delalloc_reserve_space(inode
, count
);
7273 } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK
,
7274 &BTRFS_I(inode
)->runtime_flags
))) {
7275 inode_dio_done(inode
);
7276 flags
= DIO_LOCKING
| DIO_SKIP_HOLES
;
7280 ret
= __blockdev_direct_IO(rw
, iocb
, inode
,
7281 BTRFS_I(inode
)->root
->fs_info
->fs_devices
->latest_bdev
,
7282 iov
, offset
, nr_segs
, btrfs_get_blocks_direct
, NULL
,
7283 btrfs_submit_direct
, flags
);
7285 if (ret
< 0 && ret
!= -EIOCBQUEUED
)
7286 btrfs_delalloc_release_space(inode
, count
);
7287 else if (ret
>= 0 && (size_t)ret
< count
)
7288 btrfs_delalloc_release_space(inode
,
7289 count
- (size_t)ret
);
7291 btrfs_delalloc_release_metadata(inode
, 0);
7295 inode_dio_done(inode
);
7297 mutex_lock(&inode
->i_mutex
);
7302 #define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
7304 static int btrfs_fiemap(struct inode
*inode
, struct fiemap_extent_info
*fieinfo
,
7305 __u64 start
, __u64 len
)
7309 ret
= fiemap_check_flags(fieinfo
, BTRFS_FIEMAP_FLAGS
);
7313 return extent_fiemap(inode
, fieinfo
, start
, len
, btrfs_get_extent_fiemap
);
7316 int btrfs_readpage(struct file
*file
, struct page
*page
)
7318 struct extent_io_tree
*tree
;
7319 tree
= &BTRFS_I(page
->mapping
->host
)->io_tree
;
7320 return extent_read_full_page(tree
, page
, btrfs_get_extent
, 0);
7323 static int btrfs_writepage(struct page
*page
, struct writeback_control
*wbc
)
7325 struct extent_io_tree
*tree
;
7328 if (current
->flags
& PF_MEMALLOC
) {
7329 redirty_page_for_writepage(wbc
, page
);
7333 tree
= &BTRFS_I(page
->mapping
->host
)->io_tree
;
7334 return extent_write_full_page(tree
, page
, btrfs_get_extent
, wbc
);
7337 static int btrfs_writepages(struct address_space
*mapping
,
7338 struct writeback_control
*wbc
)
7340 struct extent_io_tree
*tree
;
7342 tree
= &BTRFS_I(mapping
->host
)->io_tree
;
7343 return extent_writepages(tree
, mapping
, btrfs_get_extent
, wbc
);
7347 btrfs_readpages(struct file
*file
, struct address_space
*mapping
,
7348 struct list_head
*pages
, unsigned nr_pages
)
7350 struct extent_io_tree
*tree
;
7351 tree
= &BTRFS_I(mapping
->host
)->io_tree
;
7352 return extent_readpages(tree
, mapping
, pages
, nr_pages
,
7355 static int __btrfs_releasepage(struct page
*page
, gfp_t gfp_flags
)
7357 struct extent_io_tree
*tree
;
7358 struct extent_map_tree
*map
;
7361 tree
= &BTRFS_I(page
->mapping
->host
)->io_tree
;
7362 map
= &BTRFS_I(page
->mapping
->host
)->extent_tree
;
7363 ret
= try_release_extent_mapping(map
, tree
, page
, gfp_flags
);
7365 ClearPagePrivate(page
);
7366 set_page_private(page
, 0);
7367 page_cache_release(page
);
7372 static int btrfs_releasepage(struct page
*page
, gfp_t gfp_flags
)
7374 if (PageWriteback(page
) || PageDirty(page
))
7376 return __btrfs_releasepage(page
, gfp_flags
& GFP_NOFS
);
7379 static void btrfs_invalidatepage(struct page
*page
, unsigned int offset
,
7380 unsigned int length
)
7382 struct inode
*inode
= page
->mapping
->host
;
7383 struct extent_io_tree
*tree
;
7384 struct btrfs_ordered_extent
*ordered
;
7385 struct extent_state
*cached_state
= NULL
;
7386 u64 page_start
= page_offset(page
);
7387 u64 page_end
= page_start
+ PAGE_CACHE_SIZE
- 1;
7390 * we have the page locked, so new writeback can't start,
7391 * and the dirty bit won't be cleared while we are here.
7393 * Wait for IO on this page so that we can safely clear
7394 * the PagePrivate2 bit and do ordered accounting
7396 wait_on_page_writeback(page
);
7398 tree
= &BTRFS_I(inode
)->io_tree
;
7400 btrfs_releasepage(page
, GFP_NOFS
);
7403 lock_extent_bits(tree
, page_start
, page_end
, 0, &cached_state
);
7404 ordered
= btrfs_lookup_ordered_extent(inode
, page_offset(page
));
7407 * IO on this page will never be started, so we need
7408 * to account for any ordered extents now
7410 clear_extent_bit(tree
, page_start
, page_end
,
7411 EXTENT_DIRTY
| EXTENT_DELALLOC
|
7412 EXTENT_LOCKED
| EXTENT_DO_ACCOUNTING
|
7413 EXTENT_DEFRAG
, 1, 0, &cached_state
, GFP_NOFS
);
7415 * whoever cleared the private bit is responsible
7416 * for the finish_ordered_io
7418 if (TestClearPagePrivate2(page
) &&
7419 btrfs_dec_test_ordered_pending(inode
, &ordered
, page_start
,
7420 PAGE_CACHE_SIZE
, 1)) {
7421 btrfs_finish_ordered_io(ordered
);
7423 btrfs_put_ordered_extent(ordered
);
7424 cached_state
= NULL
;
7425 lock_extent_bits(tree
, page_start
, page_end
, 0, &cached_state
);
7427 clear_extent_bit(tree
, page_start
, page_end
,
7428 EXTENT_LOCKED
| EXTENT_DIRTY
| EXTENT_DELALLOC
|
7429 EXTENT_DO_ACCOUNTING
| EXTENT_DEFRAG
, 1, 1,
7430 &cached_state
, GFP_NOFS
);
7431 __btrfs_releasepage(page
, GFP_NOFS
);
7433 ClearPageChecked(page
);
7434 if (PagePrivate(page
)) {
7435 ClearPagePrivate(page
);
7436 set_page_private(page
, 0);
7437 page_cache_release(page
);
7442 * btrfs_page_mkwrite() is not allowed to change the file size as it gets
7443 * called from a page fault handler when a page is first dirtied. Hence we must
7444 * be careful to check for EOF conditions here. We set the page up correctly
7445 * for a written page which means we get ENOSPC checking when writing into
7446 * holes and correct delalloc and unwritten extent mapping on filesystems that
7447 * support these features.
7449 * We are not allowed to take the i_mutex here so we have to play games to
7450 * protect against truncate races as the page could now be beyond EOF. Because
7451 * vmtruncate() writes the inode size before removing pages, once we have the
7452 * page lock we can determine safely if the page is beyond EOF. If it is not
7453 * beyond EOF, then the page is guaranteed safe against truncation until we
7456 int btrfs_page_mkwrite(struct vm_area_struct
*vma
, struct vm_fault
*vmf
)
7458 struct page
*page
= vmf
->page
;
7459 struct inode
*inode
= file_inode(vma
->vm_file
);
7460 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
7461 struct extent_io_tree
*io_tree
= &BTRFS_I(inode
)->io_tree
;
7462 struct btrfs_ordered_extent
*ordered
;
7463 struct extent_state
*cached_state
= NULL
;
7465 unsigned long zero_start
;
7472 sb_start_pagefault(inode
->i_sb
);
7473 ret
= btrfs_delalloc_reserve_space(inode
, PAGE_CACHE_SIZE
);
7475 ret
= file_update_time(vma
->vm_file
);
7481 else /* -ENOSPC, -EIO, etc */
7482 ret
= VM_FAULT_SIGBUS
;
7488 ret
= VM_FAULT_NOPAGE
; /* make the VM retry the fault */
7491 size
= i_size_read(inode
);
7492 page_start
= page_offset(page
);
7493 page_end
= page_start
+ PAGE_CACHE_SIZE
- 1;
7495 if ((page
->mapping
!= inode
->i_mapping
) ||
7496 (page_start
>= size
)) {
7497 /* page got truncated out from underneath us */
7500 wait_on_page_writeback(page
);
7502 lock_extent_bits(io_tree
, page_start
, page_end
, 0, &cached_state
);
7503 set_page_extent_mapped(page
);
7506 * we can't set the delalloc bits if there are pending ordered
7507 * extents. Drop our locks and wait for them to finish
7509 ordered
= btrfs_lookup_ordered_extent(inode
, page_start
);
7511 unlock_extent_cached(io_tree
, page_start
, page_end
,
7512 &cached_state
, GFP_NOFS
);
7514 btrfs_start_ordered_extent(inode
, ordered
, 1);
7515 btrfs_put_ordered_extent(ordered
);
7520 * XXX - page_mkwrite gets called every time the page is dirtied, even
7521 * if it was already dirty, so for space accounting reasons we need to
7522 * clear any delalloc bits for the range we are fixing to save. There
7523 * is probably a better way to do this, but for now keep consistent with
7524 * prepare_pages in the normal write path.
7526 clear_extent_bit(&BTRFS_I(inode
)->io_tree
, page_start
, page_end
,
7527 EXTENT_DIRTY
| EXTENT_DELALLOC
|
7528 EXTENT_DO_ACCOUNTING
| EXTENT_DEFRAG
,
7529 0, 0, &cached_state
, GFP_NOFS
);
7531 ret
= btrfs_set_extent_delalloc(inode
, page_start
, page_end
,
7534 unlock_extent_cached(io_tree
, page_start
, page_end
,
7535 &cached_state
, GFP_NOFS
);
7536 ret
= VM_FAULT_SIGBUS
;
7541 /* page is wholly or partially inside EOF */
7542 if (page_start
+ PAGE_CACHE_SIZE
> size
)
7543 zero_start
= size
& ~PAGE_CACHE_MASK
;
7545 zero_start
= PAGE_CACHE_SIZE
;
7547 if (zero_start
!= PAGE_CACHE_SIZE
) {
7549 memset(kaddr
+ zero_start
, 0, PAGE_CACHE_SIZE
- zero_start
);
7550 flush_dcache_page(page
);
7553 ClearPageChecked(page
);
7554 set_page_dirty(page
);
7555 SetPageUptodate(page
);
7557 BTRFS_I(inode
)->last_trans
= root
->fs_info
->generation
;
7558 BTRFS_I(inode
)->last_sub_trans
= BTRFS_I(inode
)->root
->log_transid
;
7559 BTRFS_I(inode
)->last_log_commit
= BTRFS_I(inode
)->root
->last_log_commit
;
7561 unlock_extent_cached(io_tree
, page_start
, page_end
, &cached_state
, GFP_NOFS
);
7565 sb_end_pagefault(inode
->i_sb
);
7566 return VM_FAULT_LOCKED
;
7570 btrfs_delalloc_release_space(inode
, PAGE_CACHE_SIZE
);
7572 sb_end_pagefault(inode
->i_sb
);
7576 static int btrfs_truncate(struct inode
*inode
)
7578 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
7579 struct btrfs_block_rsv
*rsv
;
7582 struct btrfs_trans_handle
*trans
;
7583 u64 mask
= root
->sectorsize
- 1;
7584 u64 min_size
= btrfs_calc_trunc_metadata_size(root
, 1);
7586 btrfs_wait_ordered_range(inode
, inode
->i_size
& (~mask
), (u64
)-1);
7587 btrfs_ordered_update_i_size(inode
, inode
->i_size
, NULL
);
7590 * Yes ladies and gentelment, this is indeed ugly. The fact is we have
7591 * 3 things going on here
7593 * 1) We need to reserve space for our orphan item and the space to
7594 * delete our orphan item. Lord knows we don't want to have a dangling
7595 * orphan item because we didn't reserve space to remove it.
7597 * 2) We need to reserve space to update our inode.
7599 * 3) We need to have something to cache all the space that is going to
7600 * be free'd up by the truncate operation, but also have some slack
7601 * space reserved in case it uses space during the truncate (thank you
7602 * very much snapshotting).
7604 * And we need these to all be seperate. The fact is we can use alot of
7605 * space doing the truncate, and we have no earthly idea how much space
7606 * we will use, so we need the truncate reservation to be seperate so it
7607 * doesn't end up using space reserved for updating the inode or
7608 * removing the orphan item. We also need to be able to stop the
7609 * transaction and start a new one, which means we need to be able to
7610 * update the inode several times, and we have no idea of knowing how
7611 * many times that will be, so we can't just reserve 1 item for the
7612 * entirety of the opration, so that has to be done seperately as well.
7613 * Then there is the orphan item, which does indeed need to be held on
7614 * to for the whole operation, and we need nobody to touch this reserved
7615 * space except the orphan code.
7617 * So that leaves us with
7619 * 1) root->orphan_block_rsv - for the orphan deletion.
7620 * 2) rsv - for the truncate reservation, which we will steal from the
7621 * transaction reservation.
7622 * 3) fs_info->trans_block_rsv - this will have 1 items worth left for
7623 * updating the inode.
7625 rsv
= btrfs_alloc_block_rsv(root
, BTRFS_BLOCK_RSV_TEMP
);
7628 rsv
->size
= min_size
;
7632 * 1 for the truncate slack space
7633 * 1 for updating the inode.
7635 trans
= btrfs_start_transaction(root
, 2);
7636 if (IS_ERR(trans
)) {
7637 err
= PTR_ERR(trans
);
7641 /* Migrate the slack space for the truncate to our reserve */
7642 ret
= btrfs_block_rsv_migrate(&root
->fs_info
->trans_block_rsv
, rsv
,
7647 * setattr is responsible for setting the ordered_data_close flag,
7648 * but that is only tested during the last file release. That
7649 * could happen well after the next commit, leaving a great big
7650 * window where new writes may get lost if someone chooses to write
7651 * to this file after truncating to zero
7653 * The inode doesn't have any dirty data here, and so if we commit
7654 * this is a noop. If someone immediately starts writing to the inode
7655 * it is very likely we'll catch some of their writes in this
7656 * transaction, and the commit will find this file on the ordered
7657 * data list with good things to send down.
7659 * This is a best effort solution, there is still a window where
7660 * using truncate to replace the contents of the file will
7661 * end up with a zero length file after a crash.
7663 if (inode
->i_size
== 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE
,
7664 &BTRFS_I(inode
)->runtime_flags
))
7665 btrfs_add_ordered_operation(trans
, root
, inode
);
7668 * So if we truncate and then write and fsync we normally would just
7669 * write the extents that changed, which is a problem if we need to
7670 * first truncate that entire inode. So set this flag so we write out
7671 * all of the extents in the inode to the sync log so we're completely
7674 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC
, &BTRFS_I(inode
)->runtime_flags
);
7675 trans
->block_rsv
= rsv
;
7678 ret
= btrfs_truncate_inode_items(trans
, root
, inode
,
7680 BTRFS_EXTENT_DATA_KEY
);
7681 if (ret
!= -ENOSPC
) {
7686 trans
->block_rsv
= &root
->fs_info
->trans_block_rsv
;
7687 ret
= btrfs_update_inode(trans
, root
, inode
);
7693 btrfs_end_transaction(trans
, root
);
7694 btrfs_btree_balance_dirty(root
);
7696 trans
= btrfs_start_transaction(root
, 2);
7697 if (IS_ERR(trans
)) {
7698 ret
= err
= PTR_ERR(trans
);
7703 ret
= btrfs_block_rsv_migrate(&root
->fs_info
->trans_block_rsv
,
7705 BUG_ON(ret
); /* shouldn't happen */
7706 trans
->block_rsv
= rsv
;
7709 if (ret
== 0 && inode
->i_nlink
> 0) {
7710 trans
->block_rsv
= root
->orphan_block_rsv
;
7711 ret
= btrfs_orphan_del(trans
, inode
);
7717 trans
->block_rsv
= &root
->fs_info
->trans_block_rsv
;
7718 ret
= btrfs_update_inode(trans
, root
, inode
);
7722 ret
= btrfs_end_transaction(trans
, root
);
7723 btrfs_btree_balance_dirty(root
);
7727 btrfs_free_block_rsv(root
, rsv
);
/*
 * btrfs_create_subvol_root - build the root directory inode of a new
 * subvolume (helper for the snapshot/subvolume-create ioctl).
 *
 * Creates a directory inode named ".." owned by @new_root at @new_dirid,
 * wires up the btrfs dir inode/file operations, forces nlink to 1 and
 * size to 0, then persists it with btrfs_update_inode().
 *
 * NOTE(review): this chunk is a lossy extraction -- the embedded numbers
 * are upstream line numbers and some lines (braces, error handling,
 * return) are missing from view.  Code text left byte-identical.
 */
7736 * create a new subvolume directory/inode (helper for the ioctl).
7738 int btrfs_create_subvol_root(struct btrfs_trans_handle
*trans
,
7739 struct btrfs_root
*new_root
, u64 new_dirid
)
7741 struct inode
*inode
;
/* mode: directory with permissions derived from the caller's umask */
7745 inode
= btrfs_new_inode(trans
, new_root
, NULL
, "..", 2,
7746 new_dirid
, new_dirid
,
7747 S_IFDIR
| (~current_umask() & S_IRWXUGO
),
7750 return PTR_ERR(inode
);
7751 inode
->i_op
= &btrfs_dir_inode_operations
;
7752 inode
->i_fop
= &btrfs_dir_file_operations
;
7754 set_nlink(inode
, 1);
7755 btrfs_i_size_write(inode
, 0);
7757 err
= btrfs_update_inode(trans
, new_root
, inode
);
/*
 * btrfs_alloc_inode - super_operations->alloc_inode hook.
 *
 * Allocates a struct btrfs_inode from the btrfs_inode_cachep slab
 * (GFP_NOFS, since this can run under filesystem reclaim) and
 * initializes every btrfs-specific field: counters, locks, extent
 * map/io trees, ordered tree and list heads.  Returns the embedded
 * &ei->vfs_inode (return statement not visible in this extraction).
 *
 * NOTE(review): lossy extraction -- embedded numbers are upstream line
 * numbers; NULL-check of the allocation and the return are elided.
 */
7763 struct inode
*btrfs_alloc_inode(struct super_block
*sb
)
7765 struct btrfs_inode
*ei
;
7766 struct inode
*inode
;
7768 ei
= kmem_cache_alloc(btrfs_inode_cachep
, GFP_NOFS
);
/* zero out per-inode transaction / accounting state */
7775 ei
->last_sub_trans
= 0;
7776 ei
->logged_trans
= 0;
7777 ei
->delalloc_bytes
= 0;
7778 ei
->disk_i_size
= 0;
/* -1 means "directory index counter not loaded yet" -- TODO confirm */
7781 ei
->index_cnt
= (u64
)-1;
7782 ei
->last_unlink_trans
= 0;
7783 ei
->last_log_commit
= 0;
7785 spin_lock_init(&ei
->lock
);
7786 ei
->outstanding_extents
= 0;
7787 ei
->reserved_extents
= 0;
7789 ei
->runtime_flags
= 0;
7790 ei
->force_compress
= BTRFS_COMPRESS_NONE
;
7792 ei
->delayed_node
= NULL
;
/* the VFS inode is embedded inside struct btrfs_inode */
7794 inode
= &ei
->vfs_inode
;
7795 extent_map_tree_init(&ei
->extent_tree
);
7796 extent_io_tree_init(&ei
->io_tree
, &inode
->i_data
);
7797 extent_io_tree_init(&ei
->io_failure_tree
, &inode
->i_data
);
7798 ei
->io_tree
.track_uptodate
= 1;
7799 ei
->io_failure_tree
.track_uptodate
= 1;
7800 atomic_set(&ei
->sync_writers
, 0);
7801 mutex_init(&ei
->log_mutex
);
7802 mutex_init(&ei
->delalloc_mutex
);
7803 btrfs_ordered_inode_tree_init(&ei
->ordered_tree
);
7804 INIT_LIST_HEAD(&ei
->delalloc_inodes
);
7805 INIT_LIST_HEAD(&ei
->ordered_operations
);
7806 RB_CLEAR_NODE(&ei
->rb_node
);
/*
 * btrfs_i_callback - RCU callback that actually frees the inode.
 *
 * Scheduled via call_rcu() from btrfs_destroy_inode(); recovers the
 * containing struct btrfs_inode from the inode's i_rcu head and
 * returns it to the slab cache after a grace period.
 */
7811 static void btrfs_i_callback(struct rcu_head
*head
)
7813 struct inode
*inode
= container_of(head
, struct inode
, i_rcu
);
7814 kmem_cache_free(btrfs_inode_cachep
, BTRFS_I(inode
));
/*
 * btrfs_destroy_inode - super_operations->destroy_inode hook.
 *
 * Sanity-checks (WARN_ON) that all per-inode accounting has drained,
 * detaches the inode from the ordered-operations list, complains if an
 * orphan item is still attached, reaps any leftover ordered extents,
 * removes the inode from the per-root inode tree, drops cached extent
 * mappings and finally defers the actual free to RCU via
 * btrfs_i_callback.
 *
 * NOTE(review): lossy extraction -- embedded numbers are upstream line
 * numbers; several comment/loop/brace lines are missing.  Code text
 * left byte-identical.
 */
7817 void btrfs_destroy_inode(struct inode
*inode
)
7819 struct btrfs_ordered_extent
*ordered
;
7820 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
/* all reservations and dirty state must be gone by now */
7822 WARN_ON(!hlist_empty(&inode
->i_dentry
));
7823 WARN_ON(inode
->i_data
.nrpages
);
7824 WARN_ON(BTRFS_I(inode
)->outstanding_extents
);
7825 WARN_ON(BTRFS_I(inode
)->reserved_extents
);
7826 WARN_ON(BTRFS_I(inode
)->delalloc_bytes
);
7827 WARN_ON(BTRFS_I(inode
)->csum_bytes
);
7830 * This can happen where we create an inode, but somebody else also
7831 * created the same inode and we need to destroy the one we already
7838 * Make sure we're properly removed from the ordered operation
7842 if (!list_empty(&BTRFS_I(inode
)->ordered_operations
)) {
7843 spin_lock(&root
->fs_info
->ordered_root_lock
);
7844 list_del_init(&BTRFS_I(inode
)->ordered_operations
);
7845 spin_unlock(&root
->fs_info
->ordered_root_lock
);
/* an orphan item surviving to destroy time indicates a leak upstream */
7848 if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM
,
7849 &BTRFS_I(inode
)->runtime_flags
)) {
7850 btrfs_info(root
->fs_info
, "inode %llu still on the orphan list",
7851 (unsigned long long)btrfs_ino(inode
));
7852 atomic_dec(&root
->orphan_inodes
);
7856 ordered
= btrfs_lookup_first_ordered_extent(inode
, (u64
)-1);
7860 btrfs_err(root
->fs_info
, "found ordered extent %llu %llu on inode cleanup",
7861 (unsigned long long)ordered
->file_offset
,
7862 (unsigned long long)ordered
->len
);
7863 btrfs_remove_ordered_extent(inode
, ordered
);
/* dropped twice: once for the lookup ref, once for the tree's ref */
7864 btrfs_put_ordered_extent(ordered
);
7865 btrfs_put_ordered_extent(ordered
);
7868 inode_tree_del(inode
);
7869 btrfs_drop_extent_cache(inode
, 0, (u64
)-1, 0);
/* actual kmem_cache_free happens after an RCU grace period */
7871 call_rcu(&inode
->i_rcu
, btrfs_i_callback
);
/*
 * btrfs_drop_inode - super_operations->drop_inode hook.
 *
 * When the owning snapshot/subvolume is being deleted (root refcount
 * hit zero and it is not the tree root) the inode must be evicted
 * immediately rather than cached; otherwise fall back to
 * generic_drop_inode().
 *
 * NOTE(review): lossy extraction -- the early "return 1" branch bodies
 * are not visible here.
 */
7874 int btrfs_drop_inode(struct inode
*inode
)
7876 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
7881 /* the snap/subvol tree is on deleting */
7882 if (btrfs_root_refs(&root
->root_item
) == 0 &&
7883 root
!= root
->fs_info
->tree_root
)
7886 return generic_drop_inode(inode
);
/*
 * init_once - slab constructor for btrfs_inode_cachep.
 *
 * Runs once per slab object (not per allocation); only initializes the
 * embedded VFS inode, since per-allocation state is reset in
 * btrfs_alloc_inode().
 */
7889 static void init_once(void *foo
)
7891 struct btrfs_inode
*ei
= (struct btrfs_inode
*) foo
;
7893 inode_init_once(&ei
->vfs_inode
);
/*
 * btrfs_destroy_cachep - tear down all btrfs slab caches at module
 * unload (or on btrfs_init_cachep() failure).
 *
 * Each cache pointer is checked before destroy so this is safe to call
 * on a partially-initialized set of caches.
 *
 * NOTE(review): the comment below is truncated by the extraction; the
 * elided part presumably calls rcu_barrier() to flush delayed RCU
 * inode frees before destroying btrfs_inode_cachep -- confirm upstream.
 */
7896 void btrfs_destroy_cachep(void)
7899 * Make sure all delayed rcu free inodes are flushed before we
7903 if (btrfs_inode_cachep
)
7904 kmem_cache_destroy(btrfs_inode_cachep
);
7905 if (btrfs_trans_handle_cachep
)
7906 kmem_cache_destroy(btrfs_trans_handle_cachep
);
7907 if (btrfs_transaction_cachep
)
7908 kmem_cache_destroy(btrfs_transaction_cachep
);
7909 if (btrfs_path_cachep
)
7910 kmem_cache_destroy(btrfs_path_cachep
);
7911 if (btrfs_free_space_cachep
)
7912 kmem_cache_destroy(btrfs_free_space_cachep
);
7913 if (btrfs_delalloc_work_cachep
)
7914 kmem_cache_destroy(btrfs_delalloc_work_cachep
);
/*
 * btrfs_init_cachep - create all btrfs slab caches at module init.
 *
 * Only the inode cache gets the init_once constructor (its embedded
 * VFS inode must be initialized exactly once per slab object).  On any
 * allocation failure the visible tail calls btrfs_destroy_cachep() to
 * undo the caches created so far (error-return lines are elided by the
 * extraction).
 *
 * NOTE(review): lossy extraction -- embedded numbers are upstream line
 * numbers; goto/return lines are missing.  Code text left byte-identical.
 */
7917 int btrfs_init_cachep(void)
7919 btrfs_inode_cachep
= kmem_cache_create("btrfs_inode",
7920 sizeof(struct btrfs_inode
), 0,
7921 SLAB_RECLAIM_ACCOUNT
| SLAB_MEM_SPREAD
, init_once
);
7922 if (!btrfs_inode_cachep
)
7925 btrfs_trans_handle_cachep
= kmem_cache_create("btrfs_trans_handle",
7926 sizeof(struct btrfs_trans_handle
), 0,
7927 SLAB_RECLAIM_ACCOUNT
| SLAB_MEM_SPREAD
, NULL
);
7928 if (!btrfs_trans_handle_cachep
)
7931 btrfs_transaction_cachep
= kmem_cache_create("btrfs_transaction",
7932 sizeof(struct btrfs_transaction
), 0,
7933 SLAB_RECLAIM_ACCOUNT
| SLAB_MEM_SPREAD
, NULL
);
7934 if (!btrfs_transaction_cachep
)
7937 btrfs_path_cachep
= kmem_cache_create("btrfs_path",
7938 sizeof(struct btrfs_path
), 0,
7939 SLAB_RECLAIM_ACCOUNT
| SLAB_MEM_SPREAD
, NULL
);
7940 if (!btrfs_path_cachep
)
7943 btrfs_free_space_cachep
= kmem_cache_create("btrfs_free_space",
7944 sizeof(struct btrfs_free_space
), 0,
7945 SLAB_RECLAIM_ACCOUNT
| SLAB_MEM_SPREAD
, NULL
);
7946 if (!btrfs_free_space_cachep
)
7949 btrfs_delalloc_work_cachep
= kmem_cache_create("btrfs_delalloc_work",
7950 sizeof(struct btrfs_delalloc_work
), 0,
7951 SLAB_RECLAIM_ACCOUNT
| SLAB_MEM_SPREAD
,
7953 if (!btrfs_delalloc_work_cachep
)
/* unwind whatever was created before the failure */
7958 btrfs_destroy_cachep();
/*
 * btrfs_getattr - inode_operations->getattr hook.
 *
 * Fills @stat via generic_fillattr(), then overrides:
 *  - st_dev with the per-subvolume anonymous device number, so each
 *    subvolume appears as its own device to userspace;
 *  - st_blksize with the page cache size;
 *  - st_blocks to include not-yet-allocated delalloc bytes (sampled
 *    under the per-inode spinlock), in 512-byte units (>> 9).
 *
 * NOTE(review): lossy extraction -- the declaration of delalloc_bytes
 * and the final return are not visible.  Code text left byte-identical.
 */
7962 static int btrfs_getattr(struct vfsmount
*mnt
,
7963 struct dentry
*dentry
, struct kstat
*stat
)
7966 struct inode
*inode
= dentry
->d_inode
;
7967 u32 blocksize
= inode
->i_sb
->s_blocksize
;
7969 generic_fillattr(inode
, stat
);
7970 stat
->dev
= BTRFS_I(inode
)->root
->anon_dev
;
7971 stat
->blksize
= PAGE_CACHE_SIZE
;
7973 spin_lock(&BTRFS_I(inode
)->lock
);
7974 delalloc_bytes
= BTRFS_I(inode
)->delalloc_bytes
;
7975 spin_unlock(&BTRFS_I(inode
)->lock
);
7976 stat
->blocks
= (ALIGN(inode_get_bytes(inode
), blocksize
) +
7977 ALIGN(delalloc_bytes
, blocksize
)) >> 9;
/*
 * btrfs_rename - inode_operations->rename hook.
 *
 * Validates the rename (no renaming into the empty-subvol placeholder,
 * cross-subvolume renames only for subvolume roots, target directory
 * must be empty, name-collision check), starts IO early on a large
 * replaced file, takes subvol_sem against concurrent snapshot
 * create/destroy when a subvolume root is being moved, then inside an
 * 11-item transaction: bumps iversions/timestamps, unlinks the old
 * name (subvol or regular path), unlinks/orphans the replaced target
 * if any, adds the new link, and logs the new name for fsync.
 *
 * NOTE(review): lossy extraction -- embedded numbers are upstream line
 * numbers; many error-path/goto/return lines are missing.  Code text
 * left byte-identical; comments only.
 */
7981 static int btrfs_rename(struct inode
*old_dir
, struct dentry
*old_dentry
,
7982 struct inode
*new_dir
, struct dentry
*new_dentry
)
7984 struct btrfs_trans_handle
*trans
;
7985 struct btrfs_root
*root
= BTRFS_I(old_dir
)->root
;
7986 struct btrfs_root
*dest
= BTRFS_I(new_dir
)->root
;
7987 struct inode
*new_inode
= new_dentry
->d_inode
;
7988 struct inode
*old_inode
= old_dentry
->d_inode
;
7989 struct timespec ctime
= CURRENT_TIME
;
7993 u64 old_ino
= btrfs_ino(old_inode
);
/* refuse renames into the empty-subvolume placeholder directory */
7995 if (btrfs_ino(new_dir
) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID
)
7998 /* we only allow rename subvolume link between subvolumes */
7999 if (old_ino
!= BTRFS_FIRST_FREE_OBJECTID
&& root
!= dest
)
8002 if (old_ino
== BTRFS_EMPTY_SUBVOL_DIR_OBJECTID
||
8003 (new_inode
&& btrfs_ino(new_inode
) == BTRFS_FIRST_FREE_OBJECTID
))
/* a replaced directory must be empty */
8006 if (S_ISDIR(old_inode
->i_mode
) && new_inode
&&
8007 new_inode
->i_size
> BTRFS_EMPTY_DIR_SIZE
)
8011 /* check for collisions, even if the name isn't there */
8012 ret
= btrfs_check_dir_item_collision(root
, new_dir
->i_ino
,
8013 new_dentry
->d_name
.name
,
8014 new_dentry
->d_name
.len
);
8017 if (ret
== -EEXIST
) {
8019 * eexist without a new_inode */
8025 /* maybe -EOVERFLOW */
8032 * we're using rename to replace one file with another.
8033 * and the replacement file is large. Start IO on it now so
8034 * we don't add too much work to the end of the transaction
8036 if (new_inode
&& S_ISREG(old_inode
->i_mode
) && new_inode
->i_size
&&
8037 old_inode
->i_size
> BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT
)
8038 filemap_flush(old_inode
->i_mapping
);
8040 /* close the racy window with snapshot create/destroy ioctl */
8041 if (old_ino
== BTRFS_FIRST_FREE_OBJECTID
)
8042 down_read(&root
->fs_info
->subvol_sem
);
8044 * We want to reserve the absolute worst case amount of items. So if
8045 * both inodes are subvols and we need to unlink them then that would
8046 * require 4 item modifications, but if they are both normal inodes it
8047 * would require 5 item modifications, so we'll assume their normal
8048 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
8049 * should cover the worst case number of items we'll modify.
8051 trans
= btrfs_start_transaction(root
, 11);
8052 if (IS_ERR(trans
)) {
8053 ret
= PTR_ERR(trans
);
8058 btrfs_record_root_in_trans(trans
, dest
);
8060 ret
= btrfs_set_inode_index(new_dir
, &index
);
8064 if (unlikely(old_ino
== BTRFS_FIRST_FREE_OBJECTID
)) {
8065 /* force full log commit if subvolume involved. */
8066 root
->fs_info
->last_trans_log_full_commit
= trans
->transid
;
8068 ret
= btrfs_insert_inode_ref(trans
, dest
,
8069 new_dentry
->d_name
.name
,
8070 new_dentry
->d_name
.len
,
8072 btrfs_ino(new_dir
), index
);
8076 * this is an ugly little race, but the rename is required
8077 * to make sure that if we crash, the inode is either at the
8078 * old name or the new one. pinning the log transaction lets
8079 * us make sure we don't allow a log commit to come in after
8080 * we unlink the name but before we add the new name back in.
8082 btrfs_pin_log_trans(root
);
8085 * make sure the inode gets flushed if it is replacing
8088 if (new_inode
&& new_inode
->i_size
&& S_ISREG(old_inode
->i_mode
))
8089 btrfs_add_ordered_operation(trans
, root
, old_inode
);
8091 inode_inc_iversion(old_dir
);
8092 inode_inc_iversion(new_dir
);
8093 inode_inc_iversion(old_inode
);
8094 old_dir
->i_ctime
= old_dir
->i_mtime
= ctime
;
8095 new_dir
->i_ctime
= new_dir
->i_mtime
= ctime
;
8096 old_inode
->i_ctime
= ctime
;
8098 if (old_dentry
->d_parent
!= new_dentry
->d_parent
)
8099 btrfs_record_unlink_dir(trans
, old_dir
, old_inode
, 1);
/* drop the old name: subvolume link vs. regular inode */
8101 if (unlikely(old_ino
== BTRFS_FIRST_FREE_OBJECTID
)) {
8102 root_objectid
= BTRFS_I(old_inode
)->root
->root_key
.objectid
;
8103 ret
= btrfs_unlink_subvol(trans
, root
, old_dir
, root_objectid
,
8104 old_dentry
->d_name
.name
,
8105 old_dentry
->d_name
.len
);
8107 ret
= __btrfs_unlink_inode(trans
, root
, old_dir
,
8108 old_dentry
->d_inode
,
8109 old_dentry
->d_name
.name
,
8110 old_dentry
->d_name
.len
);
8112 ret
= btrfs_update_inode(trans
, root
, old_inode
);
8115 btrfs_abort_transaction(trans
, root
, ret
);
/* if the new name already existed, unlink the replaced inode */
8120 inode_inc_iversion(new_inode
);
8121 new_inode
->i_ctime
= CURRENT_TIME
;
8122 if (unlikely(btrfs_ino(new_inode
) ==
8123 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID
)) {
8124 root_objectid
= BTRFS_I(new_inode
)->location
.objectid
;
8125 ret
= btrfs_unlink_subvol(trans
, dest
, new_dir
,
8127 new_dentry
->d_name
.name
,
8128 new_dentry
->d_name
.len
);
8129 BUG_ON(new_inode
->i_nlink
== 0);
8131 ret
= btrfs_unlink_inode(trans
, dest
, new_dir
,
8132 new_dentry
->d_inode
,
8133 new_dentry
->d_name
.name
,
8134 new_dentry
->d_name
.len
);
8136 if (!ret
&& new_inode
->i_nlink
== 0)
8137 ret
= btrfs_orphan_add(trans
, new_dentry
->d_inode
);
8139 btrfs_abort_transaction(trans
, root
, ret
);
8144 ret
= btrfs_add_link(trans
, new_dir
, old_inode
,
8145 new_dentry
->d_name
.name
,
8146 new_dentry
->d_name
.len
, 0, index
);
8148 btrfs_abort_transaction(trans
, root
, ret
);
/* non-subvolume renames are recorded in the tree log */
8152 if (old_ino
!= BTRFS_FIRST_FREE_OBJECTID
) {
8153 struct dentry
*parent
= new_dentry
->d_parent
;
8154 btrfs_log_new_name(trans
, old_inode
, old_dir
, parent
);
8155 btrfs_end_log_trans(root
);
8158 btrfs_end_transaction(trans
, root
);
8160 if (old_ino
== BTRFS_FIRST_FREE_OBJECTID
)
8161 up_read(&root
->fs_info
->subvol_sem
);
/*
 * btrfs_run_delalloc_work - worker-thread body for one queued delalloc
 * flush (scheduled via work->work.func in btrfs_alloc_delalloc_work).
 *
 * Optionally waits for all ordered extents on the inode, flushes its
 * page cache, then drops the inode reference -- deferred via
 * btrfs_add_delayed_iput() when delay_iput is set (to avoid iput from
 * this context), otherwise a plain iput().  Finally signals the waiter
 * through the completion.
 */
8166 static void btrfs_run_delalloc_work(struct btrfs_work
*work
)
8168 struct btrfs_delalloc_work
*delalloc_work
;
8170 delalloc_work
= container_of(work
, struct btrfs_delalloc_work
,
8172 if (delalloc_work
->wait
)
8173 btrfs_wait_ordered_range(delalloc_work
->inode
, 0, (u64
)-1);
8175 filemap_flush(delalloc_work
->inode
->i_mapping
);
8177 if (delalloc_work
->delay_iput
)
8178 btrfs_add_delayed_iput(delalloc_work
->inode
);
8180 iput(delalloc_work
->inode
);
8181 complete(&delalloc_work
->completion
);
/*
 * btrfs_alloc_delalloc_work - allocate and initialize one delalloc
 * flush work item for @inode.
 *
 * Caller passes ownership of an inode reference (the work item holds
 * it until btrfs_run_delalloc_work drops it).  Returns NULL on
 * allocation failure (the NULL-check/return lines are elided by the
 * extraction).  Freed by btrfs_wait_and_free_delalloc_work().
 */
8184 struct btrfs_delalloc_work
*btrfs_alloc_delalloc_work(struct inode
*inode
,
8185 int wait
, int delay_iput
)
8187 struct btrfs_delalloc_work
*work
;
8189 work
= kmem_cache_zalloc(btrfs_delalloc_work_cachep
, GFP_NOFS
);
8193 init_completion(&work
->completion
);
8194 INIT_LIST_HEAD(&work
->list
);
8195 work
->inode
= inode
;
8197 work
->delay_iput
= delay_iput
;
8198 work
->work
.func
= btrfs_run_delalloc_work
;
/*
 * btrfs_wait_and_free_delalloc_work - block until the queued work item
 * has run (completion signalled at the end of btrfs_run_delalloc_work)
 * and return it to its slab cache.
 */
8203 void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work
*work
)
8205 wait_for_completion(&work
->completion
);
8206 kmem_cache_free(btrfs_delalloc_work_cachep
, work
);
/*
 * __start_delalloc_inodes - flush every inode with pending delalloc on
 * one root.
 *
 * Splices the root's delalloc list onto a private list under
 * delalloc_lock, then for each inode: moves it back to the root's
 * list, grabs a reference (igrab), drops the lock, queues an async
 * flush work item on the flush_workers pool, and re-takes the lock.
 * All queued work items are then waited on and freed.  On the error
 * path any leftover spliced entries are returned to the root's list.
 *
 * NOTE(review): lossy extraction -- embedded numbers are upstream line
 * numbers; goto labels, error returns and some braces are missing.
 * Code text left byte-identical.
 */
8210 * some fairly slow code that needs optimization. This walks the list
8211 * of all the inodes with pending delalloc and forces them to disk.
8213 static int __start_delalloc_inodes(struct btrfs_root
*root
, int delay_iput
)
8215 struct btrfs_inode
*binode
;
8216 struct inode
*inode
;
8217 struct btrfs_delalloc_work
*work
, *next
;
8218 struct list_head works
;
8219 struct list_head splice
;
8222 INIT_LIST_HEAD(&works
);
8223 INIT_LIST_HEAD(&splice
);
8225 spin_lock(&root
->delalloc_lock
);
8226 list_splice_init(&root
->delalloc_inodes
, &splice
);
8227 while (!list_empty(&splice
)) {
8228 binode
= list_entry(splice
.next
, struct btrfs_inode
,
8231 list_move_tail(&binode
->delalloc_inodes
,
8232 &root
->delalloc_inodes
);
/* igrab may fail if the inode is being evicted -- then skip it */
8233 inode
= igrab(&binode
->vfs_inode
);
8235 cond_resched_lock(&root
->delalloc_lock
);
8238 spin_unlock(&root
->delalloc_lock
);
8240 work
= btrfs_alloc_delalloc_work(inode
, 0, delay_iput
);
8241 if (unlikely(!work
)) {
8245 list_add_tail(&work
->list
, &works
);
8246 btrfs_queue_worker(&root
->fs_info
->flush_workers
,
8250 spin_lock(&root
->delalloc_lock
);
8252 spin_unlock(&root
->delalloc_lock
);
/* wait for all queued flush work before returning */
8254 list_for_each_entry_safe(work
, next
, &works
, list
) {
8255 list_del_init(&work
->list
);
8256 btrfs_wait_and_free_delalloc_work(work
);
8260 list_for_each_entry_safe(work
, next
, &works
, list
) {
8261 list_del_init(&work
->list
);
8262 btrfs_wait_and_free_delalloc_work(work
);
/* error path: put unprocessed inodes back on the root's list */
8265 if (!list_empty_careful(&splice
)) {
8266 spin_lock(&root
->delalloc_lock
);
8267 list_splice_tail(&splice
, &root
->delalloc_inodes
);
8268 spin_unlock(&root
->delalloc_lock
);
/*
 * btrfs_start_delalloc_inodes - flush delalloc for a single root.
 *
 * No-op on a read-only mount.  After __start_delalloc_inodes queues
 * the flushes, bumps async_submit_draining and waits until both
 * nr_async_submits and async_delalloc_pages reach zero so that all IO
 * is actually submitted and ordered extents exist before returning.
 */
8273 int btrfs_start_delalloc_inodes(struct btrfs_root
*root
, int delay_iput
)
8277 if (root
->fs_info
->sb
->s_flags
& MS_RDONLY
)
8280 ret
= __start_delalloc_inodes(root
, delay_iput
);
8282 * the filemap_flush will queue IO into the worker threads, but
8283 * we have to make sure the IO is actually started and that
8284 * ordered extents get created before we return
8286 atomic_inc(&root
->fs_info
->async_submit_draining
);
8287 while (atomic_read(&root
->fs_info
->nr_async_submits
) ||
8288 atomic_read(&root
->fs_info
->async_delalloc_pages
)) {
8289 wait_event(root
->fs_info
->async_submit_wait
,
8290 (atomic_read(&root
->fs_info
->nr_async_submits
) == 0 &&
8291 atomic_read(&root
->fs_info
->async_delalloc_pages
) == 0));
8293 atomic_dec(&root
->fs_info
->async_submit_draining
);
/*
 * btrfs_start_all_delalloc_inodes - flush delalloc for every root in
 * the filesystem.
 *
 * Same structure as the per-root variant, one level up: splices
 * fs_info->delalloc_roots under delalloc_root_lock, walks it taking a
 * reference on each root (btrfs_grab_fs_root) before dropping the
 * lock, runs __start_delalloc_inodes on it, then drains the async
 * submit counters.  Leftover spliced roots are restored on the error
 * path.
 *
 * NOTE(review): lossy extraction -- embedded numbers are upstream line
 * numbers; several lines (second parameter, error handling) are
 * missing.  Code text left byte-identical.
 */
8297 int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info
*fs_info
,
8300 struct btrfs_root
*root
;
8301 struct list_head splice
;
8304 if (fs_info
->sb
->s_flags
& MS_RDONLY
)
8307 INIT_LIST_HEAD(&splice
);
8309 spin_lock(&fs_info
->delalloc_root_lock
);
8310 list_splice_init(&fs_info
->delalloc_roots
, &splice
);
8311 while (!list_empty(&splice
)) {
8312 root
= list_first_entry(&splice
, struct btrfs_root
,
/* take a reference so the root can't go away once we drop the lock */
8314 root
= btrfs_grab_fs_root(root
);
8316 list_move_tail(&root
->delalloc_root
,
8317 &fs_info
->delalloc_roots
);
8318 spin_unlock(&fs_info
->delalloc_root_lock
);
8320 ret
= __start_delalloc_inodes(root
, delay_iput
);
8321 btrfs_put_fs_root(root
);
8325 spin_lock(&fs_info
->delalloc_root_lock
);
8327 spin_unlock(&fs_info
->delalloc_root_lock
);
8329 atomic_inc(&fs_info
->async_submit_draining
);
8330 while (atomic_read(&fs_info
->nr_async_submits
) ||
8331 atomic_read(&fs_info
->async_delalloc_pages
)) {
8332 wait_event(fs_info
->async_submit_wait
,
8333 (atomic_read(&fs_info
->nr_async_submits
) == 0 &&
8334 atomic_read(&fs_info
->async_delalloc_pages
) == 0));
8336 atomic_dec(&fs_info
->async_submit_draining
);
/* error path: put unprocessed roots back on the global list */
8339 if (!list_empty_careful(&splice
)) {
8340 spin_lock(&fs_info
->delalloc_root_lock
);
8341 list_splice_tail(&splice
, &fs_info
->delalloc_roots
);
8342 spin_unlock(&fs_info
->delalloc_root_lock
);
/*
 * btrfs_symlink - inode_operations->symlink hook.
 *
 * Creates a symlink whose target string (including its NUL) is stored
 * as an inline file extent item.  Rejects targets longer than the
 * max inline data size with -ENAMETOOLONG.  Inside a 5-item
 * transaction: allocates an inode number, creates the inode, applies
 * security xattrs, links it into the directory, inserts the inline
 * extent item with the target copied via write_extent_buffer(), then
 * switches the inode to the symlink ops and sets i_size to the target
 * length without the trailing NUL (name_len - 1).
 *
 * NOTE(review): lossy extraction -- embedded numbers are upstream line
 * numbers; error gotos, several declarations and braces are missing.
 * Code text left byte-identical.
 */
8347 static int btrfs_symlink(struct inode
*dir
, struct dentry
*dentry
,
8348 const char *symname
)
8350 struct btrfs_trans_handle
*trans
;
8351 struct btrfs_root
*root
= BTRFS_I(dir
)->root
;
8352 struct btrfs_path
*path
;
8353 struct btrfs_key key
;
8354 struct inode
*inode
= NULL
;
8362 struct btrfs_file_extent_item
*ei
;
8363 struct extent_buffer
*leaf
;
/* +1: the NUL terminator is stored inline with the target */
8365 name_len
= strlen(symname
) + 1;
8366 if (name_len
> BTRFS_MAX_INLINE_DATA_SIZE(root
))
8367 return -ENAMETOOLONG
;
8370 * 2 items for inode item and ref
8371 * 2 items for dir items
8372 * 1 item for xattr if selinux is on
8374 trans
= btrfs_start_transaction(root
, 5);
8376 return PTR_ERR(trans
);
8378 err
= btrfs_find_free_ino(root
, &objectid
);
8382 inode
= btrfs_new_inode(trans
, root
, dir
, dentry
->d_name
.name
,
8383 dentry
->d_name
.len
, btrfs_ino(dir
), objectid
,
8384 S_IFLNK
|S_IRWXUGO
, &index
);
8385 if (IS_ERR(inode
)) {
8386 err
= PTR_ERR(inode
);
8390 err
= btrfs_init_inode_security(trans
, inode
, dir
, &dentry
->d_name
);
8397 * If the active LSM wants to access the inode during
8398 * d_instantiate it needs these. Smack checks to see
8399 * if the filesystem supports xattrs by looking at the
8402 inode
->i_fop
= &btrfs_file_operations
;
8403 inode
->i_op
= &btrfs_file_inode_operations
;
8405 err
= btrfs_add_nondir(trans
, dir
, dentry
, inode
, 0, index
);
8409 inode
->i_mapping
->a_ops
= &btrfs_aops
;
8410 inode
->i_mapping
->backing_dev_info
= &root
->fs_info
->bdi
;
8411 BTRFS_I(inode
)->io_tree
.ops
= &btrfs_extent_io_ops
;
8416 path
= btrfs_alloc_path();
/* insert the inline extent item holding the link target */
8422 key
.objectid
= btrfs_ino(inode
);
8424 btrfs_set_key_type(&key
, BTRFS_EXTENT_DATA_KEY
);
8425 datasize
= btrfs_file_extent_calc_inline_size(name_len
);
8426 err
= btrfs_insert_empty_item(trans
, root
, path
, &key
,
8430 btrfs_free_path(path
);
8433 leaf
= path
->nodes
[0];
8434 ei
= btrfs_item_ptr(leaf
, path
->slots
[0],
8435 struct btrfs_file_extent_item
);
8436 btrfs_set_file_extent_generation(leaf
, ei
, trans
->transid
);
8437 btrfs_set_file_extent_type(leaf
, ei
,
8438 BTRFS_FILE_EXTENT_INLINE
);
8439 btrfs_set_file_extent_encryption(leaf
, ei
, 0);
8440 btrfs_set_file_extent_compression(leaf
, ei
, 0);
8441 btrfs_set_file_extent_other_encoding(leaf
, ei
, 0);
8442 btrfs_set_file_extent_ram_bytes(leaf
, ei
, name_len
);
8444 ptr
= btrfs_file_extent_inline_start(ei
);
8445 write_extent_buffer(leaf
, symname
, ptr
, name_len
);
8446 btrfs_mark_buffer_dirty(leaf
);
8447 btrfs_free_path(path
);
/* now make it look like a symlink to the VFS */
8449 inode
->i_op
= &btrfs_symlink_inode_operations
;
8450 inode
->i_mapping
->a_ops
= &btrfs_symlink_aops
;
8451 inode
->i_mapping
->backing_dev_info
= &root
->fs_info
->bdi
;
8452 inode_set_bytes(inode
, name_len
);
/* i_size excludes the stored NUL terminator */
8453 btrfs_i_size_write(inode
, name_len
- 1);
8454 err
= btrfs_update_inode(trans
, root
, inode
);
8460 d_instantiate(dentry
, inode
);
8461 btrfs_end_transaction(trans
, root
);
8463 inode_dec_link_count(inode
);
8466 btrfs_btree_balance_dirty(root
);
/*
 * __btrfs_prealloc_file_range - allocate PREALLOC extents for
 * [start, start + num_bytes) (backend for fallocate).
 *
 * Loops until num_bytes is consumed.  Each iteration runs in its own
 * 3-item transaction when @trans is NULL (own_trans), reserves between
 * min_size and 256MB of space, inserts a BTRFS_FILE_EXTENT_PREALLOC
 * file extent, and caches a matching extent_map marked
 * EXTENT_FLAG_PREALLOC (falling back to forcing a full fsync sync if
 * the map can't be allocated).  Unless FALLOC_FL_KEEP_SIZE is set,
 * i_size is advanced up to min(cur_offset, actual_len) and the inode
 * is updated each round.
 *
 * NOTE(review): lossy extraction -- embedded numbers are upstream line
 * numbers; own_trans checks, some error branches and braces are
 * missing.  Code text left byte-identical.
 */
8470 static int __btrfs_prealloc_file_range(struct inode
*inode
, int mode
,
8471 u64 start
, u64 num_bytes
, u64 min_size
,
8472 loff_t actual_len
, u64
*alloc_hint
,
8473 struct btrfs_trans_handle
*trans
)
8475 struct extent_map_tree
*em_tree
= &BTRFS_I(inode
)->extent_tree
;
8476 struct extent_map
*em
;
8477 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
8478 struct btrfs_key ins
;
8479 u64 cur_offset
= start
;
8483 bool own_trans
= true;
8487 while (num_bytes
> 0) {
8489 trans
= btrfs_start_transaction(root
, 3);
8490 if (IS_ERR(trans
)) {
8491 ret
= PTR_ERR(trans
);
/* clamp each allocation between min_size and 256MB */
8496 cur_bytes
= min(num_bytes
, 256ULL * 1024 * 1024);
8497 cur_bytes
= max(cur_bytes
, min_size
);
8498 ret
= btrfs_reserve_extent(trans
, root
, cur_bytes
,
8499 min_size
, 0, *alloc_hint
, &ins
, 1);
8502 btrfs_end_transaction(trans
, root
);
8506 ret
= insert_reserved_file_extent(trans
, inode
,
8507 cur_offset
, ins
.objectid
,
8508 ins
.offset
, ins
.offset
,
8509 ins
.offset
, 0, 0, 0,
8510 BTRFS_FILE_EXTENT_PREALLOC
);
8512 btrfs_abort_transaction(trans
, root
, ret
);
8514 btrfs_end_transaction(trans
, root
);
8517 btrfs_drop_extent_cache(inode
, cur_offset
,
8518 cur_offset
+ ins
.offset
-1, 0);
/* cache the new prealloc extent; on alloc failure force full sync */
8520 em
= alloc_extent_map();
8522 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC
,
8523 &BTRFS_I(inode
)->runtime_flags
);
8527 em
->start
= cur_offset
;
8528 em
->orig_start
= cur_offset
;
8529 em
->len
= ins
.offset
;
8530 em
->block_start
= ins
.objectid
;
8531 em
->block_len
= ins
.offset
;
8532 em
->orig_block_len
= ins
.offset
;
8533 em
->ram_bytes
= ins
.offset
;
8534 em
->bdev
= root
->fs_info
->fs_devices
->latest_bdev
;
8535 set_bit(EXTENT_FLAG_PREALLOC
, &em
->flags
);
8536 em
->generation
= trans
->transid
;
8539 write_lock(&em_tree
->lock
);
8540 ret
= add_extent_mapping(em_tree
, em
, 1);
8541 write_unlock(&em_tree
->lock
);
8544 btrfs_drop_extent_cache(inode
, cur_offset
,
8545 cur_offset
+ ins
.offset
- 1,
8548 free_extent_map(em
);
8550 num_bytes
-= ins
.offset
;
8551 cur_offset
+= ins
.offset
;
8552 *alloc_hint
= ins
.objectid
+ ins
.offset
;
8554 inode_inc_iversion(inode
);
8555 inode
->i_ctime
= CURRENT_TIME
;
8556 BTRFS_I(inode
)->flags
|= BTRFS_INODE_PREALLOC
;
8557 if (!(mode
& FALLOC_FL_KEEP_SIZE
) &&
8558 (actual_len
> inode
->i_size
) &&
8559 (cur_offset
> inode
->i_size
)) {
8560 if (cur_offset
> actual_len
)
8561 i_size
= actual_len
;
8563 i_size
= cur_offset
;
8564 i_size_write(inode
, i_size
);
8565 btrfs_ordered_update_i_size(inode
, i_size
, NULL
);
8568 ret
= btrfs_update_inode(trans
, root
, inode
);
8571 btrfs_abort_transaction(trans
, root
, ret
);
8573 btrfs_end_transaction(trans
, root
);
8578 btrfs_end_transaction(trans
, root
);
/*
 * btrfs_prealloc_file_range - public wrapper around
 * __btrfs_prealloc_file_range; passing a NULL transaction makes the
 * helper start/end its own per-iteration transactions (the NULL
 * argument line is elided by the extraction).
 */
8583 int btrfs_prealloc_file_range(struct inode
*inode
, int mode
,
8584 u64 start
, u64 num_bytes
, u64 min_size
,
8585 loff_t actual_len
, u64
*alloc_hint
)
8587 return __btrfs_prealloc_file_range(inode
, mode
, start
, num_bytes
,
8588 min_size
, actual_len
, alloc_hint
,
/*
 * btrfs_prealloc_file_range_trans - like btrfs_prealloc_file_range but
 * runs inside the caller's already-started transaction @trans instead
 * of starting one per iteration.
 */
8592 int btrfs_prealloc_file_range_trans(struct inode
*inode
,
8593 struct btrfs_trans_handle
*trans
, int mode
,
8594 u64 start
, u64 num_bytes
, u64 min_size
,
8595 loff_t actual_len
, u64
*alloc_hint
)
8597 return __btrfs_prealloc_file_range(inode
, mode
, start
, num_bytes
,
8598 min_size
, actual_len
, alloc_hint
, trans
);
/*
 * btrfs_set_page_dirty - address_space_operations->set_page_dirty
 * hook; btrfs doesn't use buffer heads, so just mark the whole page
 * dirty via the generic no-buffers helper.
 */
8601 static int btrfs_set_page_dirty(struct page
*page
)
8603 return __set_page_dirty_nobuffers(page
);
/*
 * btrfs_permission - inode_operations->permission hook.
 *
 * For write access to regular files, directories and symlinks, denies
 * the request when the containing subvolume is read-only or the inode
 * carries the BTRFS_INODE_READONLY flag (the error-return lines are
 * elided by the extraction).  Everything else goes through
 * generic_permission().
 */
8606 static int btrfs_permission(struct inode
*inode
, int mask
)
8608 struct btrfs_root
*root
= BTRFS_I(inode
)->root
;
8609 umode_t mode
= inode
->i_mode
;
8611 if (mask
& MAY_WRITE
&&
8612 (S_ISREG(mode
) || S_ISDIR(mode
) || S_ISLNK(mode
))) {
8613 if (btrfs_root_readonly(root
))
8615 if (BTRFS_I(inode
)->flags
& BTRFS_INODE_READONLY
)
8618 return generic_permission(inode
, mask
);
/* inode operations for writable btrfs directories */
8621 static const struct inode_operations btrfs_dir_inode_operations
= {
8622 .getattr
= btrfs_getattr
,
8623 .lookup
= btrfs_lookup
,
8624 .create
= btrfs_create
,
8625 .unlink
= btrfs_unlink
,
8627 .mkdir
= btrfs_mkdir
,
8628 .rmdir
= btrfs_rmdir
,
8629 .rename
= btrfs_rename
,
8630 .symlink
= btrfs_symlink
,
8631 .setattr
= btrfs_setattr
,
8632 .mknod
= btrfs_mknod
,
8633 .setxattr
= btrfs_setxattr
,
8634 .getxattr
= btrfs_getxattr
,
8635 .listxattr
= btrfs_listxattr
,
8636 .removexattr
= btrfs_removexattr
,
8637 .permission
= btrfs_permission
,
8638 .get_acl
= btrfs_get_acl
,
/* inode operations for read-only directories (lookup/permission/ACL only) */
8640 static const struct inode_operations btrfs_dir_ro_inode_operations
= {
8641 .lookup
= btrfs_lookup
,
8642 .permission
= btrfs_permission
,
8643 .get_acl
= btrfs_get_acl
,
/* file operations for open btrfs directories (readdir, ioctl, fsync) */
8646 static const struct file_operations btrfs_dir_file_operations
= {
8647 .llseek
= generic_file_llseek
,
8648 .read
= generic_read_dir
,
8649 .iterate
= btrfs_real_readdir
,
8650 .unlocked_ioctl
= btrfs_ioctl
,
8651 #ifdef CONFIG_COMPAT
8652 .compat_ioctl
= btrfs_ioctl
,
8654 .release
= btrfs_release_file
,
8655 .fsync
= btrfs_sync_file
,
/* hooks the extent_io machinery calls back into inode.c for data IO */
8658 static struct extent_io_ops btrfs_extent_io_ops
= {
8659 .fill_delalloc
= run_delalloc_range
,
8660 .submit_bio_hook
= btrfs_submit_bio_hook
,
8661 .merge_bio_hook
= btrfs_merge_bio_hook
,
8662 .readpage_end_io_hook
= btrfs_readpage_end_io_hook
,
8663 .writepage_end_io_hook
= btrfs_writepage_end_io_hook
,
8664 .writepage_start_hook
= btrfs_writepage_start_hook
,
8665 .set_bit_hook
= btrfs_set_bit_hook
,
8666 .clear_bit_hook
= btrfs_clear_bit_hook
,
8667 .merge_extent_hook
= btrfs_merge_extent_hook
,
8668 .split_extent_hook
= btrfs_split_extent_hook
,
8672 * btrfs doesn't support the bmap operation because swapfiles
8673 * use bmap to make a mapping of extents in the file. They assume
8674 * these extents won't change over the life of the file and they
8675 * use the bmap result to do IO directly to the drive.
8677 * the btrfs bmap call would return logical addresses that aren't
8678 * suitable for IO and they also will change frequently as COW
8679 * operations happen. So, swapfile + btrfs == corruption.
8681 * For now we're avoiding this by dropping bmap.
/* address space operations for regular-file data pages */
8683 static const struct address_space_operations btrfs_aops
= {
8684 .readpage
= btrfs_readpage
,
8685 .writepage
= btrfs_writepage
,
8686 .writepages
= btrfs_writepages
,
8687 .readpages
= btrfs_readpages
,
8688 .direct_IO
= btrfs_direct_IO
,
8689 .invalidatepage
= btrfs_invalidatepage
,
8690 .releasepage
= btrfs_releasepage
,
8691 .set_page_dirty
= btrfs_set_page_dirty
,
8692 .error_remove_page
= generic_error_remove_page
,
/* reduced address space ops for symlink pages (no writepages/direct IO) */
8695 static const struct address_space_operations btrfs_symlink_aops
= {
8696 .readpage
= btrfs_readpage
,
8697 .writepage
= btrfs_writepage
,
8698 .invalidatepage
= btrfs_invalidatepage
,
8699 .releasepage
= btrfs_releasepage
,
/* inode operations for regular files */
8702 static const struct inode_operations btrfs_file_inode_operations
= {
8703 .getattr
= btrfs_getattr
,
8704 .setattr
= btrfs_setattr
,
8705 .setxattr
= btrfs_setxattr
,
8706 .getxattr
= btrfs_getxattr
,
8707 .listxattr
= btrfs_listxattr
,
8708 .removexattr
= btrfs_removexattr
,
8709 .permission
= btrfs_permission
,
8710 .fiemap
= btrfs_fiemap
,
8711 .get_acl
= btrfs_get_acl
,
8712 .update_time
= btrfs_update_time
,
/* inode operations for device nodes, fifos and sockets */
8714 static const struct inode_operations btrfs_special_inode_operations
= {
8715 .getattr
= btrfs_getattr
,
8716 .setattr
= btrfs_setattr
,
8717 .permission
= btrfs_permission
,
8718 .setxattr
= btrfs_setxattr
,
8719 .getxattr
= btrfs_getxattr
,
8720 .listxattr
= btrfs_listxattr
,
8721 .removexattr
= btrfs_removexattr
,
8722 .get_acl
= btrfs_get_acl
,
8723 .update_time
= btrfs_update_time
,
/* inode operations for symlinks (page-cache-backed readlink/follow) */
8725 static const struct inode_operations btrfs_symlink_inode_operations
= {
8726 .readlink
= generic_readlink
,
8727 .follow_link
= page_follow_link_light
,
8728 .put_link
= page_put_link
,
8729 .getattr
= btrfs_getattr
,
8730 .setattr
= btrfs_setattr
,
8731 .permission
= btrfs_permission
,
8732 .setxattr
= btrfs_setxattr
,
8733 .getxattr
= btrfs_getxattr
,
8734 .listxattr
= btrfs_listxattr
,
8735 .removexattr
= btrfs_removexattr
,
8736 .get_acl
= btrfs_get_acl
,
8737 .update_time
= btrfs_update_time
,
8740 const struct dentry_operations btrfs_dentry_operations
= {
8741 .d_delete
= btrfs_dentry_delete
,
8742 .d_release
= btrfs_dentry_release
,