Btrfs: fix wrong mapping flags for free space inode
[deliverable/linux.git] / fs / btrfs / inode.c
index da828cf5e8f885b542012dc3d23854916dc35106..0020b5675fa9700d86ef2f497a35c43a8d6e41b6 100644 (file)
@@ -43,6 +43,7 @@
 #include <linux/btrfs.h>
 #include <linux/blkdev.h>
 #include <linux/posix_acl_xattr.h>
+#include <linux/uio.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -59,6 +60,7 @@
 #include "backref.h"
 #include "hash.h"
 #include "props.h"
+#include "qgroup.h"
 
 struct btrfs_iget_args {
        struct btrfs_key *location;
@@ -108,6 +110,13 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
 
 static int btrfs_dirty_inode(struct inode *inode);
 
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+void btrfs_test_inode_set_ops(struct inode *inode)
+{
+       BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+}
+#endif
+
 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
                                     struct inode *inode,  struct inode *dir,
                                     const struct qstr *qstr)
@@ -463,7 +472,7 @@ again:
         */
        if (inode_need_compress(inode)) {
                WARN_ON(pages);
-               pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
+               pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
                if (!pages) {
                        /* just bail out to the uncompressed code */
                        goto cont;
@@ -745,7 +754,6 @@ retry:
                        }
                        goto out_free;
                }
-
                /*
                 * here we're doing allocation and writeback of the
                 * compressed pages
@@ -1542,30 +1550,17 @@ static void btrfs_split_extent_hook(struct inode *inode,
                u64 new_size;
 
                /*
-                * We need the largest size of the remaining extent to see if we
-                * need to add a new outstanding extent.  Think of the following
-                * case
-                *
-                * [MEAX_EXTENT_SIZEx2 - 4k][4k]
-                *
-                * The new_size would just be 4k and we'd think we had enough
-                * outstanding extents for this if we only took one side of the
-                * split, same goes for the other direction.  We need to see if
-                * the larger size still is the same amount of extents as the
-                * original size, because if it is we need to add a new
-                * outstanding extent.  But if we split up and the larger size
-                * is less than the original then we are good to go since we've
-                * already accounted for the extra extent in our original
-                * accounting.
+                * See the explanation in btrfs_merge_extent_hook, the same
+                * applies here, just in reverse.
                 */
                new_size = orig->end - split + 1;
-               if ((split - orig->start) > new_size)
-                       new_size = split - orig->start;
-
-               num_extents = div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1,
+               num_extents = div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
+                                       BTRFS_MAX_EXTENT_SIZE);
+               new_size = split - orig->start;
+               num_extents += div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
                                        BTRFS_MAX_EXTENT_SIZE);
-               if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
-                             BTRFS_MAX_EXTENT_SIZE) < num_extents)
+               if (div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1,
+                             BTRFS_MAX_EXTENT_SIZE) >= num_extents)
                        return;
        }
 
@@ -1591,8 +1586,10 @@ static void btrfs_merge_extent_hook(struct inode *inode,
        if (!(other->state & EXTENT_DELALLOC))
                return;
 
-       old_size = other->end - other->start + 1;
-       new_size = old_size + (new->end - new->start + 1);
+       if (new->start > other->start)
+               new_size = new->end - other->start + 1;
+       else
+               new_size = other->end - new->start + 1;
 
        /* we're not bigger than the max, unreserve the space and go */
        if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
@@ -1603,13 +1600,32 @@ static void btrfs_merge_extent_hook(struct inode *inode,
        }
 
        /*
-        * If we grew by another max_extent, just return, we want to keep that
-        * reserved amount.
+        * We have to add up either side to figure out how many extents were
+        * accounted for before we merged into one big extent.  If the number of
+        * extents we accounted for is <= the amount we need for the new range
+        * then we can return, otherwise drop.  Think of it like this
+        *
+        * [ 4k][MAX_SIZE]
+        *
+        * So we've grown the extent by a MAX_SIZE extent, this would mean we
+        * need 2 outstanding extents, on one side we have 1 and the other side
+        * we have 1 so they are == and we can return.  But in this case
+        *
+        * [MAX_SIZE+4k][MAX_SIZE+4k]
+        *
+        * Each range on their own accounts for 2 extents, but merged together
+        * they are only 3 extents worth of accounting, so we need to drop in
+        * this case.
         */
+       old_size = other->end - other->start + 1;
        num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
                                BTRFS_MAX_EXTENT_SIZE);
+       old_size = new->end - new->start + 1;
+       num_extents += div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
+                                BTRFS_MAX_EXTENT_SIZE);
+
        if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
-                     BTRFS_MAX_EXTENT_SIZE) > num_extents)
+                     BTRFS_MAX_EXTENT_SIZE) >= num_extents)
                return;
 
        spin_lock(&BTRFS_I(inode)->lock);
@@ -1686,6 +1702,10 @@ static void btrfs_set_bit_hook(struct inode *inode,
                        spin_unlock(&BTRFS_I(inode)->lock);
                }
 
+               /* For sanity tests */
+               if (btrfs_test_is_dummy_root(root))
+                       return;
+
                __percpu_counter_add(&root->fs_info->delalloc_bytes, len,
                                     root->fs_info->delalloc_batch);
                spin_lock(&BTRFS_I(inode)->lock);
@@ -1741,6 +1761,10 @@ static void btrfs_clear_bit_hook(struct inode *inode,
                    root != root->fs_info->tree_root)
                        btrfs_delalloc_release_metadata(inode, len);
 
+               /* For sanity tests. */
+               if (btrfs_test_is_dummy_root(root))
+                       return;
+
                if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
                    && do_list && !(state->state & EXTENT_NORESERVE))
                        btrfs_free_reserved_data_space(inode, len);
@@ -3087,6 +3111,8 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
        if (empty)
                return;
 
+       down_read(&fs_info->delayed_iput_sem);
+
        spin_lock(&fs_info->delayed_iput_lock);
        list_splice_init(&fs_info->delayed_iputs, &list);
        spin_unlock(&fs_info->delayed_iput_lock);
@@ -3097,6 +3123,8 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
                iput(delayed->inode);
                kfree(delayed);
        }
+
+       up_read(&root->fs_info->delayed_iput_sem);
 }
 
 /*
@@ -3605,25 +3633,28 @@ static void btrfs_read_locked_inode(struct inode *inode)
        BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
        BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
 
+       inode->i_version = btrfs_inode_sequence(leaf, inode_item);
+       inode->i_generation = BTRFS_I(inode)->generation;
+       inode->i_rdev = 0;
+       rdev = btrfs_inode_rdev(leaf, inode_item);
+
+       BTRFS_I(inode)->index_cnt = (u64)-1;
+       BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
+
+cache_index:
        /*
         * If we were modified in the current generation and evicted from memory
         * and then re-read we need to do a full sync since we don't have any
         * idea about which extents were modified before we were evicted from
         * cache.
+        *
+        * This is required for both inode re-read from disk and delayed inode
+        * in delayed_nodes_tree.
         */
        if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
                set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
                        &BTRFS_I(inode)->runtime_flags);
 
-       inode->i_version = btrfs_inode_sequence(leaf, inode_item);
-       inode->i_generation = BTRFS_I(inode)->generation;
-       inode->i_rdev = 0;
-       rdev = btrfs_inode_rdev(leaf, inode_item);
-
-       BTRFS_I(inode)->index_cnt = (u64)-1;
-       BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
-
-cache_index:
        path->slots[0]++;
        if (inode->i_nlink != 1 ||
            path->slots[0] >= btrfs_header_nritems(leaf))
@@ -4139,6 +4170,21 @@ out:
        return err;
 }
 
+static int truncate_space_check(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root,
+                               u64 bytes_deleted)
+{
+       int ret;
+
+       bytes_deleted = btrfs_csum_bytes_to_leaves(root, bytes_deleted);
+       ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv,
+                                 bytes_deleted, BTRFS_RESERVE_NO_FLUSH);
+       if (!ret)
+               trans->bytes_reserved += bytes_deleted;
+       return ret;
+
+}
+
 /*
  * this can truncate away extent items, csum items and directory items.
  * It starts at a high offset and removes keys until it can't find
@@ -4174,9 +4220,21 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
        int ret;
        int err = 0;
        u64 ino = btrfs_ino(inode);
+       u64 bytes_deleted = 0;
+       bool be_nice = 0;
+       bool should_throttle = 0;
+       bool should_end = 0;
 
        BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
 
+       /*
+        * for non-free space inodes and ref cows, we want to back off from
+        * time to time
+        */
+       if (!btrfs_is_free_space_inode(inode) &&
+           test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+               be_nice = 1;
+
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
@@ -4206,6 +4264,19 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
        key.type = (u8)-1;
 
 search_again:
+       /*
+        * with a 16K leaf size and 128MB extents, you can actually queue
+        * up a huge file in a single leaf.  Most of the time that
+        * bytes_deleted is > 0, it will be huge by the time we get here
+        */
+       if (be_nice && bytes_deleted > 32 * 1024 * 1024) {
+               if (btrfs_should_end_transaction(trans, root)) {
+                       err = -EAGAIN;
+                       goto error;
+               }
+       }
+
+
        path->leave_spinning = 1;
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret < 0) {
@@ -4348,22 +4419,39 @@ delete:
                } else {
                        break;
                }
+               should_throttle = 0;
+
                if (found_extent &&
                    (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
                     root == root->fs_info->tree_root)) {
                        btrfs_set_path_blocking(path);
+                       bytes_deleted += extent_num_bytes;
                        ret = btrfs_free_extent(trans, root, extent_start,
                                                extent_num_bytes, 0,
                                                btrfs_header_owner(leaf),
                                                ino, extent_offset, 0);
                        BUG_ON(ret);
+                       if (btrfs_should_throttle_delayed_refs(trans, root))
+                               btrfs_async_run_delayed_refs(root,
+                                       trans->delayed_ref_updates * 2, 0);
+                       if (be_nice) {
+                               if (truncate_space_check(trans, root,
+                                                        extent_num_bytes)) {
+                                       should_end = 1;
+                               }
+                               if (btrfs_should_throttle_delayed_refs(trans,
+                                                                      root)) {
+                                       should_throttle = 1;
+                               }
+                       }
                }
 
                if (found_type == BTRFS_INODE_ITEM_KEY)
                        break;
 
                if (path->slots[0] == 0 ||
-                   path->slots[0] != pending_del_slot) {
+                   path->slots[0] != pending_del_slot ||
+                   should_throttle || should_end) {
                        if (pending_del_nr) {
                                ret = btrfs_del_items(trans, root, path,
                                                pending_del_slot,
@@ -4376,6 +4464,23 @@ delete:
                                pending_del_nr = 0;
                        }
                        btrfs_release_path(path);
+                       if (should_throttle) {
+                               unsigned long updates = trans->delayed_ref_updates;
+                               if (updates) {
+                                       trans->delayed_ref_updates = 0;
+                                       ret = btrfs_run_delayed_refs(trans, root, updates * 2);
+                                       if (ret && !err)
+                                               err = ret;
+                               }
+                       }
+                       /*
+                        * if we failed to refill our space rsv, bail out
+                        * and let the transaction restart
+                        */
+                       if (should_end) {
+                               err = -EAGAIN;
+                               goto error;
+                       }
                        goto search_again;
                } else {
                        path->slots[0]--;
@@ -4392,7 +4497,18 @@ error:
        if (last_size != (u64)-1 &&
            root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
                btrfs_ordered_update_i_size(inode, last_size, NULL);
+
        btrfs_free_path(path);
+
+       if (be_nice && bytes_deleted > 32 * 1024 * 1024) {
+               unsigned long updates = trans->delayed_ref_updates;
+               if (updates) {
+                       trans->delayed_ref_updates = 0;
+                       ret = btrfs_run_delayed_refs(trans, root, updates * 2);
+                       if (ret && !err)
+                               err = ret;
+               }
+       }
        return err;
 }
 
@@ -4901,6 +5017,7 @@ void btrfs_evict_inode(struct inode *inode)
        struct btrfs_trans_handle *trans;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_block_rsv *rsv, *global_rsv;
+       int steal_from_global = 0;
        u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
        int ret;
 
@@ -4968,9 +5085,20 @@ void btrfs_evict_inode(struct inode *inode)
                 * hard as possible to get this to work.
                 */
                if (ret)
-                       ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);
+                       steal_from_global++;
+               else
+                       steal_from_global = 0;
+               ret = 0;
 
-               if (ret) {
+               /*
+                * steal_from_global == 0: we reserved stuff, hooray!
+                * steal_from_global == 1: we didn't reserve stuff, boo!
+                * steal_from_global == 2: we've committed, still not a lot of
+                * room but maybe we'll have room in the global reserve this
+                * time.
+                * steal_from_global == 3: abandon all hope!
+                */
+               if (steal_from_global > 2) {
                        btrfs_warn(root->fs_info,
                                "Could not get space for a delete, will truncate on mount %d",
                                ret);
@@ -4986,10 +5114,40 @@ void btrfs_evict_inode(struct inode *inode)
                        goto no_delete;
                }
 
+               /*
+                * We can't just steal from the global reserve, we need to make
+                * sure there is room to do it, if not we need to commit and try
+                * again.
+                */
+               if (steal_from_global) {
+                       if (!btrfs_check_space_for_delayed_refs(trans, root))
+                               ret = btrfs_block_rsv_migrate(global_rsv, rsv,
+                                                             min_size);
+                       else
+                               ret = -ENOSPC;
+               }
+
+               /*
+                * Couldn't steal from the global reserve, we have too much
+                * pending stuff built up, commit the transaction and try it
+                * again.
+                */
+               if (ret) {
+                       ret = btrfs_commit_transaction(trans, root);
+                       if (ret) {
+                               btrfs_orphan_del(NULL, inode);
+                               btrfs_free_block_rsv(root, rsv);
+                               goto no_delete;
+                       }
+                       continue;
+               } else {
+                       steal_from_global = 0;
+               }
+
                trans->block_rsv = rsv;
 
                ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
-               if (ret != -ENOSPC)
+               if (ret != -ENOSPC && ret != -EAGAIN)
                        break;
 
                trans->block_rsv = &root->fs_info->trans_block_rsv;
@@ -7213,7 +7371,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
        u64 start = iblock << inode->i_blkbits;
        u64 lockstart, lockend;
        u64 len = bh_result->b_size;
-       u64 orig_len = len;
+       u64 *outstanding_extents = NULL;
        int unlock_bits = EXTENT_LOCKED;
        int ret = 0;
 
@@ -7225,6 +7383,16 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
        lockstart = start;
        lockend = start + len - 1;
 
+       if (current->journal_info) {
+               /*
+                * Need to pull our outstanding extents and set journal_info to NULL so
+                * that anything that needs to check if there's a transaction doesn't get
+                * confused.
+                */
+               outstanding_extents = current->journal_info;
+               current->journal_info = NULL;
+       }
+
        /*
         * If this errors out it's because we couldn't invalidate pagecache for
         * this range and we need to fallback to buffered.
@@ -7348,11 +7516,20 @@ unlock:
                if (start + len > i_size_read(inode))
                        i_size_write(inode, start + len);
 
-               if (len < orig_len) {
+               /*
+                * If we have an outstanding_extents count still set then we're
+                * within our reservation, otherwise we need to adjust our inode
+                * counter appropriately.
+                */
+               if (*outstanding_extents) {
+                       (*outstanding_extents)--;
+               } else {
                        spin_lock(&BTRFS_I(inode)->lock);
                        BTRFS_I(inode)->outstanding_extents++;
                        spin_unlock(&BTRFS_I(inode)->lock);
                }
+
+               current->journal_info = outstanding_extents;
                btrfs_free_reserved_data_space(inode, len);
        }
 
@@ -7376,6 +7553,8 @@ unlock:
 unlock_err:
        clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
                         unlock_bits, 1, 0, &cached_state, GFP_NOFS);
+       if (outstanding_extents)
+               current->journal_info = outstanding_extents;
        return ret;
 }
 
@@ -8075,6 +8254,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
+       u64 outstanding_extents = 0;
        size_t count = 0;
        int flags = 0;
        bool wakeup = true;
@@ -8112,6 +8292,16 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
                ret = btrfs_delalloc_reserve_space(inode, count);
                if (ret)
                        goto out;
+               outstanding_extents = div64_u64(count +
+                                               BTRFS_MAX_EXTENT_SIZE - 1,
+                                               BTRFS_MAX_EXTENT_SIZE);
+
+               /*
+                * We need to know how many extents we reserved so that we can
+                * do the accounting properly if we go over the number we
+                * originally calculated.  Abuse current->journal_info for this.
+                */
+               current->journal_info = &outstanding_extents;
        } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
                                     &BTRFS_I(inode)->runtime_flags)) {
                inode_dio_done(inode);
@@ -8124,6 +8314,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
                        iter, offset, btrfs_get_blocks_direct, NULL,
                        btrfs_submit_direct, flags);
        if (rw & WRITE) {
+               current->journal_info = NULL;
                if (ret < 0 && ret != -EIOCBQUEUED)
                        btrfs_delalloc_release_space(inode, count);
                else if (ret >= 0 && (size_t)ret < count)
@@ -8525,7 +8716,7 @@ static int btrfs_truncate(struct inode *inode)
                ret = btrfs_truncate_inode_items(trans, root, inode,
                                                 inode->i_size,
                                                 BTRFS_EXTENT_DATA_KEY);
-               if (ret != -ENOSPC) {
+               if (ret != -ENOSPC && ret != -EAGAIN) {
                        err = ret;
                        break;
                }
@@ -9395,6 +9586,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
                                btrfs_end_transaction(trans, root);
                        break;
                }
+
                btrfs_drop_extent_cache(inode, cur_offset,
                                        cur_offset + ins.offset -1, 0);
 
This page took 0.044717 seconds and 5 git commands to generate.