fs/btrfs/extent-tree.c

   1 /*
   2  * Copyright (C) 2007 Oracle.  All rights reserved.
   3  *
   4  * This program is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU General Public
   6  * License v2 as published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope that it will be useful,
   9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU General Public
  14  * License along with this program; if not, write to the
  15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16  * Boston, MA 021110-1307, USA.
  17  */
  18 #include <linux/sched.h>
  19 #include <linux/pagemap.h>
  20 #include <linux/writeback.h>
  21 #include <linux/blkdev.h>
  22 #include <linux/sort.h>
  23 #include <linux/rcupdate.h>
  24 #include <linux/kthread.h>
  25 #include <linux/slab.h>
  26 #include <linux/ratelimit.h>
  27 #include <linux/percpu_counter.h>
  28 #include "hash.h"
  29 #include "tree-log.h"
  30 #include "disk-io.h"
  31 #include "print-tree.h"
  32 #include "volumes.h"
  33 #include "raid56.h"
  34 #include "locking.h"
  35 #include "free-space-cache.h"
  36 #include "free-space-tree.h"
  37 #include "math.h"
  38 #include "sysfs.h"
  39 #include "qgroup.h"
  40
  41 #undef SCRAMBLE_DELAYED_REFS
  42
  43 /*
  44  * control flags for do_chunk_alloc's force field
  45  * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
  46  * if we really need one.
  47  *
  48  * CHUNK_ALLOC_LIMITED means to only try and allocate one
  49  * if we have very few chunks already allocated.  This is
  50  * used as part of the clustering code to help make sure
  51  * we have a good pool of storage to cluster in, without
  52  * filling the FS with empty chunks
  53  *
  54  * CHUNK_ALLOC_FORCE means it must try to allocate one
  55  *
  56  */
  57 enum {
  58         CHUNK_ALLOC_NO_FORCE = 0,
  59         CHUNK_ALLOC_LIMITED = 1,
  60         CHUNK_ALLOC_FORCE = 2,
  61 };
  62
/*
 * Forward declarations so the functions below can be referenced before
 * their definitions appear.
 * NOTE(review): the definitions are presumably further down in this file
 * (btrfs_pin_extent is the only non-static one) - confirm.
 */
static int update_block_group(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root, u64 bytenr,
                              u64 num_bytes, int alloc);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                struct btrfs_delayed_ref_node *node, u64 parent,
                                u64 root_objectid, u64 owner_objectid,
                                u64 owner_offset, int refs_to_drop,
                                struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
                                    struct extent_buffer *leaf,
                                    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
                                      struct btrfs_root *root,
                                      u64 parent, u64 root_objectid,
                                      u64 flags, u64 owner, u64 offset,
                                      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     u64 parent, u64 root_objectid,
                                     u64 flags, struct btrfs_disk_key *key,
                                     int level, struct btrfs_key *ins);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                          struct btrfs_root *extent_root, u64 flags,
                          int force);
static int find_next_key(struct btrfs_path *path, int level,
                         struct btrfs_key *key);
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
                            int dump_block_groups);
static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
                                    u64 ram_bytes, u64 num_bytes, int delalloc);
static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
                                     u64 num_bytes, int delalloc);
static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
                               u64 num_bytes);
int btrfs_pin_extent(struct btrfs_root *root,
                     u64 bytenr, u64 num_bytes, int reserved);
static int __reserve_metadata_bytes(struct btrfs_root *root,
                                    struct btrfs_space_info *space_info,
                                    u64 orig_bytes,
                                    enum btrfs_reserve_flush_enum flush);
static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
                                     struct btrfs_space_info *space_info,
                                     u64 num_bytes);
static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
                                     struct btrfs_space_info *space_info,
                                     u64 num_bytes);
 110
 111 static noinline int
 112 block_group_cache_done(struct btrfs_block_group_cache *cache)
 113 {
 114         smp_mb();
 115         return cache->cached == BTRFS_CACHE_FINISHED ||
 116                 cache->cached == BTRFS_CACHE_ERROR;
 117 }
 118
 119 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
 120 {
 121         return (cache->flags & bits) == bits;
 122 }
 123
 124 void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
 125 {
 126         atomic_inc(&cache->count);
 127 }
 128
 129 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 130 {
 131         if (atomic_dec_and_test(&cache->count)) {
 132                 WARN_ON(cache->pinned > 0);
 133                 WARN_ON(cache->reserved > 0);
 134                 kfree(cache->free_space_ctl);
 135                 kfree(cache);
 136         }
 137 }
 138
 139 /*
 140  * this adds the block group to the fs_info rb tree for the block group
 141  * cache
 142  */
 143 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
 144                                 struct btrfs_block_group_cache *block_group)
 145 {
 146         struct rb_node **p;
 147         struct rb_node *parent = NULL;
 148         struct btrfs_block_group_cache *cache;
 149
 150         spin_lock(&info->block_group_cache_lock);
 151         p = &info->block_group_cache_tree.rb_node;
 152
 153         while (*p) {
 154                 parent = *p;
 155                 cache = rb_entry(parent, struct btrfs_block_group_cache,
 156                                  cache_node);
 157                 if (block_group->key.objectid < cache->key.objectid) {
 158                         p = &(*p)->rb_left;
 159                 } else if (block_group->key.objectid > cache->key.objectid) {
 160                         p = &(*p)->rb_right;
 161                 } else {
 162                         spin_unlock(&info->block_group_cache_lock);
 163                         return -EEXIST;
 164                 }
 165         }
 166
 167         rb_link_node(&block_group->cache_node, parent, p);
 168         rb_insert_color(&block_group->cache_node,
 169                         &info->block_group_cache_tree);
 170
 171         if (info->first_logical_byte > block_group->key.objectid)
 172                 info->first_logical_byte = block_group->key.objectid;
 173
 174         spin_unlock(&info->block_group_cache_lock);
 175
 176         return 0;
 177 }
 178
 179 /*
 180  * This will return the block group at or after bytenr if contains is 0, else
 181  * it will return the block group that contains the bytenr
 182  */
 183 static struct btrfs_block_group_cache *
 184 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
 185                               int contains)
 186 {
 187         struct btrfs_block_group_cache *cache, *ret = NULL;
 188         struct rb_node *n;
 189         u64 end, start;
 190
 191         spin_lock(&info->block_group_cache_lock);
 192         n = info->block_group_cache_tree.rb_node;
 193
 194         while (n) {
 195                 cache = rb_entry(n, struct btrfs_block_group_cache,
 196                                  cache_node);
 197                 end = cache->key.objectid + cache->key.offset - 1;
 198                 start = cache->key.objectid;
 199
 200                 if (bytenr < start) {
 201                         if (!contains && (!ret || start < ret->key.objectid))
 202                                 ret = cache;
 203                         n = n->rb_left;
 204                 } else if (bytenr > start) {
 205                         if (contains && bytenr <= end) {
 206                                 ret = cache;
 207                                 break;
 208                         }
 209                         n = n->rb_right;
 210                 } else {
 211                         ret = cache;
 212                         break;
 213                 }
 214         }
 215         if (ret) {
 216                 btrfs_get_block_group(ret);
 217                 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
 218                         info->first_logical_byte = ret->key.objectid;
 219         }
 220         spin_unlock(&info->block_group_cache_lock);
 221
 222         return ret;
 223 }
 224
 225 static int add_excluded_extent(struct btrfs_root *root,
 226                                u64 start, u64 num_bytes)
 227 {
 228         u64 end = start + num_bytes - 1;
 229         set_extent_bits(&root->fs_info->freed_extents[0],
 230                         start, end, EXTENT_UPTODATE);
 231         set_extent_bits(&root->fs_info->freed_extents[1],
 232                         start, end, EXTENT_UPTODATE);
 233         return 0;
 234 }
 235
 236 static void free_excluded_extents(struct btrfs_root *root,
 237                                   struct btrfs_block_group_cache *cache)
 238 {
 239         u64 start, end;
 240
 241         start = cache->key.objectid;
 242         end = start + cache->key.offset - 1;
 243
 244         clear_extent_bits(&root->fs_info->freed_extents[0],
 245                           start, end, EXTENT_UPTODATE);
 246         clear_extent_bits(&root->fs_info->freed_extents[1],
 247                           start, end, EXTENT_UPTODATE);
 248 }
 249
 250 static int exclude_super_stripes(struct btrfs_root *root,
 251                                  struct btrfs_block_group_cache *cache)
 252 {
 253         u64 bytenr;
 254         u64 *logical;
 255         int stripe_len;
 256         int i, nr, ret;
 257
 258         if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
 259                 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
 260                 cache->bytes_super += stripe_len;
 261                 ret = add_excluded_extent(root, cache->key.objectid,
 262                                           stripe_len);
 263                 if (ret)
 264                         return ret;
 265         }
 266
 267         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
 268                 bytenr = btrfs_sb_offset(i);
 269                 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
 270                                        cache->key.objectid, bytenr,
 271                                        0, &logical, &nr, &stripe_len);
 272                 if (ret)
 273                         return ret;
 274
 275                 while (nr--) {
 276                         u64 start, len;
 277
 278                         if (logical[nr] > cache->key.objectid +
 279                             cache->key.offset)
 280                                 continue;
 281
 282                         if (logical[nr] + stripe_len <= cache->key.objectid)
 283                                 continue;
 284
 285                         start = logical[nr];
 286                         if (start < cache->key.objectid) {
 287                                 start = cache->key.objectid;
 288                                 len = (logical[nr] + stripe_len) - start;
 289                         } else {
 290                                 len = min_t(u64, stripe_len,
 291                                             cache->key.objectid +
 292                                             cache->key.offset - start);
 293                         }
 294
 295                         cache->bytes_super += len;
 296                         ret = add_excluded_extent(root, start, len);
 297                         if (ret) {
 298                                 kfree(logical);
 299                                 return ret;
 300                         }
 301                 }
 302
 303                 kfree(logical);
 304         }
 305         return 0;
 306 }
 307
 308 static struct btrfs_caching_control *
 309 get_caching_control(struct btrfs_block_group_cache *cache)
 310 {
 311         struct btrfs_caching_control *ctl;
 312
 313         spin_lock(&cache->lock);
 314         if (!cache->caching_ctl) {
 315                 spin_unlock(&cache->lock);
 316                 return NULL;
 317         }
 318
 319         ctl = cache->caching_ctl;
 320         atomic_inc(&ctl->count);
 321         spin_unlock(&cache->lock);
 322         return ctl;
 323 }
 324
 325 static void put_caching_control(struct btrfs_caching_control *ctl)
 326 {
 327         if (atomic_dec_and_test(&ctl->count))
 328                 kfree(ctl);
 329 }
 330
 331 #ifdef CONFIG_BTRFS_DEBUG
 332 static void fragment_free_space(struct btrfs_root *root,
 333                                 struct btrfs_block_group_cache *block_group)
 334 {
 335         u64 start = block_group->key.objectid;
 336         u64 len = block_group->key.offset;
 337         u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
 338                 root->nodesize : root->sectorsize;
 339         u64 step = chunk << 1;
 340
 341         while (len > chunk) {
 342                 btrfs_remove_free_space(block_group, start, chunk);
 343                 start += step;
 344                 if (len < step)
 345                         len = 0;
 346                 else
 347                         len -= step;
 348         }
 349 }
 350 #endif
 351
 352 /*
 353  * this is only called by cache_block_group, since we could have freed extents
 354  * we need to check the pinned_extents for any extents that can't be used yet
 355  * since their free space will be released as soon as the transaction commits.
 356  */
 357 u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
 358                        struct btrfs_fs_info *info, u64 start, u64 end)
 359 {
 360         u64 extent_start, extent_end, size, total_added = 0;
 361         int ret;
 362
 363         while (start < end) {
 364                 ret = find_first_extent_bit(info->pinned_extents, start,
 365                                             &extent_start, &extent_end,
 366                                             EXTENT_DIRTY | EXTENT_UPTODATE,
 367                                             NULL);
 368                 if (ret)
 369                         break;
 370
 371                 if (extent_start <= start) {
 372                         start = extent_end + 1;
 373                 } else if (extent_start > start && extent_start < end) {
 374                         size = extent_start - start;
 375                         total_added += size;
 376                         ret = btrfs_add_free_space(block_group, start,
 377                                                    size);
 378                         BUG_ON(ret); /* -ENOMEM or logic error */
 379                         start = extent_end + 1;
 380                 } else {
 381                         break;
 382                 }
 383         }
 384
 385         if (start < end) {
 386                 size = end - start;
 387                 total_added += size;
 388                 ret = btrfs_add_free_space(block_group, start, size);
 389                 BUG_ON(ret); /* -ENOMEM or logic error */
 390         }
 391
 392         return total_added;
 393 }
 394
/*
 * Build the free space of caching_ctl->block_group by walking the commit
 * root of the extent tree: every gap between consecutive EXTENT_ITEM /
 * METADATA_ITEM keys inside the block group is handed to
 * add_new_free_space().
 *
 * The function temporarily drops and re-takes caching_ctl->mutex and
 * fs_info->commit_root_sem (read side) when it needs to reschedule, so it
 * expects to be entered with both held, as caching_thread() does.
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, or the
 * negative error returned by btrfs_search_slot() / btrfs_next_leaf().
 */
static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{
        struct btrfs_block_group_cache *block_group;
        struct btrfs_fs_info *fs_info;
        struct btrfs_root *extent_root;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        u64 total_found = 0;
        u64 last = 0;
        u32 nritems;
        int ret;
        bool wakeup = true;

        block_group = caching_ctl->block_group;
        fs_info = block_group->fs_info;
        extent_root = fs_info->extent_root;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /* "last" is the first byte not yet accounted as used or free */
        last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);

#ifdef CONFIG_BTRFS_DEBUG
        /*
         * If we're fragmenting we don't want to make anybody think we can
         * allocate from this block group until we've had a chance to fragment
         * the free space.
         */
        if (btrfs_should_fragment_free_space(extent_root, block_group))
                wakeup = false;
#endif
        /*
         * We don't want to deadlock with somebody trying to allocate a new
         * extent for the extent root while also trying to search the extent
         * root to add free space.  So we skip locking and search the commit
         * root, since its read-only
         */
        path->skip_locking = 1;
        path->search_commit_root = 1;
        path->reada = READA_FORWARD;

        key.objectid = last;
        key.offset = 0;
        key.type = BTRFS_EXTENT_ITEM_KEY;

next:
        /* (re)start the walk at "key" */
        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
        if (ret < 0)
                goto out;

        leaf = path->nodes[0];
        nritems = btrfs_header_nritems(leaf);

        while (1) {
                /* filesystem is closing: stop and add no more free space */
                if (btrfs_fs_closing(fs_info) > 1) {
                        last = (u64)-1;
                        break;
                }

                if (path->slots[0] < nritems) {
                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                } else {
                        /* end of this leaf: find where the next one starts */
                        ret = find_next_key(path, 0, &key);
                        if (ret)
                                break;

                        /*
                         * Give up the CPU / the commit root semaphore if
                         * somebody is waiting, then restart the search from
                         * the key found above.
                         */
                        if (need_resched() ||
                            rwsem_is_contended(&fs_info->commit_root_sem)) {
                                if (wakeup)
                                        caching_ctl->progress = last;
                                btrfs_release_path(path);
                                up_read(&fs_info->commit_root_sem);
                                mutex_unlock(&caching_ctl->mutex);
                                cond_resched();
                                mutex_lock(&caching_ctl->mutex);
                                down_read(&fs_info->commit_root_sem);
                                goto next;
                        }

                        ret = btrfs_next_leaf(extent_root, path);
                        if (ret < 0)
                                goto out;
                        if (ret)
                                break;
                        leaf = path->nodes[0];
                        nritems = btrfs_header_nritems(leaf);
                        continue;
                }

                /* key lies before what was already processed: seek to "last" */
                if (key.objectid < last) {
                        key.objectid = last;
                        key.offset = 0;
                        key.type = BTRFS_EXTENT_ITEM_KEY;

                        if (wakeup)
                                caching_ctl->progress = last;
                        btrfs_release_path(path);
                        goto next;
                }

                /* not inside the block group yet */
                if (key.objectid < block_group->key.objectid) {
                        path->slots[0]++;
                        continue;
                }

                /* walked past the end of the block group */
                if (key.objectid >= block_group->key.objectid +
                    block_group->key.offset)
                        break;

                if (key.type == BTRFS_EXTENT_ITEM_KEY ||
                    key.type == BTRFS_METADATA_ITEM_KEY) {
                        /* the gap [last, key.objectid) is free space */
                        total_found += add_new_free_space(block_group,
                                                          fs_info, last,
                                                          key.objectid);
                        /*
                         * For METADATA_ITEM keys the extent length is taken
                         * to be the tree root's nodesize; for EXTENT_ITEM
                         * keys it is key.offset.
                         */
                        if (key.type == BTRFS_METADATA_ITEM_KEY)
                                last = key.objectid +
                                        fs_info->tree_root->nodesize;
                        else
                                last = key.objectid + key.offset;

                        /* wake waiters once enough new space was found */
                        if (total_found > CACHING_CTL_WAKE_UP) {
                                total_found = 0;
                                if (wakeup)
                                        wake_up(&caching_ctl->wait);
                        }
                }
                path->slots[0]++;
        }
        ret = 0;

        /* everything after the last extent up to the block group end is free */
        total_found += add_new_free_space(block_group, fs_info, last,
                                          block_group->key.objectid +
                                          block_group->key.offset);
        caching_ctl->progress = (u64)-1;

out:
        btrfs_free_path(path);
        return ret;
}
 536
 537 static noinline void caching_thread(struct btrfs_work *work)
 538 {
 539         struct btrfs_block_group_cache *block_group;
 540         struct btrfs_fs_info *fs_info;
 541         struct btrfs_caching_control *caching_ctl;
 542         struct btrfs_root *extent_root;
 543         int ret;
 544
 545         caching_ctl = container_of(work, struct btrfs_caching_control, work);
 546         block_group = caching_ctl->block_group;
 547         fs_info = block_group->fs_info;
 548         extent_root = fs_info->extent_root;
 549
 550         mutex_lock(&caching_ctl->mutex);
 551         down_read(&fs_info->commit_root_sem);
 552
 553         if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
 554                 ret = load_free_space_tree(caching_ctl);
 555         else
 556                 ret = load_extent_tree_free(caching_ctl);
 557
 558         spin_lock(&block_group->lock);
 559         block_group->caching_ctl = NULL;
 560         block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
 561         spin_unlock(&block_group->lock);
 562
 563 #ifdef CONFIG_BTRFS_DEBUG
 564         if (btrfs_should_fragment_free_space(extent_root, block_group)) {
 565                 u64 bytes_used;
 566
 567                 spin_lock(&block_group->space_info->lock);
 568                 spin_lock(&block_group->lock);
 569                 bytes_used = block_group->key.offset -
 570                         btrfs_block_group_used(&block_group->item);
 571                 block_group->space_info->bytes_used += bytes_used >> 1;
 572                 spin_unlock(&block_group->lock);
 573                 spin_unlock(&block_group->space_info->lock);
 574                 fragment_free_space(extent_root, block_group);
 575         }
 576 #endif
 577
 578         caching_ctl->progress = (u64)-1;
 579
 580         up_read(&fs_info->commit_root_sem);
 581         free_excluded_extents(fs_info->extent_root, block_group);
 582         mutex_unlock(&caching_ctl->mutex);
 583
 584         wake_up(&caching_ctl->wait);
 585
 586         put_caching_control(caching_ctl);
 587         btrfs_put_block_group(block_group);
 588 }
 589
/*
 * Start caching the free space of @cache.
 *
 * A caching control is allocated and attached to the block group.  If the
 * BTRFS_MOUNT_SPACE_CACHE option is set, load_free_space_cache() is tried
 * first; a return value of 1 from it is treated as "fully cached" and the
 * function finishes right away.  Otherwise, unless @load_cache_only is set,
 * the caching control is put on fs_info->caching_block_groups and
 * caching_thread() is queued on fs_info->caching_workers.
 *
 * Returns -ENOMEM if the caching control cannot be allocated, 0 if the block
 * group was already cached or being cached, if the space cache was loaded,
 * or if @load_cache_only is set; otherwise the value left in ret by
 * load_free_space_cache() (0 when that path was not taken).
 */
static int cache_block_group(struct btrfs_block_group_cache *cache,
                             int load_cache_only)
{
        DEFINE_WAIT(wait);
        struct btrfs_fs_info *fs_info = cache->fs_info;
        struct btrfs_caching_control *caching_ctl;
        int ret = 0;

        /* allocated up front, before cache->lock is taken */
        caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
        if (!caching_ctl)
                return -ENOMEM;

        INIT_LIST_HEAD(&caching_ctl->list);
        mutex_init(&caching_ctl->mutex);
        init_waitqueue_head(&caching_ctl->wait);
        caching_ctl->block_group = cache;
        caching_ctl->progress = cache->key.objectid;
        atomic_set(&caching_ctl->count, 1);
        btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
                        caching_thread, NULL, NULL);

        spin_lock(&cache->lock);
        /*
         * This should be a rare occasion, but this could happen I think in the
         * case where one thread starts to load the space cache info, and then
         * some other thread starts a transaction commit which tries to do an
         * allocation while the other thread is still loading the space cache
         * info.  The previous loop should have kept us from choosing this block
         * group, but if we've moved to the state where we will wait on caching
         * block groups we need to first check if we're doing a fast load here,
         * so we can wait for it to finish, otherwise we could end up allocating
         * from a block group who's cache gets evicted for one reason or
         * another.
         */
        while (cache->cached == BTRFS_CACHE_FAST) {
                struct btrfs_caching_control *ctl;

                /* hold a reference on the other control while sleeping on it */
                ctl = cache->caching_ctl;
                atomic_inc(&ctl->count);
                prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
                spin_unlock(&cache->lock);

                schedule();

                finish_wait(&ctl->wait, &wait);
                put_caching_control(ctl);
                spin_lock(&cache->lock);
        }

        /* caching already started or finished: our control is not needed */
        if (cache->cached != BTRFS_CACHE_NO) {
                spin_unlock(&cache->lock);
                kfree(caching_ctl);
                return 0;
        }
        WARN_ON(cache->caching_ctl);
        cache->caching_ctl = caching_ctl;
        cache->cached = BTRFS_CACHE_FAST;
        spin_unlock(&cache->lock);

        if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
                mutex_lock(&caching_ctl->mutex);
                ret = load_free_space_cache(fs_info, cache);

                spin_lock(&cache->lock);
                if (ret == 1) {
                        /* space cache loaded: block group is fully cached */
                        cache->caching_ctl = NULL;
                        cache->cached = BTRFS_CACHE_FINISHED;
                        cache->last_byte_to_unpin = (u64)-1;
                        caching_ctl->progress = (u64)-1;
                } else {
                        if (load_cache_only) {
                                /* caller only wanted the fast load: reset */
                                cache->caching_ctl = NULL;
                                cache->cached = BTRFS_CACHE_NO;
                        } else {
                                /* fall back to the queued caching work */
                                cache->cached = BTRFS_CACHE_STARTED;
                                cache->has_caching_ctl = 1;
                        }
                }
                spin_unlock(&cache->lock);
#ifdef CONFIG_BTRFS_DEBUG
                if (ret == 1 &&
                    btrfs_should_fragment_free_space(fs_info->extent_root,
                                                     cache)) {
                        u64 bytes_used;

                        /* account half of the unused bytes as used */
                        spin_lock(&cache->space_info->lock);
                        spin_lock(&cache->lock);
                        bytes_used = cache->key.offset -
                                btrfs_block_group_used(&cache->item);
                        cache->space_info->bytes_used += bytes_used >> 1;
                        spin_unlock(&cache->lock);
                        spin_unlock(&cache->space_info->lock);
                        fragment_free_space(fs_info->extent_root, cache);
                }
#endif
                mutex_unlock(&caching_ctl->mutex);

                /* wake anybody sleeping in the BTRFS_CACHE_FAST loop above */
                wake_up(&caching_ctl->wait);
                if (ret == 1) {
                        put_caching_control(caching_ctl);
                        free_excluded_extents(fs_info->extent_root, cache);
                        return 0;
                }
        } else {
                /*
                 * We're either using the free space tree or no caching at all.
                 * Set cached to the appropriate value and wakeup any waiters.
                 */
                spin_lock(&cache->lock);
                if (load_cache_only) {
                        cache->caching_ctl = NULL;
                        cache->cached = BTRFS_CACHE_NO;
                } else {
                        cache->cached = BTRFS_CACHE_STARTED;
                        cache->has_caching_ctl = 1;
                }
                spin_unlock(&cache->lock);
                wake_up(&caching_ctl->wait);
        }

        /* nothing gets queued in this mode; drop the initial reference */
        if (load_cache_only) {
                put_caching_control(caching_ctl);
                return 0;
        }

        /* the list entry holds its own reference on the caching control */
        down_write(&fs_info->commit_root_sem);
        atomic_inc(&caching_ctl->count);
        list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
        up_write(&fs_info->commit_root_sem);

        /* dropped by caching_thread() via btrfs_put_block_group() */
        btrfs_get_block_group(cache);

        btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);

        return ret;
}
 726
 727 /*
 728  * return the block group that starts at or after bytenr
 729  */
 730 static struct btrfs_block_group_cache *
 731 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
 732 {
 733         struct btrfs_block_group_cache *cache;
 734
 735         cache = block_group_cache_tree_search(info, bytenr, 0);
 736
 737         return cache;
 738 }
 739
 740 /*
 741  * return the block group that contains the given bytenr
 742  */
 743 struct btrfs_block_group_cache *btrfs_lookup_block_group(
 744                                                  struct btrfs_fs_info *info,
 745                                                  u64 bytenr)
 746 {
 747         struct btrfs_block_group_cache *cache;
 748
 749         cache = block_group_cache_tree_search(info, bytenr, 1);
 750
 751         return cache;
 752 }
 753
 754 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 755                                                   u64 flags)
 756 {
 757         struct list_head *head = &info->space_info;
 758         struct btrfs_space_info *found;
 759
 760         flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
 761
 762         rcu_read_lock();
 763         list_for_each_entry_rcu(found, head, list) {
 764                 if (found->flags & flags) {
 765                         rcu_read_unlock();
 766                         return found;
 767                 }
 768         }
 769         rcu_read_unlock();
 770         return NULL;
 771 }
 772
 773 /*
 774  * after adding space to the filesystem, we need to clear the full flags
 775  * on all the space infos.
 776  */
 777 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
 778 {
 779         struct list_head *head = &info->space_info;
 780         struct btrfs_space_info *found;
 781
 782         rcu_read_lock();
 783         list_for_each_entry_rcu(found, head, list)
 784                 found->full = 0;
 785         rcu_read_unlock();
 786 }
 787
 788 /* simple helper to search for an existing data extent at a given offset */
 789 int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len)
 790 {
 791         int ret;
 792         struct btrfs_key key;
 793         struct btrfs_path *path;
 794
 795         path = btrfs_alloc_path();
 796         if (!path)
 797                 return -ENOMEM;
 798
 799         key.objectid = start;
 800         key.offset = len;
 801         key.type = BTRFS_EXTENT_ITEM_KEY;
 802         ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
 803                                 0, 0);
 804         btrfs_free_path(path);
 805         return ret;
 806 }
 807
 808 /*
 809  * helper function to lookup reference count and flags of a tree block.
 810  *
 811  * the head node for delayed ref is used to store the sum of all the
 812  * reference count modifications queued up in the rbtree. the head
 813  * node may also store the extent flags to set. This way you can check
 814  * to see what the reference count and extent flags would be if all of
 815  * the delayed refs are not processed.
 816  */
 817 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 818                              struct btrfs_root *root, u64 bytenr,
 819                              u64 offset, int metadata, u64 *refs, u64 *flags)
 820 {
 821         struct btrfs_delayed_ref_head *head;
 822         struct btrfs_delayed_ref_root *delayed_refs;
 823         struct btrfs_path *path;
 824         struct btrfs_extent_item *ei;
 825         struct extent_buffer *leaf;
 826         struct btrfs_key key;
 827         u32 item_size;
 828         u64 num_refs;
 829         u64 extent_flags;
 830         int ret;
 831
 832         /*
 833          * If we don't have skinny metadata, don't bother doing anything
 834          * different
 835          */
 836         if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
 837                 offset = root->nodesize;
 838                 metadata = 0;
 839         }
 840
 841         path = btrfs_alloc_path();
 842         if (!path)
 843                 return -ENOMEM;
 844
 845         if (!trans) {
 846                 path->skip_locking = 1;
 847                 path->search_commit_root = 1;
 848         }
 849
 850 search_again:
 851         key.objectid = bytenr;
 852         key.offset = offset;
 853         if (metadata)
 854                 key.type = BTRFS_METADATA_ITEM_KEY;
 855         else
 856                 key.type = BTRFS_EXTENT_ITEM_KEY;
 857
 858         ret = btrfs_search_slot(trans, root->fs_info->extent_root,
 859                                 &key, path, 0, 0);
 860         if (ret < 0)
 861                 goto out_free;
 862
 863         if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
 864                 if (path->slots[0]) {
 865                         path->slots[0]--;
 866                         btrfs_item_key_to_cpu(path->nodes[0], &key,
 867                                               path->slots[0]);
 868                         if (key.objectid == bytenr &&
 869                             key.type == BTRFS_EXTENT_ITEM_KEY &&
 870                             key.offset == root->nodesize)
 871                                 ret = 0;
 872                 }
 873         }
 874
 875         if (ret == 0) {
 876                 leaf = path->nodes[0];
 877                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 878                 if (item_size >= sizeof(*ei)) {
 879                         ei = btrfs_item_ptr(leaf, path->slots[0],
 880                                             struct btrfs_extent_item);
 881                         num_refs = btrfs_extent_refs(leaf, ei);
 882                         extent_flags = btrfs_extent_flags(leaf, ei);
 883                 } else {
 884 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 885                         struct btrfs_extent_item_v0 *ei0;
 886                         BUG_ON(item_size != sizeof(*ei0));
 887                         ei0 = btrfs_item_ptr(leaf, path->slots[0],
 888                                              struct btrfs_extent_item_v0);
 889                         num_refs = btrfs_extent_refs_v0(leaf, ei0);
 890                         /* FIXME: this isn't correct for data */
 891                         extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
 892 #else
 893                         BUG();
 894 #endif
 895                 }
 896                 BUG_ON(num_refs == 0);
 897         } else {
 898                 num_refs = 0;
 899                 extent_flags = 0;
 900                 ret = 0;
 901         }
 902
 903         if (!trans)
 904                 goto out;
 905
 906         delayed_refs = &trans->transaction->delayed_refs;
 907         spin_lock(&delayed_refs->lock);
 908         head = btrfs_find_delayed_ref_head(trans, bytenr);
 909         if (head) {
 910                 if (!mutex_trylock(&head->mutex)) {
 911                         atomic_inc(&head->node.refs);
 912                         spin_unlock(&delayed_refs->lock);
 913
 914                         btrfs_release_path(path);
 915
 916                         /*
 917                          * Mutex was contended, block until it's released and try
 918                          * again
 919                          */
 920                         mutex_lock(&head->mutex);
 921                         mutex_unlock(&head->mutex);
 922                         btrfs_put_delayed_ref(&head->node);
 923                         goto search_again;
 924                 }
 925                 spin_lock(&head->lock);
 926                 if (head->extent_op && head->extent_op->update_flags)
 927                         extent_flags |= head->extent_op->flags_to_set;
 928                 else
 929                         BUG_ON(num_refs == 0);
 930
 931                 num_refs += head->node.ref_mod;
 932                 spin_unlock(&head->lock);
 933                 mutex_unlock(&head->mutex);
 934         }
 935         spin_unlock(&delayed_refs->lock);
 936 out:
 937         WARN_ON(num_refs == 0);
 938         if (refs)
 939                 *refs = num_refs;
 940         if (flags)
 941                 *flags = extent_flags;
 942 out_free:
 943         btrfs_free_path(path);
 944         return ret;
 945 }
 946
 947 /*
 948  * Back reference rules.  Back refs have three main goals:
 949  *
 950  * 1) differentiate between all holders of references to an extent so that
 951  *    when a reference is dropped we can make sure it was a valid reference
 952  *    before freeing the extent.
 953  *
 954  * 2) Provide enough information to quickly find the holders of an extent
 955  *    if we notice a given block is corrupted or bad.
 956  *
 957  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 958  *    maintenance.  This is actually the same as #2, but with a slightly
 959  *    different use case.
 960  *
 * There are two kinds of back refs. Implicit back refs are optimized
 * for pointers in non-shared tree blocks. For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key. This information allows us to find the block by
 * b-tree searching. Full back refs are for pointers in tree blocks not
 * referenced by their owner trees. The location of the tree block is
 * recorded in the back ref. Full back refs are actually generic and can
 * be used in every case where implicit back refs are used. The major
 * shortcoming of full back refs is their overhead: every time a tree
 * block gets COWed, we have to update the back ref entries for all
 * pointers in it.
 971  *
 972  * For a newly allocated tree block, we use implicit back refs for
 973  * pointers in it. This means most tree related operations only involve
 974  * implicit back refs. For a tree block created in old transaction, the
 975  * only way to drop a reference to it is COW it. So we can detect the
 976  * event that tree block loses its owner tree's reference and do the
 977  * back refs conversion.
 978  *
 979  * When a tree block is COWed through a tree, there are four cases:
 980  *
 981  * The reference count of the block is one and the tree is the block's
 982  * owner tree. Nothing to do in this case.
 983  *
 984  * The reference count of the block is one and the tree is not the
 985  * block's owner tree. In this case, full back refs is used for pointers
 * in the block. Remove these full back refs and add implicit back refs
 * for every pointer in the new block.
 988  *
 989  * The reference count of the block is greater than one and the tree is
 990  * the block's owner tree. In this case, implicit back refs is used for
 * pointers in the block. Add full back refs for every pointer in the
 992  * block, increase lower level extents' reference counts. The original
 993  * implicit back refs are entailed to the new block.
 994  *
 995  * The reference count of the block is greater than one and the tree is
 996  * not the block's owner tree. Add implicit back refs for every pointer in
 997  * the new block, increase lower level extents' reference count.
 998  *
 999  * Back Reference Key composing:
1000  *
1001  * The key objectid corresponds to the first byte in the extent,
1002  * The key type is used to differentiate between types of back refs.
1003  * There are different meanings of the key offset for different types
1004  * of back refs.
1005  *
1006  * File extents can be referenced by:
1007  *
1008  * - multiple snapshots, subvolumes, or different generations in one subvol
1009  * - different files inside a single subvolume
1010  * - different offsets inside a file (bookend extents in file.c)
1011  *
1012  * The extent ref structure for the implicit back refs has fields for:
1013  *
1014  * - Objectid of the subvolume root
1015  * - objectid of the file holding the reference
1016  * - original offset in the file
1017  * - how many bookend extents
1018  *
1019  * The key offset for the implicit back refs is hash of the first
1020  * three fields.
1021  *
1022  * The extent ref structure for the full back refs has field for:
1023  *
1024  * - number of pointers in the tree leaf
1025  *
 * The key offset for the full back refs is the first byte of
 * the tree leaf
1028  *
 * When a file extent is allocated, the implicit back ref is used and
 * its fields are filled in:
1031  *
1032  *     (root_key.objectid, inode objectid, offset in file, 1)
1033  *
 * When a file extent is removed by file truncation, we find the
 * corresponding implicit back refs and check the following fields:
1036  *
1037  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
1038  *
1039  * Btree extents can be referenced by:
1040  *
1041  * - Different subvolumes
1042  *
1043  * Both the implicit back refs and the full back refs for tree blocks
1044  * only consist of key. The key offset for the implicit back refs is
1045  * objectid of block's owner tree. The key offset for the full back refs
1046  * is the first byte of parent block.
1047  *
 * When implicit back refs are used, information about the lowest key and
 * level of the tree block is required. This information is stored in
 * the tree block info structure.
1051  */
1052
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
/*
 * Rewrite the v0 extent item that @path points at as a struct
 * btrfs_extent_item, in place.
 *
 * The ref count is carried over from the v0 item.  If @owner is (u64)-1 it
 * is read from the first BTRFS_EXTENT_REF_V0_KEY item found by scanning
 * forward from the current slot.  An owner below BTRFS_FIRST_FREE_OBJECTID
 * produces a tree block item (with a zeroed tree block info whose level is
 * set from @owner); anything else produces a data extent item.
 *
 * @extra_size is added to the space requested from btrfs_search_slot() on
 * top of the growth of the item itself.
 *
 * Returns 0 on success or the negative error from btrfs_next_leaf() /
 * btrfs_search_slot().  On the early error returns the path is not
 * released here.
 */
static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root,
                                  struct btrfs_path *path,
                                  u64 owner, u32 extra_size)
{
        struct btrfs_tree_block_info *bi;
        struct btrfs_extent_item_v0 *ei0;
        struct btrfs_extent_ref_v0 *ref0;
        struct btrfs_extent_item *item;
        struct extent_buffer *leaf;
        struct btrfs_key found_key;
        struct btrfs_key key;
        u32 new_size;
        u64 refs;
        int ret;

        leaf = path->nodes[0];
        BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));

        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
        ei0 = btrfs_item_ptr(leaf, path->slots[0],
                             struct btrfs_extent_item_v0);
        refs = btrfs_extent_refs_v0(leaf, ei0);

        if (owner == (u64)-1) {
                /* Owner unknown: take it from the first v0 ref that follows. */
                for (;;) {
                        if (path->slots[0] >= btrfs_header_nritems(leaf)) {
                                ret = btrfs_next_leaf(root, path);
                                if (ret < 0)
                                        return ret;
                                BUG_ON(ret > 0); /* Corruption */
                                leaf = path->nodes[0];
                        }
                        btrfs_item_key_to_cpu(leaf, &found_key,
                                              path->slots[0]);
                        BUG_ON(key.objectid != found_key.objectid);
                        if (found_key.type == BTRFS_EXTENT_REF_V0_KEY) {
                                ref0 = btrfs_item_ptr(leaf, path->slots[0],
                                                struct btrfs_extent_ref_v0);
                                owner = btrfs_ref_objectid_v0(leaf, ref0);
                                break;
                        }
                        path->slots[0]++;
                }
        }
        btrfs_release_path(path);

        /* new_size ends up as the number of bytes the item has to grow by. */
        new_size = sizeof(*item);
        if (owner < BTRFS_FIRST_FREE_OBJECTID)
                new_size += sizeof(*bi);
        new_size -= sizeof(*ei0);

        ret = btrfs_search_slot(trans, root, &key, path,
                                new_size + extra_size, 1);
        if (ret < 0)
                return ret;
        BUG_ON(ret); /* Corruption */

        btrfs_extend_item(root, path, new_size);

        leaf = path->nodes[0];
        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        btrfs_set_extent_refs(leaf, item, refs);
        /* FIXME: get real generation */
        btrfs_set_extent_generation(leaf, item, 0);
        if (owner >= BTRFS_FIRST_FREE_OBJECTID) {
                btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
        } else {
                btrfs_set_extent_flags(leaf, item,
                                       BTRFS_EXTENT_FLAG_TREE_BLOCK |
                                       BTRFS_BLOCK_FLAG_FULL_BACKREF);
                bi = (struct btrfs_tree_block_info *)(item + 1);
                /* FIXME: get first key of the block */
                memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
                btrfs_set_tree_block_level(leaf, bi, (int)owner);
        }
        btrfs_mark_buffer_dirty(leaf);
        return 0;
}
#endif
1134
1135 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1136 {
1137         u32 high_crc = ~(u32)0;
1138         u32 low_crc = ~(u32)0;
1139         __le64 lenum;
1140
1141         lenum = cpu_to_le64(root_objectid);
1142         high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
1143         lenum = cpu_to_le64(owner);
1144         low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1145         lenum = cpu_to_le64(offset);
1146         low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1147
1148         return ((u64)high_crc << 31) ^ (u64)low_crc;
1149 }
1150
1151 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1152                                      struct btrfs_extent_data_ref *ref)
1153 {
1154         return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1155                                     btrfs_extent_data_ref_objectid(leaf, ref),
1156                                     btrfs_extent_data_ref_offset(leaf, ref));
1157 }
1158
1159 static int match_extent_data_ref(struct extent_buffer *leaf,
1160                                  struct btrfs_extent_data_ref *ref,
1161                                  u64 root_objectid, u64 owner, u64 offset)
1162 {
1163         if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1164             btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1165             btrfs_extent_data_ref_offset(leaf, ref) != offset)
1166                 return 0;
1167         return 1;
1168 }
1169
1170 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1171                                            struct btrfs_root *root,
1172                                            struct btrfs_path *path,
1173                                            u64 bytenr, u64 parent,
1174                                            u64 root_objectid,
1175                                            u64 owner, u64 offset)
1176 {
1177         struct btrfs_key key;
1178         struct btrfs_extent_data_ref *ref;
1179         struct extent_buffer *leaf;
1180         u32 nritems;
1181         int ret;
1182         int recow;
1183         int err = -ENOENT;
1184
1185         key.objectid = bytenr;
1186         if (parent) {
1187                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1188                 key.offset = parent;
1189         } else {
1190                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1191                 key.offset = hash_extent_data_ref(root_objectid,
1192                                                   owner, offset);
1193         }
1194 again:
1195         recow = 0;
1196         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1197         if (ret < 0) {
1198                 err = ret;
1199                 goto fail;
1200         }
1201
1202         if (parent) {
1203                 if (!ret)
1204                         return 0;
1205 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1206                 key.type = BTRFS_EXTENT_REF_V0_KEY;
1207                 btrfs_release_path(path);
1208                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1209                 if (ret < 0) {
1210                         err = ret;
1211                         goto fail;
1212                 }
1213                 if (!ret)
1214                         return 0;
1215 #endif
1216                 goto fail;
1217         }
1218
1219         leaf = path->nodes[0];
1220         nritems = btrfs_header_nritems(leaf);
1221         while (1) {
1222                 if (path->slots[0] >= nritems) {
1223                         ret = btrfs_next_leaf(root, path);
1224                         if (ret < 0)
1225                                 err = ret;
1226                         if (ret)
1227                                 goto fail;
1228
1229                         leaf = path->nodes[0];
1230                         nritems = btrfs_header_nritems(leaf);
1231                         recow = 1;
1232                 }
1233
1234                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1235                 if (key.objectid != bytenr ||
1236                     key.type != BTRFS_EXTENT_DATA_REF_KEY)
1237                         goto fail;
1238
1239                 ref = btrfs_item_ptr(leaf, path->slots[0],
1240                                      struct btrfs_extent_data_ref);
1241
1242                 if (match_extent_data_ref(leaf, ref, root_objectid,
1243                                           owner, offset)) {
1244                         if (recow) {
1245                                 btrfs_release_path(path);
1246                                 goto again;
1247                         }
1248                         err = 0;
1249                         break;
1250                 }
1251                 path->slots[0]++;
1252         }
1253 fail:
1254         return err;
1255 }
1256
1257 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1258                                            struct btrfs_root *root,
1259                                            struct btrfs_path *path,
1260                                            u64 bytenr, u64 parent,
1261                                            u64 root_objectid, u64 owner,
1262                                            u64 offset, int refs_to_add)
1263 {
1264         struct btrfs_key key;
1265         struct extent_buffer *leaf;
1266         u32 size;
1267         u32 num_refs;
1268         int ret;
1269
1270         key.objectid = bytenr;
1271         if (parent) {
1272                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1273                 key.offset = parent;
1274                 size = sizeof(struct btrfs_shared_data_ref);
1275         } else {
1276                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1277                 key.offset = hash_extent_data_ref(root_objectid,
1278                                                   owner, offset);
1279                 size = sizeof(struct btrfs_extent_data_ref);
1280         }
1281
1282         ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1283         if (ret && ret != -EEXIST)
1284                 goto fail;
1285
1286         leaf = path->nodes[0];
1287         if (parent) {
1288                 struct btrfs_shared_data_ref *ref;
1289                 ref = btrfs_item_ptr(leaf, path->slots[0],
1290                                      struct btrfs_shared_data_ref);
1291                 if (ret == 0) {
1292                         btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1293                 } else {
1294                         num_refs = btrfs_shared_data_ref_count(leaf, ref);
1295                         num_refs += refs_to_add;
1296                         btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1297                 }
1298         } else {
1299                 struct btrfs_extent_data_ref *ref;
1300                 while (ret == -EEXIST) {
1301                         ref = btrfs_item_ptr(leaf, path->slots[0],
1302                                              struct btrfs_extent_data_ref);
1303                         if (match_extent_data_ref(leaf, ref, root_objectid,
1304                                                   owner, offset))
1305                                 break;
1306                         btrfs_release_path(path);
1307                         key.offset++;
1308                         ret = btrfs_insert_empty_item(trans, root, path, &key,
1309                                                       size);
1310                         if (ret && ret != -EEXIST)
1311                                 goto fail;
1312
1313                         leaf = path->nodes[0];
1314                 }
1315                 ref = btrfs_item_ptr(leaf, path->slots[0],
1316                                      struct btrfs_extent_data_ref);
1317                 if (ret == 0) {
1318                         btrfs_set_extent_data_ref_root(leaf, ref,
1319                                                        root_objectid);
1320                         btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1321                         btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1322                         btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1323                 } else {
1324                         num_refs = btrfs_extent_data_ref_count(leaf, ref);
1325                         num_refs += refs_to_add;
1326                         btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1327                 }
1328         }
1329         btrfs_mark_buffer_dirty(leaf);
1330         ret = 0;
1331 fail:
1332         btrfs_release_path(path);
1333         return ret;
1334 }
1335
1336 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1337                                            struct btrfs_root *root,
1338                                            struct btrfs_path *path,
1339                                            int refs_to_drop, int *last_ref)
1340 {
1341         struct btrfs_key key;
1342         struct btrfs_extent_data_ref *ref1 = NULL;
1343         struct btrfs_shared_data_ref *ref2 = NULL;
1344         struct extent_buffer *leaf;
1345         u32 num_refs = 0;
1346         int ret = 0;
1347
1348         leaf = path->nodes[0];
1349         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1350
1351         if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1352                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1353                                       struct btrfs_extent_data_ref);
1354                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1355         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1356                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1357                                       struct btrfs_shared_data_ref);
1358                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1359 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1360         } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1361                 struct btrfs_extent_ref_v0 *ref0;
1362                 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1363                                       struct btrfs_extent_ref_v0);
1364                 num_refs = btrfs_ref_count_v0(leaf, ref0);
1365 #endif
1366         } else {
1367                 BUG();
1368         }
1369
1370         BUG_ON(num_refs < refs_to_drop);
1371         num_refs -= refs_to_drop;
1372
1373         if (num_refs == 0) {
1374                 ret = btrfs_del_item(trans, root, path);
1375                 *last_ref = 1;
1376         } else {
1377                 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1378                         btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1379                 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1380                         btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1381 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1382                 else {
1383                         struct btrfs_extent_ref_v0 *ref0;
1384                         ref0 = btrfs_item_ptr(leaf, path->slots[0],
1385                                         struct btrfs_extent_ref_v0);
1386                         btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1387                 }
1388 #endif
1389                 btrfs_mark_buffer_dirty(leaf);
1390         }
1391         return ret;
1392 }
1393
1394 static noinline u32 extent_data_ref_count(struct btrfs_path *path,
1395                                           struct btrfs_extent_inline_ref *iref)
1396 {
1397         struct btrfs_key key;
1398         struct extent_buffer *leaf;
1399         struct btrfs_extent_data_ref *ref1;
1400         struct btrfs_shared_data_ref *ref2;
1401         u32 num_refs = 0;
1402
1403         leaf = path->nodes[0];
1404         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1405         if (iref) {
1406                 if (btrfs_extent_inline_ref_type(leaf, iref) ==
1407                     BTRFS_EXTENT_DATA_REF_KEY) {
1408                         ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1409                         num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1410                 } else {
1411                         ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1412                         num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1413                 }
1414         } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1415                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1416                                       struct btrfs_extent_data_ref);
1417                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1418         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1419                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1420                                       struct btrfs_shared_data_ref);
1421                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1422 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1423         } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1424                 struct btrfs_extent_ref_v0 *ref0;
1425                 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1426                                       struct btrfs_extent_ref_v0);
1427                 num_refs = btrfs_ref_count_v0(leaf, ref0);
1428 #endif
1429         } else {
1430                 WARN_ON(1);
1431         }
1432         return num_refs;
1433 }
1434
1435 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1436                                           struct btrfs_root *root,
1437                                           struct btrfs_path *path,
1438                                           u64 bytenr, u64 parent,
1439                                           u64 root_objectid)
1440 {
1441         struct btrfs_key key;
1442         int ret;
1443
1444         key.objectid = bytenr;
1445         if (parent) {
1446                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1447                 key.offset = parent;
1448         } else {
1449                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1450                 key.offset = root_objectid;
1451         }
1452
1453         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1454         if (ret > 0)
1455                 ret = -ENOENT;
1456 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1457         if (ret == -ENOENT && parent) {
1458                 btrfs_release_path(path);
1459                 key.type = BTRFS_EXTENT_REF_V0_KEY;
1460                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1461                 if (ret > 0)
1462                         ret = -ENOENT;
1463         }
1464 #endif
1465         return ret;
1466 }
1467
1468 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1469                                           struct btrfs_root *root,
1470                                           struct btrfs_path *path,
1471                                           u64 bytenr, u64 parent,
1472                                           u64 root_objectid)
1473 {
1474         struct btrfs_key key;
1475         int ret;
1476
1477         key.objectid = bytenr;
1478         if (parent) {
1479                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1480                 key.offset = parent;
1481         } else {
1482                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1483                 key.offset = root_objectid;
1484         }
1485
1486         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1487         btrfs_release_path(path);
1488         return ret;
1489 }
1490
1491 static inline int extent_ref_type(u64 parent, u64 owner)
1492 {
1493         int type;
1494         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1495                 if (parent > 0)
1496                         type = BTRFS_SHARED_BLOCK_REF_KEY;
1497                 else
1498                         type = BTRFS_TREE_BLOCK_REF_KEY;
1499         } else {
1500                 if (parent > 0)
1501                         type = BTRFS_SHARED_DATA_REF_KEY;
1502                 else
1503                         type = BTRFS_EXTENT_DATA_REF_KEY;
1504         }
1505         return type;
1506 }
1507
1508 static int find_next_key(struct btrfs_path *path, int level,
1509                          struct btrfs_key *key)
1510
1511 {
1512         for (; level < BTRFS_MAX_LEVEL; level++) {
1513                 if (!path->nodes[level])
1514                         break;
1515                 if (path->slots[level] + 1 >=
1516                     btrfs_header_nritems(path->nodes[level]))
1517                         continue;
1518                 if (level == 0)
1519                         btrfs_item_key_to_cpu(path->nodes[level], key,
1520                                               path->slots[level] + 1);
1521                 else
1522                         btrfs_node_key_to_cpu(path->nodes[level], key,
1523                                               path->slots[level] + 1);
1524                 return 0;
1525         }
1526         return 1;
1527 }
1528
/*
 * Look for an inline back ref inside the extent item of @bytenr.
 *
 * If the back ref is found, *ref_ret is set to the address of the inline
 * back ref, and 0 is returned.
 *
 * If the extent item exists but the back ref isn't found, *ref_ret is set
 * to the address where it should be inserted, and -ENOENT is returned.
 *
 * If the extent item itself isn't found, -ENOENT is returned when insert
 * is false and -EIO (after a WARN) when insert is true.  With
 * BTRFS_COMPAT_EXTENT_TREE_V0, an undersized (v0) item also yields -ENOENT
 * when insert is false.  *ref_ret is left untouched in these cases.
 *
 * If insert is true and there are too many inline back refs, or the next
 * key in the tree belongs to the same bytenr with a type below
 * BTRFS_BLOCK_GROUP_ITEM_KEY, the path points to the extent item and
 * -EAGAIN is returned.  *ref_ret is not set in that case either.
 *
 * Negative errors from btrfs_search_slot() (and convert_extent_item_v0()
 * when BTRFS_COMPAT_EXTENT_TREE_V0 is defined) are returned unchanged.
 *
 * When insert is true, path->keep_locks is set during the lookup; before
 * returning it is cleared again and btrfs_unlock_up_safe(path, 1) is
 * called.
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 *       items in the tree are ordered.
 */
static noinline_for_stack
int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct btrfs_path *path,
                                 struct btrfs_extent_inline_ref **ref_ret,
                                 u64 bytenr, u64 num_bytes,
                                 u64 parent, u64 root_objectid,
                                 u64 owner, u64 offset, int insert)
{
        struct btrfs_key key;
        struct extent_buffer *leaf;
        struct btrfs_extent_item *ei;
        struct btrfs_extent_inline_ref *iref;
        u64 flags;
        u64 item_size;
        unsigned long ptr;
        unsigned long end;
        int extra_size;
        int type;
        int want;
        int ret;
        int err = 0;
        bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
                                                 SKINNY_METADATA);

        key.objectid = bytenr;
        key.type = BTRFS_EXTENT_ITEM_KEY;
        key.offset = num_bytes;

        /*
         * extra_size is handed to btrfs_search_slot() below: the size of the
         * inline ref we may add when inserting, -1 for a pure lookup.
         */
        want = extent_ref_type(parent, owner);
        if (insert) {
                extra_size = btrfs_extent_inline_ref_size(want);
                path->keep_locks = 1;
        } else
                extra_size = -1;

        /*
         * With skinny metadata, tree blocks (owner below
         * BTRFS_FIRST_FREE_OBJECTID) are searched as METADATA_ITEM keys
         * whose offset is owner rather than num_bytes.
         * NOTE(review): the earlier wording spoke of adding one to a parent
         * level, but owner is used unchanged here; presumably owner already
         * is the level of the block -- confirm against the callers.
         */
        if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
                key.type = BTRFS_METADATA_ITEM_KEY;
                key.offset = owner;
        }

again:
        ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
        if (ret < 0) {
                err = ret;
                goto out;
        }

        /*
         * We may be a newly converted file system which still has the old fat
         * extent entries for metadata, so try and see if we have one of those.
         * First peek at the previous slot; if that is not the matching
         * EXTENT_ITEM, redo the search with the EXTENT_ITEM key.
         * skinny_metadata is cleared first, so this fallback runs only once.
         */
        if (ret > 0 && skinny_metadata) {
                skinny_metadata = false;
                if (path->slots[0]) {
                        path->slots[0]--;
                        btrfs_item_key_to_cpu(path->nodes[0], &key,
                                              path->slots[0]);
                        if (key.objectid == bytenr &&
                            key.type == BTRFS_EXTENT_ITEM_KEY &&
                            key.offset == num_bytes)
                                ret = 0;
                }
                if (ret) {
                        key.objectid = bytenr;
                        key.type = BTRFS_EXTENT_ITEM_KEY;
                        key.offset = num_bytes;
                        btrfs_release_path(path);
                        goto again;
                }
        }

        /*
         * No extent item: a plain lookup reports -ENOENT, while an insert
         * is treated as unexpected (WARN) and fails with -EIO.
         */
        if (ret && !insert) {
                err = -ENOENT;
                goto out;
        } else if (WARN_ON(ret)) {
                err = -EIO;
                goto out;
        }

        leaf = path->nodes[0];
        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
        /*
         * An item smaller than struct btrfs_extent_item is a v0 item: it is
         * converted when inserting, and reported as -ENOENT on lookup.
         */
        if (item_size < sizeof(*ei)) {
                if (!insert) {
                        err = -ENOENT;
                        goto out;
                }
                ret = convert_extent_item_v0(trans, root, path, owner,
                                             extra_size);
                if (ret < 0) {
                        err = ret;
                        goto out;
                }
                /* Re-read the item size after the conversion. */
                leaf = path->nodes[0];
                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
        }
#endif
        BUG_ON(item_size < sizeof(*ei));

        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        flags = btrfs_extent_flags(leaf, ei);

        /* Inline refs start right after the extent item header... */
        ptr = (unsigned long)(ei + 1);
        end = (unsigned long)ei + item_size;

        /* ...or after the tree block info in a non-skinny tree block item. */
        if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
                ptr += sizeof(struct btrfs_tree_block_info);
                BUG_ON(ptr > end);
        }

        /*
         * Walk the inline refs.  Entries with a smaller type than wanted are
         * skipped; the walk stops at the end of the item, at the first entry
         * with a larger type, on a match (err = 0), or at the first
         * same-type entry that compares below the wanted ref.  In every case
         * ptr is left at the position reported through *ref_ret.
         */
        err = -ENOENT;
        while (1) {
                if (ptr >= end) {
                        WARN_ON(ptr > end);
                        break;
                }
                iref = (struct btrfs_extent_inline_ref *)ptr;
                type = btrfs_extent_inline_ref_type(leaf, iref);
                if (want < type)
                        break;
                if (want > type) {
                        ptr += btrfs_extent_inline_ref_size(type);
                        continue;
                }

                if (type == BTRFS_EXTENT_DATA_REF_KEY) {
                        struct btrfs_extent_data_ref *dref;
                        /* The data ref is stored in place of the offset. */
                        dref = (struct btrfs_extent_data_ref *)(&iref->offset);
                        if (match_extent_data_ref(leaf, dref, root_objectid,
                                                  owner, offset)) {
                                err = 0;
                                break;
                        }
                        /* Same-type data refs are compared by their hash. */
                        if (hash_extent_data_ref_item(leaf, dref) <
                            hash_extent_data_ref(root_objectid, owner, offset))
                                break;
                } else {
                        u64 ref_offset;
                        ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
                        /*
                         * Shared refs are matched on parent, all other refs
                         * on root_objectid.
                         */
                        if (parent > 0) {
                                if (parent == ref_offset) {
                                        err = 0;
                                        break;
                                }
                                if (ref_offset < parent)
                                        break;
                        } else {
                                if (root_objectid == ref_offset) {
                                        err = 0;
                                        break;
                                }
                                if (ref_offset < root_objectid)
                                        break;
                        }
                }
                ptr += btrfs_extent_inline_ref_size(type);
        }
        if (err == -ENOENT && insert) {
                /* Adding the new ref would reach the item size limit. */
                if (item_size + extra_size >=
                    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
                        err = -EAGAIN;
                        goto out;
                }
                /*
                 * To add new inline back ref, we have to make sure
                 * there is no corresponding back ref item.
                 * For simplicity, we just do not add new inline back
                 * ref if there is any kind of item for this block
                 */
                if (find_next_key(path, 0, &key) == 0 &&
                    key.objectid == bytenr &&
                    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
                        err = -EAGAIN;
                        goto out;
                }
        }
        *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out:
        /* Undo the keep_locks setting made for the insert case. */
        if (insert) {
                path->keep_locks = 0;
                btrfs_unlock_up_safe(path, 1);
        }
        return err;
}
1731
1732 /*
1733  * helper to add new inline back ref
1734  */
1735 static noinline_for_stack
1736 void setup_inline_extent_backref(struct btrfs_root *root,
1737                                  struct btrfs_path *path,
1738                                  struct btrfs_extent_inline_ref *iref,
1739                                  u64 parent, u64 root_objectid,
1740                                  u64 owner, u64 offset, int refs_to_add,
1741                                  struct btrfs_delayed_extent_op *extent_op)
1742 {
1743         struct extent_buffer *leaf;
1744         struct btrfs_extent_item *ei;
1745         unsigned long ptr;
1746         unsigned long end;
1747         unsigned long item_offset;
1748         u64 refs;
1749         int size;
1750         int type;
1751
1752         leaf = path->nodes[0];
1753         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1754         item_offset = (unsigned long)iref - (unsigned long)ei;
1755
1756         type = extent_ref_type(parent, owner);
1757         size = btrfs_extent_inline_ref_size(type);
1758
1759         btrfs_extend_item(root, path, size);
1760
1761         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1762         refs = btrfs_extent_refs(leaf, ei);
1763         refs += refs_to_add;
1764         btrfs_set_extent_refs(leaf, ei, refs);
1765         if (extent_op)
1766                 __run_delayed_extent_op(extent_op, leaf, ei);
1767
1768         ptr = (unsigned long)ei + item_offset;
1769         end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1770         if (ptr < end - size)
1771                 memmove_extent_buffer(leaf, ptr + size, ptr,
1772                                       end - size - ptr);
1773
1774         iref = (struct btrfs_extent_inline_ref *)ptr;
1775         btrfs_set_extent_inline_ref_type(leaf, iref, type);
1776         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1777                 struct btrfs_extent_data_ref *dref;
1778                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1779                 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1780                 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1781                 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1782                 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1783         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1784                 struct btrfs_shared_data_ref *sref;
1785                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1786                 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1787                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1788         } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1789                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1790         } else {
1791                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1792         }
1793         btrfs_mark_buffer_dirty(leaf);
1794 }
1795
1796 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1797                                  struct btrfs_root *root,
1798                                  struct btrfs_path *path,
1799                                  struct btrfs_extent_inline_ref **ref_ret,
1800                                  u64 bytenr, u64 num_bytes, u64 parent,
1801                                  u64 root_objectid, u64 owner, u64 offset)
1802 {
1803         int ret;
1804
1805         ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1806                                            bytenr, num_bytes, parent,
1807                                            root_objectid, owner, offset, 0);
1808         if (ret != -ENOENT)
1809                 return ret;
1810
1811         btrfs_release_path(path);
1812         *ref_ret = NULL;
1813
1814         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1815                 ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1816                                             root_objectid);
1817         } else {
1818                 ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1819                                              root_objectid, owner, offset);
1820         }
1821         return ret;
1822 }
1823
1824 /*
1825  * helper to update/remove inline back ref
1826  */
1827 static noinline_for_stack
1828 void update_inline_extent_backref(struct btrfs_root *root,
1829                                   struct btrfs_path *path,
1830                                   struct btrfs_extent_inline_ref *iref,
1831                                   int refs_to_mod,
1832                                   struct btrfs_delayed_extent_op *extent_op,
1833                                   int *last_ref)
1834 {
1835         struct extent_buffer *leaf;
1836         struct btrfs_extent_item *ei;
1837         struct btrfs_extent_data_ref *dref = NULL;
1838         struct btrfs_shared_data_ref *sref = NULL;
1839         unsigned long ptr;
1840         unsigned long end;
1841         u32 item_size;
1842         int size;
1843         int type;
1844         u64 refs;
1845
1846         leaf = path->nodes[0];
1847         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1848         refs = btrfs_extent_refs(leaf, ei);
1849         WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1850         refs += refs_to_mod;
1851         btrfs_set_extent_refs(leaf, ei, refs);
1852         if (extent_op)
1853                 __run_delayed_extent_op(extent_op, leaf, ei);
1854
1855         type = btrfs_extent_inline_ref_type(leaf, iref);
1856
1857         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1858                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1859                 refs = btrfs_extent_data_ref_count(leaf, dref);
1860         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1861                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1862                 refs = btrfs_shared_data_ref_count(leaf, sref);
1863         } else {
1864                 refs = 1;
1865                 BUG_ON(refs_to_mod != -1);
1866         }
1867
1868         BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1869         refs += refs_to_mod;
1870
1871         if (refs > 0) {
1872                 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1873                         btrfs_set_extent_data_ref_count(leaf, dref, refs);
1874                 else
1875                         btrfs_set_shared_data_ref_count(leaf, sref, refs);
1876         } else {
1877                 *last_ref = 1;
1878                 size =  btrfs_extent_inline_ref_size(type);
1879                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1880                 ptr = (unsigned long)iref;
1881                 end = (unsigned long)ei + item_size;
1882                 if (ptr + size < end)
1883                         memmove_extent_buffer(leaf, ptr, ptr + size,
1884                                               end - ptr - size);
1885                 item_size -= size;
1886                 btrfs_truncate_item(root, path, item_size, 1);
1887         }
1888         btrfs_mark_buffer_dirty(leaf);
1889 }
1890
1891 static noinline_for_stack
1892 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1893                                  struct btrfs_root *root,
1894                                  struct btrfs_path *path,
1895                                  u64 bytenr, u64 num_bytes, u64 parent,
1896                                  u64 root_objectid, u64 owner,
1897                                  u64 offset, int refs_to_add,
1898                                  struct btrfs_delayed_extent_op *extent_op)
1899 {
1900         struct btrfs_extent_inline_ref *iref;
1901         int ret;
1902
1903         ret = lookup_inline_extent_backref(trans, root, path, &iref,
1904                                            bytenr, num_bytes, parent,
1905                                            root_objectid, owner, offset, 1);
1906         if (ret == 0) {
1907                 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1908                 update_inline_extent_backref(root, path, iref,
1909                                              refs_to_add, extent_op, NULL);
1910         } else if (ret == -ENOENT) {
1911                 setup_inline_extent_backref(root, path, iref, parent,
1912                                             root_objectid, owner, offset,
1913                                             refs_to_add, extent_op);
1914                 ret = 0;
1915         }
1916         return ret;
1917 }
1918
1919 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1920                                  struct btrfs_root *root,
1921                                  struct btrfs_path *path,
1922                                  u64 bytenr, u64 parent, u64 root_objectid,
1923                                  u64 owner, u64 offset, int refs_to_add)
1924 {
1925         int ret;
1926         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1927                 BUG_ON(refs_to_add != 1);
1928                 ret = insert_tree_block_ref(trans, root, path, bytenr,
1929                                             parent, root_objectid);
1930         } else {
1931                 ret = insert_extent_data_ref(trans, root, path, bytenr,
1932                                              parent, root_objectid,
1933                                              owner, offset, refs_to_add);
1934         }
1935         return ret;
1936 }
1937
1938 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1939                                  struct btrfs_root *root,
1940                                  struct btrfs_path *path,
1941                                  struct btrfs_extent_inline_ref *iref,
1942                                  int refs_to_drop, int is_data, int *last_ref)
1943 {
1944         int ret = 0;
1945
1946         BUG_ON(!is_data && refs_to_drop != 1);
1947         if (iref) {
1948                 update_inline_extent_backref(root, path, iref,
1949                                              -refs_to_drop, NULL, last_ref);
1950         } else if (is_data) {
1951                 ret = remove_extent_data_ref(trans, root, path, refs_to_drop,
1952                                              last_ref);
1953         } else {
1954                 *last_ref = 1;
1955                 ret = btrfs_del_item(trans, root, path);
1956         }
1957         return ret;
1958 }
1959
1960 #define in_range(b, first, len)        ((b) >= (first) && (b) < (first) + (len))
1961 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1962                                u64 *discarded_bytes)
1963 {
1964         int j, ret = 0;
1965         u64 bytes_left, end;
1966         u64 aligned_start = ALIGN(start, 1 << 9);
1967
1968         if (WARN_ON(start != aligned_start)) {
1969                 len -= aligned_start - start;
1970                 len = round_down(len, 1 << 9);
1971                 start = aligned_start;
1972         }
1973
1974         *discarded_bytes = 0;
1975
1976         if (!len)
1977                 return 0;
1978
1979         end = start + len;
1980         bytes_left = len;
1981
1982         /* Skip any superblocks on this device. */
1983         for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1984                 u64 sb_start = btrfs_sb_offset(j);
1985                 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
1986                 u64 size = sb_start - start;
1987
1988                 if (!in_range(sb_start, start, bytes_left) &&
1989                     !in_range(sb_end, start, bytes_left) &&
1990                     !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
1991                         continue;
1992
1993                 /*
1994                  * Superblock spans beginning of range.  Adjust start and
1995                  * try again.
1996                  */
1997                 if (sb_start <= start) {
1998                         start += sb_end - start;
1999                         if (start > end) {
2000                                 bytes_left = 0;
2001                                 break;
2002                         }
2003                         bytes_left = end - start;
2004                         continue;
2005                 }
2006
2007                 if (size) {
2008                         ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
2009                                                    GFP_NOFS, 0);
2010                         if (!ret)
2011                                 *discarded_bytes += size;
2012                         else if (ret != -EOPNOTSUPP)
2013                                 return ret;
2014                 }
2015
2016                 start = sb_end;
2017                 if (start > end) {
2018                         bytes_left = 0;
2019                         break;
2020                 }
2021                 bytes_left = end - start;
2022         }
2023
2024         if (bytes_left) {
2025                 ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
2026                                            GFP_NOFS, 0);
2027                 if (!ret)
2028                         *discarded_bytes += bytes_left;
2029         }
2030         return ret;
2031 }
2032
2033 int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
2034                          u64 num_bytes, u64 *actual_bytes)
2035 {
2036         int ret;
2037         u64 discarded_bytes = 0;
2038         struct btrfs_bio *bbio = NULL;
2039
2040
2041         /*
2042          * Avoid races with device replace and make sure our bbio has devices
2043          * associated to its stripes that don't go away while we are discarding.
2044          */
2045         btrfs_bio_counter_inc_blocked(root->fs_info);
2046         /* Tell the block device(s) that the sectors can be discarded */
2047         ret = btrfs_map_block(root->fs_info, REQ_OP_DISCARD,
2048                               bytenr, &num_bytes, &bbio, 0);
2049         /* Error condition is -ENOMEM */
2050         if (!ret) {
2051                 struct btrfs_bio_stripe *stripe = bbio->stripes;
2052                 int i;
2053
2054
2055                 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
2056                         u64 bytes;
2057                         if (!stripe->dev->can_discard)
2058                                 continue;
2059
2060                         ret = btrfs_issue_discard(stripe->dev->bdev,
2061                                                   stripe->physical,
2062                                                   stripe->length,
2063                                                   &bytes);
2064                         if (!ret)
2065                                 discarded_bytes += bytes;
2066                         else if (ret != -EOPNOTSUPP)
2067                                 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
2068
2069                         /*
2070                          * Just in case we get back EOPNOTSUPP for some reason,
2071                          * just ignore the return value so we don't screw up
2072                          * people calling discard_extent.
2073                          */
2074                         ret = 0;
2075                 }
2076                 btrfs_put_bbio(bbio);
2077         }
2078         btrfs_bio_counter_dec(root->fs_info);
2079
2080         if (actual_bytes)
2081                 *actual_bytes = discarded_bytes;
2082
2083
2084         if (ret == -EOPNOTSUPP)
2085                 ret = 0;
2086         return ret;
2087 }
2088
2089 /* Can return -ENOMEM */
2090 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2091                          struct btrfs_root *root,
2092                          u64 bytenr, u64 num_bytes, u64 parent,
2093                          u64 root_objectid, u64 owner, u64 offset)
2094 {
2095         int ret;
2096         struct btrfs_fs_info *fs_info = root->fs_info;
2097
2098         BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
2099                root_objectid == BTRFS_TREE_LOG_OBJECTID);
2100
2101         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
2102                 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
2103                                         num_bytes,
2104                                         parent, root_objectid, (int)owner,
2105                                         BTRFS_ADD_DELAYED_REF, NULL);
2106         } else {
2107                 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
2108                                         num_bytes, parent, root_objectid,
2109                                         owner, offset, 0,
2110                                         BTRFS_ADD_DELAYED_REF, NULL);
2111         }
2112         return ret;
2113 }
2114
2115 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2116                                   struct btrfs_root *root,
2117                                   struct btrfs_delayed_ref_node *node,
2118                                   u64 parent, u64 root_objectid,
2119                                   u64 owner, u64 offset, int refs_to_add,
2120                                   struct btrfs_delayed_extent_op *extent_op)
2121 {
2122         struct btrfs_fs_info *fs_info = root->fs_info;
2123         struct btrfs_path *path;
2124         struct extent_buffer *leaf;
2125         struct btrfs_extent_item *item;
2126         struct btrfs_key key;
2127         u64 bytenr = node->bytenr;
2128         u64 num_bytes = node->num_bytes;
2129         u64 refs;
2130         int ret;
2131
2132         path = btrfs_alloc_path();
2133         if (!path)
2134                 return -ENOMEM;
2135
2136         path->reada = READA_FORWARD;
2137         path->leave_spinning = 1;
2138         /* this will setup the path even if it fails to insert the back ref */
2139         ret = insert_inline_extent_backref(trans, fs_info->extent_root, path,
2140                                            bytenr, num_bytes, parent,
2141                                            root_objectid, owner, offset,
2142                                            refs_to_add, extent_op);
2143         if ((ret < 0 && ret != -EAGAIN) || !ret)
2144                 goto out;
2145
2146         /*
2147          * Ok we had -EAGAIN which means we didn't have space to insert and
2148          * inline extent ref, so just update the reference count and add a
2149          * normal backref.
2150          */
2151         leaf = path->nodes[0];
2152         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2153         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2154         refs = btrfs_extent_refs(leaf, item);
2155         btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
2156         if (extent_op)
2157                 __run_delayed_extent_op(extent_op, leaf, item);
2158
2159         btrfs_mark_buffer_dirty(leaf);
2160         btrfs_release_path(path);
2161
2162         path->reada = READA_FORWARD;
2163         path->leave_spinning = 1;
2164         /* now insert the actual backref */
2165         ret = insert_extent_backref(trans, root->fs_info->extent_root,
2166                                     path, bytenr, parent, root_objectid,
2167                                     owner, offset, refs_to_add);
2168         if (ret)
2169                 btrfs_abort_transaction(trans, ret);
2170 out:
2171         btrfs_free_path(path);
2172         return ret;
2173 }
2174
2175 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2176                                 struct btrfs_root *root,
2177                                 struct btrfs_delayed_ref_node *node,
2178                                 struct btrfs_delayed_extent_op *extent_op,
2179                                 int insert_reserved)
2180 {
2181         int ret = 0;
2182         struct btrfs_delayed_data_ref *ref;
2183         struct btrfs_key ins;
2184         u64 parent = 0;
2185         u64 ref_root = 0;
2186         u64 flags = 0;
2187
2188         ins.objectid = node->bytenr;
2189         ins.offset = node->num_bytes;
2190         ins.type = BTRFS_EXTENT_ITEM_KEY;
2191
2192         ref = btrfs_delayed_node_to_data_ref(node);
2193         trace_run_delayed_data_ref(root->fs_info, node, ref, node->action);
2194
2195         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2196                 parent = ref->parent;
2197         ref_root = ref->root;
2198
2199         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2200                 if (extent_op)
2201                         flags |= extent_op->flags_to_set;
2202                 ret = alloc_reserved_file_extent(trans, root,
2203                                                  parent, ref_root, flags,
2204                                                  ref->objectid, ref->offset,
2205                                                  &ins, node->ref_mod);
2206         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2207                 ret = __btrfs_inc_extent_ref(trans, root, node, parent,
2208                                              ref_root, ref->objectid,
2209                                              ref->offset, node->ref_mod,
2210                                              extent_op);
2211         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2212                 ret = __btrfs_free_extent(trans, root, node, parent,
2213                                           ref_root, ref->objectid,
2214                                           ref->offset, node->ref_mod,
2215                                           extent_op);
2216         } else {
2217                 BUG();
2218         }
2219         return ret;
2220 }
2221
2222 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2223                                     struct extent_buffer *leaf,
2224                                     struct btrfs_extent_item *ei)
2225 {
2226         u64 flags = btrfs_extent_flags(leaf, ei);
2227         if (extent_op->update_flags) {
2228                 flags |= extent_op->flags_to_set;
2229                 btrfs_set_extent_flags(leaf, ei, flags);
2230         }
2231
2232         if (extent_op->update_key) {
2233                 struct btrfs_tree_block_info *bi;
2234                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2235                 bi = (struct btrfs_tree_block_info *)(ei + 1);
2236                 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2237         }
2238 }
2239
2240 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2241                                  struct btrfs_root *root,
2242                                  struct btrfs_delayed_ref_node *node,
2243                                  struct btrfs_delayed_extent_op *extent_op)
2244 {
2245         struct btrfs_key key;
2246         struct btrfs_path *path;
2247         struct btrfs_extent_item *ei;
2248         struct extent_buffer *leaf;
2249         u32 item_size;
2250         int ret;
2251         int err = 0;
2252         int metadata = !extent_op->is_data;
2253
2254         if (trans->aborted)
2255                 return 0;
2256
2257         if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
2258                 metadata = 0;
2259
2260         path = btrfs_alloc_path();
2261         if (!path)
2262                 return -ENOMEM;
2263
2264         key.objectid = node->bytenr;
2265
2266         if (metadata) {
2267                 key.type = BTRFS_METADATA_ITEM_KEY;
2268                 key.offset = extent_op->level;
2269         } else {
2270                 key.type = BTRFS_EXTENT_ITEM_KEY;
2271                 key.offset = node->num_bytes;
2272         }
2273
2274 again:
2275         path->reada = READA_FORWARD;
2276         path->leave_spinning = 1;
2277         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
2278                                 path, 0, 1);
2279         if (ret < 0) {
2280                 err = ret;
2281                 goto out;
2282         }
2283         if (ret > 0) {
2284                 if (metadata) {
2285                         if (path->slots[0] > 0) {
2286                                 path->slots[0]--;
2287                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
2288                                                       path->slots[0]);
2289                                 if (key.objectid == node->bytenr &&
2290                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
2291                                     key.offset == node->num_bytes)
2292                                         ret = 0;
2293                         }
2294                         if (ret > 0) {
2295                                 btrfs_release_path(path);
2296                                 metadata = 0;
2297
2298                                 key.objectid = node->bytenr;
2299                                 key.offset = node->num_bytes;
2300                                 key.type = BTRFS_EXTENT_ITEM_KEY;
2301                                 goto again;
2302                         }
2303                 } else {
2304                         err = -EIO;
2305                         goto out;
2306                 }
2307         }
2308
2309         leaf = path->nodes[0];
2310         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2311 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2312         if (item_size < sizeof(*ei)) {
2313                 ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
2314                                              path, (u64)-1, 0);
2315                 if (ret < 0) {
2316                         err = ret;
2317                         goto out;
2318                 }
2319                 leaf = path->nodes[0];
2320                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2321         }
2322 #endif
2323         BUG_ON(item_size < sizeof(*ei));
2324         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2325         __run_delayed_extent_op(extent_op, leaf, ei);
2326
2327         btrfs_mark_buffer_dirty(leaf);
2328 out:
2329         btrfs_free_path(path);
2330         return err;
2331 }
2332
2333 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2334                                 struct btrfs_root *root,
2335                                 struct btrfs_delayed_ref_node *node,
2336                                 struct btrfs_delayed_extent_op *extent_op,
2337                                 int insert_reserved)
2338 {
2339         int ret = 0;
2340         struct btrfs_delayed_tree_ref *ref;
2341         struct btrfs_key ins;
2342         u64 parent = 0;
2343         u64 ref_root = 0;
2344         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
2345                                                  SKINNY_METADATA);
2346
2347         ref = btrfs_delayed_node_to_tree_ref(node);
2348         trace_run_delayed_tree_ref(root->fs_info, node, ref, node->action);
2349
2350         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2351                 parent = ref->parent;
2352         ref_root = ref->root;
2353
2354         ins.objectid = node->bytenr;
2355         if (skinny_metadata) {
2356                 ins.offset = ref->level;
2357                 ins.type = BTRFS_METADATA_ITEM_KEY;
2358         } else {
2359                 ins.offset = node->num_bytes;
2360                 ins.type = BTRFS_EXTENT_ITEM_KEY;
2361         }
2362
2363         BUG_ON(node->ref_mod != 1);
2364         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2365                 BUG_ON(!extent_op || !extent_op->update_flags);
2366                 ret = alloc_reserved_tree_block(trans, root,
2367                                                 parent, ref_root,
2368                                                 extent_op->flags_to_set,
2369                                                 &extent_op->key,
2370                                                 ref->level, &ins);
2371         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2372                 ret = __btrfs_inc_extent_ref(trans, root, node,
2373                                              parent, ref_root,
2374                                              ref->level, 0, 1,
2375                                              extent_op);
2376         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2377                 ret = __btrfs_free_extent(trans, root, node,
2378                                           parent, ref_root,
2379                                           ref->level, 0, 1, extent_op);
2380         } else {
2381                 BUG();
2382         }
2383         return ret;
2384 }
2385
2386 /* helper function to actually process a single delayed ref entry */
2387 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2388                                struct btrfs_root *root,
2389                                struct btrfs_delayed_ref_node *node,
2390                                struct btrfs_delayed_extent_op *extent_op,
2391                                int insert_reserved)
2392 {
2393         int ret = 0;
2394
2395         if (trans->aborted) {
2396                 if (insert_reserved)
2397                         btrfs_pin_extent(root, node->bytenr,
2398                                          node->num_bytes, 1);
2399                 return 0;
2400         }
2401
2402         if (btrfs_delayed_ref_is_head(node)) {
2403                 struct btrfs_delayed_ref_head *head;
2404                 /*
2405                  * we've hit the end of the chain and we were supposed
2406                  * to insert this extent into the tree.  But, it got
2407                  * deleted before we ever needed to insert it, so all
2408                  * we have to do is clean up the accounting
2409                  */
2410                 BUG_ON(extent_op);
2411                 head = btrfs_delayed_node_to_head(node);
2412                 trace_run_delayed_ref_head(root->fs_info, node, head,
2413                                            node->action);
2414
2415                 if (insert_reserved) {
2416                         btrfs_pin_extent(root, node->bytenr,
2417                                          node->num_bytes, 1);
2418                         if (head->is_data) {
2419                                 ret = btrfs_del_csums(trans, root,
2420                                                       node->bytenr,
2421                                                       node->num_bytes);
2422                         }
2423                 }
2424
2425                 /* Also free its reserved qgroup space */
2426                 btrfs_qgroup_free_delayed_ref(root->fs_info,
2427                                               head->qgroup_ref_root,
2428                                               head->qgroup_reserved);
2429                 return ret;
2430         }
2431
2432         if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2433             node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2434                 ret = run_delayed_tree_ref(trans, root, node, extent_op,
2435                                            insert_reserved);
2436         else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2437                  node->type == BTRFS_SHARED_DATA_REF_KEY)
2438                 ret = run_delayed_data_ref(trans, root, node, extent_op,
2439                                            insert_reserved);
2440         else
2441                 BUG();
2442         return ret;
2443 }
2444
2445 static inline struct btrfs_delayed_ref_node *
2446 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2447 {
2448         struct btrfs_delayed_ref_node *ref;
2449
2450         if (list_empty(&head->ref_list))
2451                 return NULL;
2452
2453         /*
2454          * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
2455          * This is to prevent a ref count from going down to zero, which deletes
2456          * the extent item from the extent tree, when there still are references
2457          * to add, which would fail because they would not find the extent item.
2458          */
2459         list_for_each_entry(ref, &head->ref_list, list) {
2460                 if (ref->action == BTRFS_ADD_DELAYED_REF)
2461                         return ref;
2462         }
2463
2464         return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node,
2465                           list);
2466 }
2467
2468 /*
2469  * Returns 0 on success or if called with an already aborted transaction.
2470  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2471  */
2472 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2473                                              struct btrfs_root *root,
2474                                              unsigned long nr)
2475 {
2476         struct btrfs_delayed_ref_root *delayed_refs;
2477         struct btrfs_delayed_ref_node *ref;
2478         struct btrfs_delayed_ref_head *locked_ref = NULL;
2479         struct btrfs_delayed_extent_op *extent_op;
2480         struct btrfs_fs_info *fs_info = root->fs_info;
2481         ktime_t start = ktime_get();
2482         int ret;
2483         unsigned long count = 0;
2484         unsigned long actual_count = 0;
2485         int must_insert_reserved = 0;
2486
2487         delayed_refs = &trans->transaction->delayed_refs;
2488         while (1) {
2489                 if (!locked_ref) {
2490                         if (count >= nr)
2491                                 break;
2492
2493                         spin_lock(&delayed_refs->lock);
2494                         locked_ref = btrfs_select_ref_head(trans);
2495                         if (!locked_ref) {
2496                                 spin_unlock(&delayed_refs->lock);
2497                                 break;
2498                         }
2499
2500                         /* grab the lock that says we are going to process
2501                          * all the refs for this head */
2502                         ret = btrfs_delayed_ref_lock(trans, locked_ref);
2503                         spin_unlock(&delayed_refs->lock);
2504                         /*
2505                          * we may have dropped the spin lock to get the head
2506                          * mutex lock, and that might have given someone else
2507                          * time to free the head.  If that's true, it has been
2508                          * removed from our list and we can move on.
2509                          */
2510                         if (ret == -EAGAIN) {
2511                                 locked_ref = NULL;
2512                                 count++;
2513                                 continue;
2514                         }
2515                 }
2516
2517                 /*
2518                  * We need to try and merge add/drops of the same ref since we
2519                  * can run into issues with relocate dropping the implicit ref
2520                  * and then it being added back again before the drop can
2521                  * finish.  If we merged anything we need to re-loop so we can
2522                  * get a good ref.
2523                  * Or we can get node references of the same type that weren't
2524                  * merged when created due to bumps in the tree mod seq, and
2525                  * we need to merge them to prevent adding an inline extent
2526                  * backref before dropping it (triggering a BUG_ON at
2527                  * insert_inline_extent_backref()).
2528                  */
2529                 spin_lock(&locked_ref->lock);
2530                 btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
2531                                          locked_ref);
2532
2533                 /*
2534                  * locked_ref is the head node, so we have to go one
2535                  * node back for any delayed ref updates
2536                  */
2537                 ref = select_delayed_ref(locked_ref);
2538
2539                 if (ref && ref->seq &&
2540                     btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
2541                         spin_unlock(&locked_ref->lock);
2542                         btrfs_delayed_ref_unlock(locked_ref);
2543                         spin_lock(&delayed_refs->lock);
2544                         locked_ref->processing = 0;
2545                         delayed_refs->num_heads_ready++;
2546                         spin_unlock(&delayed_refs->lock);
2547                         locked_ref = NULL;
2548                         cond_resched();
2549                         count++;
2550                         continue;
2551                 }
2552
2553                 /*
2554                  * record the must insert reserved flag before we
2555                  * drop the spin lock.
2556                  */
2557                 must_insert_reserved = locked_ref->must_insert_reserved;
2558                 locked_ref->must_insert_reserved = 0;
2559
2560                 extent_op = locked_ref->extent_op;
2561                 locked_ref->extent_op = NULL;
2562
2563                 if (!ref) {
2564
2565
2566                         /* All delayed refs have been processed, Go ahead
2567                          * and send the head node to run_one_delayed_ref,
2568                          * so that any accounting fixes can happen
2569                          */
2570                         ref = &locked_ref->node;
2571
2572                         if (extent_op && must_insert_reserved) {
2573                                 btrfs_free_delayed_extent_op(extent_op);
2574                                 extent_op = NULL;
2575                         }
2576
2577                         if (extent_op) {
2578                                 spin_unlock(&locked_ref->lock);
2579                                 ret = run_delayed_extent_op(trans, root,
2580                                                             ref, extent_op);
2581                                 btrfs_free_delayed_extent_op(extent_op);
2582
2583                                 if (ret) {
2584                                         /*
2585                                          * Need to reset must_insert_reserved if
2586                                          * there was an error so the abort stuff
2587                                          * can cleanup the reserved space
2588                                          * properly.
2589                                          */
2590                                         if (must_insert_reserved)
2591                                                 locked_ref->must_insert_reserved = 1;
2592                                         locked_ref->processing = 0;
2593                                         btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
2594                                         btrfs_delayed_ref_unlock(locked_ref);
2595                                         return ret;
2596                                 }
2597                                 continue;
2598                         }
2599
2600                         /*
2601                          * Need to drop our head ref lock and re-acquire the
2602                          * delayed ref lock and then re-check to make sure
2603                          * nobody got added.
2604                          */
2605                         spin_unlock(&locked_ref->lock);
2606                         spin_lock(&delayed_refs->lock);
2607                         spin_lock(&locked_ref->lock);
2608                         if (!list_empty(&locked_ref->ref_list) ||
2609                             locked_ref->extent_op) {
2610                                 spin_unlock(&locked_ref->lock);
2611                                 spin_unlock(&delayed_refs->lock);
2612                                 continue;
2613                         }
2614                         ref->in_tree = 0;
2615                         delayed_refs->num_heads--;
2616                         rb_erase(&locked_ref->href_node,
2617                                  &delayed_refs->href_root);
2618                         spin_unlock(&delayed_refs->lock);
2619                 } else {
2620                         actual_count++;
2621                         ref->in_tree = 0;
2622                         list_del(&ref->list);
2623                 }
2624                 atomic_dec(&delayed_refs->num_entries);
2625
2626                 if (!btrfs_delayed_ref_is_head(ref)) {
2627                         /*
2628                          * when we play the delayed ref, also correct the
2629                          * ref_mod on head
2630                          */
2631                         switch (ref->action) {
2632                         case BTRFS_ADD_DELAYED_REF:
2633                         case BTRFS_ADD_DELAYED_EXTENT:
2634                                 locked_ref->node.ref_mod -= ref->ref_mod;
2635                                 break;
2636                         case BTRFS_DROP_DELAYED_REF:
2637                                 locked_ref->node.ref_mod += ref->ref_mod;
2638                                 break;
2639                         default:
2640                                 WARN_ON(1);
2641                         }
2642                 }
2643                 spin_unlock(&locked_ref->lock);
2644
2645                 ret = run_one_delayed_ref(trans, root, ref, extent_op,
2646                                           must_insert_reserved);
2647
2648                 btrfs_free_delayed_extent_op(extent_op);
2649                 if (ret) {
2650                         spin_lock(&delayed_refs->lock);
2651                         locked_ref->processing = 0;
2652                         delayed_refs->num_heads_ready++;
2653                         spin_unlock(&delayed_refs->lock);
2654                         btrfs_delayed_ref_unlock(locked_ref);
2655                         btrfs_put_delayed_ref(ref);
2656                         btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret);
2657                         return ret;
2658                 }
2659
2660                 /*
2661                  * If this node is a head, that means all the refs in this head
2662                  * have been dealt with, and we will pick the next head to deal
2663                  * with, so we must unlock the head and drop it from the cluster
2664                  * list before we release it.
2665                  */
2666                 if (btrfs_delayed_ref_is_head(ref)) {
2667                         if (locked_ref->is_data &&
2668                             locked_ref->total_ref_mod < 0) {
2669                                 spin_lock(&delayed_refs->lock);
2670                                 delayed_refs->pending_csums -= ref->num_bytes;
2671                                 spin_unlock(&delayed_refs->lock);
2672                         }
2673                         btrfs_delayed_ref_unlock(locked_ref);
2674                         locked_ref = NULL;
2675                 }
2676                 btrfs_put_delayed_ref(ref);
2677                 count++;
2678                 cond_resched();
2679         }
2680
2681         /*
2682          * We don't want to include ref heads since we can have empty ref heads
2683          * and those will drastically skew our runtime down since we just do
2684          * accounting, no actual extent tree updates.
2685          */
2686         if (actual_count > 0) {
2687                 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2688                 u64 avg;
2689
2690                 /*
2691                  * We weigh the current average higher than our current runtime
2692                  * to avoid large swings in the average.
2693                  */
2694                 spin_lock(&delayed_refs->lock);
2695                 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
2696                 fs_info->avg_delayed_ref_runtime = avg >> 2;    /* div by 4 */
2697                 spin_unlock(&delayed_refs->lock);
2698         }
2699         return 0;
2700 }
2701
#ifdef SCRAMBLE_DELAYED_REFS
/*
 * Normally delayed refs get processed in ascending bytenr order. This
 * correlates in most cases to the order added. To expose dependencies on this
 * order, we start to process the tree in the middle instead of the beginning.
 *
 * Walks down from the root of @root, alternating between the left and the
 * right child at each level, and returns the bytenr of the last entry
 * visited.  Returns 0 for an empty tree.
 */
static u64 find_middle(struct rb_root *root)
{
	struct rb_node *n = root->rb_node;
	struct btrfs_delayed_ref_node *entry;
	int alt = 1;
	u64 middle = 0;

	while (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		WARN_ON(!entry->in_tree);

		middle = entry->bytenr;

		/* zig-zag: left, right, left, ... */
		if (alt)
			n = n->rb_left;
		else
			n = n->rb_right;

		alt = 1 - alt;
	}
	return middle;
}
#endif
2744
2745 static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
2746 {
2747         u64 num_bytes;
2748
2749         num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2750                              sizeof(struct btrfs_extent_inline_ref));
2751         if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
2752                 num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2753
2754         /*
2755          * We don't ever fill up leaves all the way so multiply by 2 just to be
2756          * closer to what we're really going to want to use.
2757          */
2758         return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
2759 }
2760
2761 /*
2762  * Takes the number of bytes to be csumm'ed and figures out how many leaves it
2763  * would require to store the csums for that many bytes.
2764  */
2765 u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes)
2766 {
2767         u64 csum_size;
2768         u64 num_csums_per_leaf;
2769         u64 num_csums;
2770
2771         csum_size = BTRFS_MAX_ITEM_SIZE(root);
2772         num_csums_per_leaf = div64_u64(csum_size,
2773                         (u64)btrfs_super_csum_size(root->fs_info->super_copy));
2774         num_csums = div64_u64(csum_bytes, root->sectorsize);
2775         num_csums += num_csums_per_leaf - 1;
2776         num_csums = div64_u64(num_csums, num_csums_per_leaf);
2777         return num_csums;
2778 }
2779
2780 int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
2781                                        struct btrfs_root *root)
2782 {
2783         struct btrfs_block_rsv *global_rsv;
2784         u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
2785         u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
2786         u64 num_dirty_bgs = trans->transaction->num_dirty_bgs;
2787         u64 num_bytes, num_dirty_bgs_bytes;
2788         int ret = 0;
2789
2790         num_bytes = btrfs_calc_trans_metadata_size(root, 1);
2791         num_heads = heads_to_leaves(root, num_heads);
2792         if (num_heads > 1)
2793                 num_bytes += (num_heads - 1) * root->nodesize;
2794         num_bytes <<= 1;
2795         num_bytes += btrfs_csum_bytes_to_leaves(root, csum_bytes) * root->nodesize;
2796         num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(root,
2797                                                              num_dirty_bgs);
2798         global_rsv = &root->fs_info->global_block_rsv;
2799
2800         /*
2801          * If we can't allocate any more chunks lets make sure we have _lots_ of
2802          * wiggle room since running delayed refs can create more delayed refs.
2803          */
2804         if (global_rsv->space_info->full) {
2805                 num_dirty_bgs_bytes <<= 1;
2806                 num_bytes <<= 1;
2807         }
2808
2809         spin_lock(&global_rsv->lock);
2810         if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
2811                 ret = 1;
2812         spin_unlock(&global_rsv->lock);
2813         return ret;
2814 }
2815
2816 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
2817                                        struct btrfs_root *root)
2818 {
2819         struct btrfs_fs_info *fs_info = root->fs_info;
2820         u64 num_entries =
2821                 atomic_read(&trans->transaction->delayed_refs.num_entries);
2822         u64 avg_runtime;
2823         u64 val;
2824
2825         smp_mb();
2826         avg_runtime = fs_info->avg_delayed_ref_runtime;
2827         val = num_entries * avg_runtime;
2828         if (num_entries * avg_runtime >= NSEC_PER_SEC)
2829                 return 1;
2830         if (val >= NSEC_PER_SEC / 2)
2831                 return 2;
2832
2833         return btrfs_check_space_for_delayed_refs(trans, root);
2834 }
2835
/*
 * Request to run delayed refs from a worker; filled in by
 * btrfs_async_run_delayed_refs() and consumed by delayed_ref_async_start().
 */
2836 struct async_delayed_refs {
2837         struct btrfs_root *root;	/* root used to join the transaction and run the refs */
2838         u64 transid;	/* refs are not run if the joined transaction has a larger transid */
2839         int count;	/* passed to btrfs_run_delayed_refs() as the count */
2840         int error;	/* first error recorded by the worker, 0 if none */
2841         int sync;	/* nonzero: worker completes @wait; zero: worker frees this struct */
2842         struct completion wait;	/* completed by the worker when @sync is set */
2843         struct btrfs_work work;	/* work item queued on the extent workers */
2844 };
2845
2846 static void delayed_ref_async_start(struct btrfs_work *work)
2847 {
2848         struct async_delayed_refs *async;
2849         struct btrfs_trans_handle *trans;
2850         int ret;
2851
2852         async = container_of(work, struct async_delayed_refs, work);
2853
2854         /* if the commit is already started, we don't need to wait here */
2855         if (btrfs_transaction_blocked(async->root->fs_info))
2856                 goto done;
2857
2858         trans = btrfs_join_transaction(async->root);
2859         if (IS_ERR(trans)) {
2860                 async->error = PTR_ERR(trans);
2861                 goto done;
2862         }
2863
2864         /*
2865          * trans->sync means that when we call end_transaction, we won't
2866          * wait on delayed refs
2867          */
2868         trans->sync = true;
2869
2870         /* Don't bother flushing if we got into a different transaction */
2871         if (trans->transid > async->transid)
2872                 goto end;
2873
2874         ret = btrfs_run_delayed_refs(trans, async->root, async->count);
2875         if (ret)
2876                 async->error = ret;
2877 end:
2878         ret = btrfs_end_transaction(trans, async->root);
2879         if (ret && !async->error)
2880                 async->error = ret;
2881 done:
2882         if (async->sync)
2883                 complete(&async->wait);
2884         else
2885                 kfree(async);
2886 }
2887
2888 int btrfs_async_run_delayed_refs(struct btrfs_root *root,
2889                                  unsigned long count, u64 transid, int wait)
2890 {
2891         struct async_delayed_refs *async;
2892         int ret;
2893
2894         async = kmalloc(sizeof(*async), GFP_NOFS);
2895         if (!async)
2896                 return -ENOMEM;
2897
2898         async->root = root->fs_info->tree_root;
2899         async->count = count;
2900         async->error = 0;
2901         async->transid = transid;
2902         if (wait)
2903                 async->sync = 1;
2904         else
2905                 async->sync = 0;
2906         init_completion(&async->wait);
2907
2908         btrfs_init_work(&async->work, btrfs_extent_refs_helper,
2909                         delayed_ref_async_start, NULL, NULL);
2910
2911         btrfs_queue_work(root->fs_info->extent_workers, &async->work);
2912
2913         if (wait) {
2914                 wait_for_completion(&async->wait);
2915                 ret = async->error;
2916                 kfree(async);
2917                 return ret;
2918         }
2919         return 0;
2920 }
2921
2922 /*
2923  * this starts processing the delayed reference count updates and
2924  * extent insertions we have queued up so far.  count can be
2925  * 0, which means to process everything in the tree at the start
2926  * of the run (but not newly added entries), or it can be some target
2927  * number you'd like to process.
2928  *
2929  * Returns 0 on success or if called with an aborted transaction
2930  * Returns <0 on error and aborts the transaction
2931  */
2932 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2933                            struct btrfs_root *root, unsigned long count)
2934 {
2935         struct rb_node *node;
2936         struct btrfs_delayed_ref_root *delayed_refs;
2937         struct btrfs_delayed_ref_head *head;
2938         int ret;
2939         int run_all = count == (unsigned long)-1;
2940         bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
2941
2942         /* We'll clean this up in btrfs_cleanup_transaction */
2943         if (trans->aborted)
2944                 return 0;
2945
2946         if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &root->fs_info->flags))
2947                 return 0;
2948
2949         if (root == root->fs_info->extent_root)
2950                 root = root->fs_info->tree_root;
2951
2952         delayed_refs = &trans->transaction->delayed_refs;
2953         if (count == 0)
2954                 count = atomic_read(&delayed_refs->num_entries) * 2;
2955
2956 again:
2957 #ifdef SCRAMBLE_DELAYED_REFS
2958         delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2959 #endif
2960         trans->can_flush_pending_bgs = false;
2961         ret = __btrfs_run_delayed_refs(trans, root, count);
2962         if (ret < 0) {
2963                 btrfs_abort_transaction(trans, ret);
2964                 return ret;
2965         }
2966
2967         if (run_all) {
2968                 if (!list_empty(&trans->new_bgs))
2969                         btrfs_create_pending_block_groups(trans, root);
2970
2971                 spin_lock(&delayed_refs->lock);
2972                 node = rb_first(&delayed_refs->href_root);
2973                 if (!node) {
2974                         spin_unlock(&delayed_refs->lock);
2975                         goto out;
2976                 }
2977                 count = (unsigned long)-1;
2978
2979                 while (node) {
2980                         head = rb_entry(node, struct btrfs_delayed_ref_head,
2981                                         href_node);
2982                         if (btrfs_delayed_ref_is_head(&head->node)) {
2983                                 struct btrfs_delayed_ref_node *ref;
2984
2985                                 ref = &head->node;
2986                                 atomic_inc(&ref->refs);
2987
2988                                 spin_unlock(&delayed_refs->lock);
2989                                 /*
2990                                  * Mutex was contended, block until it's
2991                                  * released and try again
2992                                  */
2993                                 mutex_lock(&head->mutex);
2994                                 mutex_unlock(&head->mutex);
2995
2996                                 btrfs_put_delayed_ref(ref);
2997                                 cond_resched();
2998                                 goto again;
2999                         } else {
3000                                 WARN_ON(1);
3001                         }
3002                         node = rb_next(node);
3003                 }
3004                 spin_unlock(&delayed_refs->lock);
3005                 cond_resched();
3006                 goto again;
3007         }
3008 out:
3009         assert_qgroups_uptodate(trans);
3010         trans->can_flush_pending_bgs = can_flush_pending_bgs;
3011         return 0;
3012 }
3013
3014 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
3015                                 struct btrfs_root *root,
3016                                 u64 bytenr, u64 num_bytes, u64 flags,
3017                                 int level, int is_data)
3018 {
3019         struct btrfs_delayed_extent_op *extent_op;
3020         int ret;
3021
3022         extent_op = btrfs_alloc_delayed_extent_op();
3023         if (!extent_op)
3024                 return -ENOMEM;
3025
3026         extent_op->flags_to_set = flags;
3027         extent_op->update_flags = true;
3028         extent_op->update_key = false;
3029         extent_op->is_data = is_data ? true : false;
3030         extent_op->level = level;
3031
3032         ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
3033                                           num_bytes, extent_op);
3034         if (ret)
3035                 btrfs_free_delayed_extent_op(extent_op);
3036         return ret;
3037 }
3038
3039 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
3040                                       struct btrfs_root *root,
3041                                       struct btrfs_path *path,
3042                                       u64 objectid, u64 offset, u64 bytenr)
3043 {
3044         struct btrfs_delayed_ref_head *head;
3045         struct btrfs_delayed_ref_node *ref;
3046         struct btrfs_delayed_data_ref *data_ref;
3047         struct btrfs_delayed_ref_root *delayed_refs;
3048         int ret = 0;
3049
3050         delayed_refs = &trans->transaction->delayed_refs;
3051         spin_lock(&delayed_refs->lock);
3052         head = btrfs_find_delayed_ref_head(trans, bytenr);
3053         if (!head) {
3054                 spin_unlock(&delayed_refs->lock);
3055                 return 0;
3056         }
3057
3058         if (!mutex_trylock(&head->mutex)) {
3059                 atomic_inc(&head->node.refs);
3060                 spin_unlock(&delayed_refs->lock);
3061
3062                 btrfs_release_path(path);
3063
3064                 /*
3065                  * Mutex was contended, block until it's released and let
3066                  * caller try again
3067                  */
3068                 mutex_lock(&head->mutex);
3069                 mutex_unlock(&head->mutex);
3070                 btrfs_put_delayed_ref(&head->node);
3071                 return -EAGAIN;
3072         }
3073         spin_unlock(&delayed_refs->lock);
3074
3075         spin_lock(&head->lock);
3076         list_for_each_entry(ref, &head->ref_list, list) {
3077                 /* If it's a shared ref we know a cross reference exists */
3078                 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
3079                         ret = 1;
3080                         break;
3081                 }
3082
3083                 data_ref = btrfs_delayed_node_to_data_ref(ref);
3084
3085                 /*
3086                  * If our ref doesn't match the one we're currently looking at
3087                  * then we have a cross reference.
3088                  */
3089                 if (data_ref->root != root->root_key.objectid ||
3090                     data_ref->objectid != objectid ||
3091                     data_ref->offset != offset) {
3092                         ret = 1;
3093                         break;
3094                 }
3095         }
3096         spin_unlock(&head->lock);
3097         mutex_unlock(&head->mutex);
3098         return ret;
3099 }
3100
3101 static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
3102                                         struct btrfs_root *root,
3103                                         struct btrfs_path *path,
3104                                         u64 objectid, u64 offset, u64 bytenr)
3105 {
3106         struct btrfs_root *extent_root = root->fs_info->extent_root;
3107         struct extent_buffer *leaf;
3108         struct btrfs_extent_data_ref *ref;
3109         struct btrfs_extent_inline_ref *iref;
3110         struct btrfs_extent_item *ei;
3111         struct btrfs_key key;
3112         u32 item_size;
3113         int ret;
3114
3115         key.objectid = bytenr;
3116         key.offset = (u64)-1;
3117         key.type = BTRFS_EXTENT_ITEM_KEY;
3118
3119         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
3120         if (ret < 0)
3121                 goto out;
3122         BUG_ON(ret == 0); /* Corruption */
3123
3124         ret = -ENOENT;
3125         if (path->slots[0] == 0)
3126                 goto out;
3127
3128         path->slots[0]--;
3129         leaf = path->nodes[0];
3130         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3131
3132         if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
3133                 goto out;
3134
3135         ret = 1;
3136         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3137 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3138         if (item_size < sizeof(*ei)) {
3139                 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
3140                 goto out;
3141         }
3142 #endif
3143         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
3144
3145         if (item_size != sizeof(*ei) +
3146             btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
3147                 goto out;
3148
3149         if (btrfs_extent_generation(leaf, ei) <=
3150             btrfs_root_last_snapshot(&root->root_item))
3151                 goto out;
3152
3153         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
3154         if (btrfs_extent_inline_ref_type(leaf, iref) !=
3155             BTRFS_EXTENT_DATA_REF_KEY)
3156                 goto out;
3157
3158         ref = (struct btrfs_extent_data_ref *)(&iref->offset);
3159         if (btrfs_extent_refs(leaf, ei) !=
3160             btrfs_extent_data_ref_count(leaf, ref) ||
3161             btrfs_extent_data_ref_root(leaf, ref) !=
3162             root->root_key.objectid ||
3163             btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
3164             btrfs_extent_data_ref_offset(leaf, ref) != offset)
3165                 goto out;
3166
3167         ret = 0;
3168 out:
3169         return ret;
3170 }
3171
3172 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
3173                           struct btrfs_root *root,
3174                           u64 objectid, u64 offset, u64 bytenr)
3175 {
3176         struct btrfs_path *path;
3177         int ret;
3178         int ret2;
3179
3180         path = btrfs_alloc_path();
3181         if (!path)
3182                 return -ENOENT;
3183
3184         do {
3185                 ret = check_committed_ref(trans, root, path, objectid,
3186                                           offset, bytenr);
3187                 if (ret && ret != -ENOENT)
3188                         goto out;
3189
3190                 ret2 = check_delayed_ref(trans, root, path, objectid,
3191                                          offset, bytenr);
3192         } while (ret2 == -EAGAIN);
3193
3194         if (ret2 && ret2 != -ENOENT) {
3195                 ret = ret2;
3196                 goto out;
3197         }
3198
3199         if (ret != -ENOENT || ret2 != -ENOENT)
3200                 ret = 0;
3201 out:
3202         btrfs_free_path(path);
3203         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
3204                 WARN_ON(ret > 0);
3205         return ret;
3206 }
3207
3208 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3209                            struct btrfs_root *root,
3210                            struct extent_buffer *buf,
3211                            int full_backref, int inc)
3212 {
3213         u64 bytenr;
3214         u64 num_bytes;
3215         u64 parent;
3216         u64 ref_root;
3217         u32 nritems;
3218         struct btrfs_key key;
3219         struct btrfs_file_extent_item *fi;
3220         int i;
3221         int level;
3222         int ret = 0;
3223         int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
3224                             u64, u64, u64, u64, u64, u64);
3225
3226
3227         if (btrfs_is_testing(root->fs_info))
3228                 return 0;
3229
3230         ref_root = btrfs_header_owner(buf);
3231         nritems = btrfs_header_nritems(buf);
3232         level = btrfs_header_level(buf);
3233
3234         if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
3235                 return 0;
3236
3237         if (inc)
3238                 process_func = btrfs_inc_extent_ref;
3239         else
3240                 process_func = btrfs_free_extent;
3241
3242         if (full_backref)
3243                 parent = buf->start;
3244         else
3245                 parent = 0;
3246
3247         for (i = 0; i < nritems; i++) {
3248                 if (level == 0) {
3249                         btrfs_item_key_to_cpu(buf, &key, i);
3250                         if (key.type != BTRFS_EXTENT_DATA_KEY)
3251                                 continue;
3252                         fi = btrfs_item_ptr(buf, i,
3253                                             struct btrfs_file_extent_item);
3254                         if (btrfs_file_extent_type(buf, fi) ==
3255                             BTRFS_FILE_EXTENT_INLINE)
3256                                 continue;
3257                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
3258                         if (bytenr == 0)
3259                                 continue;
3260
3261                         num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
3262                         key.offset -= btrfs_file_extent_offset(buf, fi);
3263                         ret = process_func(trans, root, bytenr, num_bytes,
3264                                            parent, ref_root, key.objectid,
3265                                            key.offset);
3266                         if (ret)
3267                                 goto fail;
3268                 } else {
3269                         bytenr = btrfs_node_blockptr(buf, i);
3270                         num_bytes = root->nodesize;
3271                         ret = process_func(trans, root, bytenr, num_bytes,
3272                                            parent, ref_root, level - 1, 0);
3273                         if (ret)
3274                                 goto fail;
3275                 }
3276         }
3277         return 0;
3278 fail:
3279         return ret;
3280 }
3281
/*
 * Add one reference to everything @buf points to.  See __btrfs_mod_ref()
 * for the meaning of @full_backref and the return value.
 */
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                  struct extent_buffer *buf, int full_backref)
{
        const int inc = 1;

        return __btrfs_mod_ref(trans, root, buf, full_backref, inc);
}
3287
/*
 * Drop one reference from everything @buf points to.  See __btrfs_mod_ref()
 * for the meaning of @full_backref and the return value.
 */
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                  struct extent_buffer *buf, int full_backref)
{
        const int inc = 0;

        return __btrfs_mod_ref(trans, root, buf, full_backref, inc);
}
3293
3294 static int write_one_cache_group(struct btrfs_trans_handle *trans,
3295                                  struct btrfs_root *root,
3296                                  struct btrfs_path *path,
3297                                  struct btrfs_block_group_cache *cache)
3298 {
3299         int ret;
3300         struct btrfs_root *extent_root = root->fs_info->extent_root;
3301         unsigned long bi;
3302         struct extent_buffer *leaf;
3303
3304         ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
3305         if (ret) {
3306                 if (ret > 0)
3307                         ret = -ENOENT;
3308                 goto fail;
3309         }
3310
3311         leaf = path->nodes[0];
3312         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3313         write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
3314         btrfs_mark_buffer_dirty(leaf);
3315 fail:
3316         btrfs_release_path(path);
3317         return ret;
3318
3319 }
3320
3321 static struct btrfs_block_group_cache *
3322 next_block_group(struct btrfs_root *root,
3323                  struct btrfs_block_group_cache *cache)
3324 {
3325         struct rb_node *node;
3326
3327         spin_lock(&root->fs_info->block_group_cache_lock);
3328
3329         /* If our block group was removed, we need a full search. */
3330         if (RB_EMPTY_NODE(&cache->cache_node)) {
3331                 const u64 next_bytenr = cache->key.objectid + cache->key.offset;
3332
3333                 spin_unlock(&root->fs_info->block_group_cache_lock);
3334                 btrfs_put_block_group(cache);
3335                 cache = btrfs_lookup_first_block_group(root->fs_info,
3336                                                        next_bytenr);
3337                 return cache;
3338         }
3339         node = rb_next(&cache->cache_node);
3340         btrfs_put_block_group(cache);
3341         if (node) {
3342                 cache = rb_entry(node, struct btrfs_block_group_cache,
3343                                  cache_node);
3344                 btrfs_get_block_group(cache);
3345         } else
3346                 cache = NULL;
3347         spin_unlock(&root->fs_info->block_group_cache_lock);
3348         return cache;
3349 }
3350
3351 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3352                             struct btrfs_trans_handle *trans,
3353                             struct btrfs_path *path)
3354 {
3355         struct btrfs_root *root = block_group->fs_info->tree_root;
3356         struct inode *inode = NULL;
3357         u64 alloc_hint = 0;
3358         int dcs = BTRFS_DC_ERROR;
3359         u64 num_pages = 0;
3360         int retries = 0;
3361         int ret = 0;
3362
3363         /*
3364          * If this block group is smaller than 100 megs don't bother caching the
3365          * block group.
3366          */
3367         if (block_group->key.offset < (100 * SZ_1M)) {
3368                 spin_lock(&block_group->lock);
3369                 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3370                 spin_unlock(&block_group->lock);
3371                 return 0;
3372         }
3373
3374         if (trans->aborted)
3375                 return 0;
3376 again:
3377         inode = lookup_free_space_inode(root, block_group, path);
3378         if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3379                 ret = PTR_ERR(inode);
3380                 btrfs_release_path(path);
3381                 goto out;
3382         }
3383
3384         if (IS_ERR(inode)) {
3385                 BUG_ON(retries);
3386                 retries++;
3387
3388                 if (block_group->ro)
3389                         goto out_free;
3390
3391                 ret = create_free_space_inode(root, trans, block_group, path);
3392                 if (ret)
3393                         goto out_free;
3394                 goto again;
3395         }
3396
3397         /* We've already setup this transaction, go ahead and exit */
3398         if (block_group->cache_generation == trans->transid &&
3399             i_size_read(inode)) {
3400                 dcs = BTRFS_DC_SETUP;
3401                 goto out_put;
3402         }
3403
3404         /*
3405          * We want to set the generation to 0, that way if anything goes wrong
3406          * from here on out we know not to trust this cache when we load up next
3407          * time.
3408          */
3409         BTRFS_I(inode)->generation = 0;
3410         ret = btrfs_update_inode(trans, root, inode);
3411         if (ret) {
3412                 /*
3413                  * So theoretically we could recover from this, simply set the
3414                  * super cache generation to 0 so we know to invalidate the
3415                  * cache, but then we'd have to keep track of the block groups
3416                  * that fail this way so we know we _have_ to reset this cache
3417                  * before the next commit or risk reading stale cache.  So to
3418                  * limit our exposure to horrible edge cases lets just abort the
3419                  * transaction, this only happens in really bad situations
3420                  * anyway.
3421                  */
3422                 btrfs_abort_transaction(trans, ret);
3423                 goto out_put;
3424         }
3425         WARN_ON(ret);
3426
3427         if (i_size_read(inode) > 0) {
3428                 ret = btrfs_check_trunc_cache_free_space(root,
3429                                         &root->fs_info->global_block_rsv);
3430                 if (ret)
3431                         goto out_put;
3432
3433                 ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
3434                 if (ret)
3435                         goto out_put;
3436         }
3437
3438         spin_lock(&block_group->lock);
3439         if (block_group->cached != BTRFS_CACHE_FINISHED ||
3440             !btrfs_test_opt(root->fs_info, SPACE_CACHE)) {
3441                 /*
3442                  * don't bother trying to write stuff out _if_
3443                  * a) we're not cached,
3444                  * b) we're with nospace_cache mount option.
3445                  */
3446                 dcs = BTRFS_DC_WRITTEN;
3447                 spin_unlock(&block_group->lock);
3448                 goto out_put;
3449         }
3450         spin_unlock(&block_group->lock);
3451
3452         /*
3453          * We hit an ENOSPC when setting up the cache in this transaction, just
3454          * skip doing the setup, we've already cleared the cache so we're safe.
3455          */
3456         if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
3457                 ret = -ENOSPC;
3458                 goto out_put;
3459         }
3460
3461         /*
3462          * Try to preallocate enough space based on how big the block group is.
3463          * Keep in mind this has to include any pinned space which could end up
3464          * taking up quite a bit since it's not folded into the other space
3465          * cache.
3466          */
3467         num_pages = div_u64(block_group->key.offset, SZ_256M);
3468         if (!num_pages)
3469                 num_pages = 1;
3470
3471         num_pages *= 16;
3472         num_pages *= PAGE_SIZE;
3473
3474         ret = btrfs_check_data_free_space(inode, 0, num_pages);
3475         if (ret)
3476                 goto out_put;
3477
3478         ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3479                                               num_pages, num_pages,
3480                                               &alloc_hint);
3481         /*
3482          * Our cache requires contiguous chunks so that we don't modify a bunch
3483          * of metadata or split extents when writing the cache out, which means
3484          * we can enospc if we are heavily fragmented in addition to just normal
3485          * out of space conditions.  So if we hit this just skip setting up any
3486          * other block groups for this transaction, maybe we'll unpin enough
3487          * space the next time around.
3488          */
3489         if (!ret)
3490                 dcs = BTRFS_DC_SETUP;
3491         else if (ret == -ENOSPC)
3492                 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
3493
3494 out_put:
3495         iput(inode);
3496 out_free:
3497         btrfs_release_path(path);
3498 out:
3499         spin_lock(&block_group->lock);
3500         if (!ret && dcs == BTRFS_DC_SETUP)
3501                 block_group->cache_generation = trans->transid;
3502         block_group->disk_cache_state = dcs;
3503         spin_unlock(&block_group->lock);
3504
3505         return ret;
3506 }
3507
3508 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
3509                             struct btrfs_root *root)
3510 {
3511         struct btrfs_block_group_cache *cache, *tmp;
3512         struct btrfs_transaction *cur_trans = trans->transaction;
3513         struct btrfs_path *path;
3514
3515         if (list_empty(&cur_trans->dirty_bgs) ||
3516             !btrfs_test_opt(root->fs_info, SPACE_CACHE))
3517                 return 0;
3518
3519         path = btrfs_alloc_path();
3520         if (!path)
3521                 return -ENOMEM;
3522
3523         /* Could add new block groups, use _safe just in case */
3524         list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3525                                  dirty_list) {
3526                 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3527                         cache_save_setup(cache, trans, path);
3528         }
3529
3530         btrfs_free_path(path);
3531         return 0;
3532 }
3533
3534 /*
3535  * transaction commit does final block group cache writeback during a
3536  * critical section where nothing is allowed to change the FS.  This is
3537  * required in order for the cache to actually match the block group,
3538  * but can introduce a lot of latency into the commit.
3539  *
3540  * So, btrfs_start_dirty_block_groups is here to kick off block group
3541  * cache IO.  There's a chance we'll have to redo some of it if the
3542  * block group changes again during the commit, but it greatly reduces
3543  * the commit latency by getting rid of the easy block groups while
3544  * we're still allowing others to join the commit.
3545  */
3546 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
3547                                    struct btrfs_root *root)
3548 {
3549         struct btrfs_block_group_cache *cache;
3550         struct btrfs_transaction *cur_trans = trans->transaction;
3551         int ret = 0;
3552         int should_put;
3553         struct btrfs_path *path = NULL;
3554         LIST_HEAD(dirty);
3555         struct list_head *io = &cur_trans->io_bgs;
3556         int num_started = 0;
3557         int loops = 0;
3558
3559         spin_lock(&cur_trans->dirty_bgs_lock);
3560         if (list_empty(&cur_trans->dirty_bgs)) {
3561                 spin_unlock(&cur_trans->dirty_bgs_lock);
3562                 return 0;
3563         }
3564         list_splice_init(&cur_trans->dirty_bgs, &dirty);
3565         spin_unlock(&cur_trans->dirty_bgs_lock);
3566
3567 again:
3568         /*
3569          * make sure all the block groups on our dirty list actually
3570          * exist
3571          */
3572         btrfs_create_pending_block_groups(trans, root);
3573
3574         if (!path) {
3575                 path = btrfs_alloc_path();
3576                 if (!path)
3577                         return -ENOMEM;
3578         }
3579
3580         /*
3581          * cache_write_mutex is here only to save us from balance or automatic
3582          * removal of empty block groups deleting this block group while we are
3583          * writing out the cache
3584          */
3585         mutex_lock(&trans->transaction->cache_write_mutex);
3586         while (!list_empty(&dirty)) {
3587                 cache = list_first_entry(&dirty,
3588                                          struct btrfs_block_group_cache,
3589                                          dirty_list);
3590                 /*
3591                  * this can happen if something re-dirties a block
3592                  * group that is already under IO.  Just wait for it to
3593                  * finish and then do it all again
3594                  */
3595                 if (!list_empty(&cache->io_list)) {
3596                         list_del_init(&cache->io_list);
3597                         btrfs_wait_cache_io(root, trans, cache,
3598                                             &cache->io_ctl, path,
3599                                             cache->key.objectid);
3600                         btrfs_put_block_group(cache);
3601                 }
3602
3603
3604                 /*
3605                  * btrfs_wait_cache_io uses the cache->dirty_list to decide
3606                  * if it should update the cache_state.  Don't delete
3607                  * until after we wait.
3608                  *
3609                  * Since we're not running in the commit critical section
3610                  * we need the dirty_bgs_lock to protect from update_block_group
3611                  */
3612                 spin_lock(&cur_trans->dirty_bgs_lock);
3613                 list_del_init(&cache->dirty_list);
3614                 spin_unlock(&cur_trans->dirty_bgs_lock);
3615
3616                 should_put = 1;
3617
3618                 cache_save_setup(cache, trans, path);
3619
3620                 if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3621                         cache->io_ctl.inode = NULL;
3622                         ret = btrfs_write_out_cache(root, trans, cache, path);
3623                         if (ret == 0 && cache->io_ctl.inode) {
3624                                 num_started++;
3625                                 should_put = 0;
3626
3627                                 /*
3628                                  * the cache_write_mutex is protecting
3629                                  * the io_list
3630                                  */
3631                                 list_add_tail(&cache->io_list, io);
3632                         } else {
3633                                 /*
3634                                  * if we failed to write the cache, the
3635                                  * generation will be bad and life goes on
3636                                  */
3637                                 ret = 0;
3638                         }
3639                 }
3640                 if (!ret) {
3641                         ret = write_one_cache_group(trans, root, path, cache);
3642                         /*
3643                          * Our block group might still be attached to the list
3644                          * of new block groups in the transaction handle of some
3645                          * other task (struct btrfs_trans_handle->new_bgs). This
3646                          * means its block group item isn't yet in the extent
3647                          * tree. If this happens ignore the error, as we will
3648                          * try again later in the critical section of the
3649                          * transaction commit.
3650                          */
3651                         if (ret == -ENOENT) {
3652                                 ret = 0;
3653                                 spin_lock(&cur_trans->dirty_bgs_lock);
3654                                 if (list_empty(&cache->dirty_list)) {
3655                                         list_add_tail(&cache->dirty_list,
3656                                                       &cur_trans->dirty_bgs);
3657                                         btrfs_get_block_group(cache);
3658                                 }
3659                                 spin_unlock(&cur_trans->dirty_bgs_lock);
3660                         } else if (ret) {
3661                                 btrfs_abort_transaction(trans, ret);
3662                         }
3663                 }
3664
3665                 /* if its not on the io list, we need to put the block group */
3666                 if (should_put)
3667                         btrfs_put_block_group(cache);
3668
3669                 if (ret)
3670                         break;
3671
3672                 /*
3673                  * Avoid blocking other tasks for too long. It might even save
3674                  * us from writing caches for block groups that are going to be
3675                  * removed.
3676                  */
3677                 mutex_unlock(&trans->transaction->cache_write_mutex);
3678                 mutex_lock(&trans->transaction->cache_write_mutex);
3679         }
3680         mutex_unlock(&trans->transaction->cache_write_mutex);
3681
3682         /*
3683          * go through delayed refs for all the stuff we've just kicked off
3684          * and then loop back (just once)
3685          */
3686         ret = btrfs_run_delayed_refs(trans, root, 0);
3687         if (!ret && loops == 0) {
3688                 loops++;
3689                 spin_lock(&cur_trans->dirty_bgs_lock);
3690                 list_splice_init(&cur_trans->dirty_bgs, &dirty);
3691                 /*
3692                  * dirty_bgs_lock protects us from concurrent block group
3693                  * deletes too (not just cache_write_mutex).
3694                  */
3695                 if (!list_empty(&dirty)) {
3696                         spin_unlock(&cur_trans->dirty_bgs_lock);
3697                         goto again;
3698                 }
3699                 spin_unlock(&cur_trans->dirty_bgs_lock);
3700         } else if (ret < 0) {
3701                 btrfs_cleanup_dirty_bgs(cur_trans, root);
3702         }
3703
3704         btrfs_free_path(path);
3705         return ret;
3706 }
3707
3708 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3709                                    struct btrfs_root *root)
3710 {
3711         struct btrfs_block_group_cache *cache;
3712         struct btrfs_transaction *cur_trans = trans->transaction;
3713         int ret = 0;
3714         int should_put;
3715         struct btrfs_path *path;
3716         struct list_head *io = &cur_trans->io_bgs;
3717         int num_started = 0;
3718
3719         path = btrfs_alloc_path();
3720         if (!path)
3721                 return -ENOMEM;
3722
3723         /*
3724          * Even though we are in the critical section of the transaction commit,
3725          * we can still have concurrent tasks adding elements to this
3726          * transaction's list of dirty block groups. These tasks correspond to
3727          * endio free space workers started when writeback finishes for a
3728          * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
3729          * allocate new block groups as a result of COWing nodes of the root
3730          * tree when updating the free space inode. The writeback for the space
3731          * caches is triggered by an earlier call to
3732          * btrfs_start_dirty_block_groups() and iterations of the following
3733          * loop.
3734          * Also we want to do the cache_save_setup first and then run the
3735          * delayed refs to make sure we have the best chance at doing this all
3736          * in one shot.
3737          */
3738         spin_lock(&cur_trans->dirty_bgs_lock);
3739         while (!list_empty(&cur_trans->dirty_bgs)) {
3740                 cache = list_first_entry(&cur_trans->dirty_bgs,
3741                                          struct btrfs_block_group_cache,
3742                                          dirty_list);
3743
3744                 /*
3745                  * this can happen if cache_save_setup re-dirties a block
3746                  * group that is already under IO.  Just wait for it to
3747                  * finish and then do it all again
3748                  */
3749                 if (!list_empty(&cache->io_list)) {
3750                         spin_unlock(&cur_trans->dirty_bgs_lock);
3751                         list_del_init(&cache->io_list);
3752                         btrfs_wait_cache_io(root, trans, cache,
3753                                             &cache->io_ctl, path,
3754                                             cache->key.objectid);
3755                         btrfs_put_block_group(cache);
3756                         spin_lock(&cur_trans->dirty_bgs_lock);
3757                 }
3758
3759                 /*
3760                  * don't remove from the dirty list until after we've waited
3761                  * on any pending IO
3762                  */
3763                 list_del_init(&cache->dirty_list);
3764                 spin_unlock(&cur_trans->dirty_bgs_lock);
3765                 should_put = 1;
3766
3767                 cache_save_setup(cache, trans, path);
3768
3769                 if (!ret)
3770                         ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1);
3771
3772                 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3773                         cache->io_ctl.inode = NULL;
3774                         ret = btrfs_write_out_cache(root, trans, cache, path);
3775                         if (ret == 0 && cache->io_ctl.inode) {
3776                                 num_started++;
3777                                 should_put = 0;
3778                                 list_add_tail(&cache->io_list, io);
3779                         } else {
3780                                 /*
3781                                  * if we failed to write the cache, the
3782                                  * generation will be bad and life goes on
3783                                  */
3784                                 ret = 0;
3785                         }
3786                 }
3787                 if (!ret) {
3788                         ret = write_one_cache_group(trans, root, path, cache);
3789                         /*
3790                          * One of the free space endio workers might have
3791                          * created a new block group while updating a free space
3792                          * cache's inode (at inode.c:btrfs_finish_ordered_io())
3793                          * and hasn't released its transaction handle yet, in
3794                          * which case the new block group is still attached to
3795                          * its transaction handle and its creation has not
3796                          * finished yet (no block group item in the extent tree
3797                          * yet, etc). If this is the case, wait for all free
3798                          * space endio workers to finish and retry. This is a
3799                          * a very rare case so no need for a more efficient and
3800                          * complex approach.
3801                          */
3802                         if (ret == -ENOENT) {
3803                                 wait_event(cur_trans->writer_wait,
3804                                    atomic_read(&cur_trans->num_writers) == 1);
3805                                 ret = write_one_cache_group(trans, root, path,
3806                                                             cache);
3807                         }
3808                         if (ret)
3809                                 btrfs_abort_transaction(trans, ret);
3810                 }
3811
3812                 /* if its not on the io list, we need to put the block group */
3813                 if (should_put)
3814                         btrfs_put_block_group(cache);
3815                 spin_lock(&cur_trans->dirty_bgs_lock);
3816         }
3817         spin_unlock(&cur_trans->dirty_bgs_lock);
3818
3819         while (!list_empty(io)) {
3820                 cache = list_first_entry(io, struct btrfs_block_group_cache,
3821                                          io_list);
3822                 list_del_init(&cache->io_list);
3823                 btrfs_wait_cache_io(root, trans, cache,
3824                                     &cache->io_ctl, path, cache->key.objectid);
3825                 btrfs_put_block_group(cache);
3826         }
3827
3828         btrfs_free_path(path);
3829         return ret;
3830 }
3831
3832 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
3833 {
3834         struct btrfs_block_group_cache *block_group;
3835         int readonly = 0;
3836
3837         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
3838         if (!block_group || block_group->ro)
3839                 readonly = 1;
3840         if (block_group)
3841                 btrfs_put_block_group(block_group);
3842         return readonly;
3843 }
3844
3845 bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3846 {
3847         struct btrfs_block_group_cache *bg;
3848         bool ret = true;
3849
3850         bg = btrfs_lookup_block_group(fs_info, bytenr);
3851         if (!bg)
3852                 return false;
3853
3854         spin_lock(&bg->lock);
3855         if (bg->ro)
3856                 ret = false;
3857         else
3858                 atomic_inc(&bg->nocow_writers);
3859         spin_unlock(&bg->lock);
3860
3861         /* no put on block group, done by btrfs_dec_nocow_writers */
3862         if (!ret)
3863                 btrfs_put_block_group(bg);
3864
3865         return ret;
3866
3867 }
3868
3869 void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3870 {
3871         struct btrfs_block_group_cache *bg;
3872
3873         bg = btrfs_lookup_block_group(fs_info, bytenr);
3874         ASSERT(bg);
3875         if (atomic_dec_and_test(&bg->nocow_writers))
3876                 wake_up_atomic_t(&bg->nocow_writers);
3877         /*
3878          * Once for our lookup and once for the lookup done by a previous call
3879          * to btrfs_inc_nocow_writers()
3880          */
3881         btrfs_put_block_group(bg);
3882         btrfs_put_block_group(bg);
3883 }
3884
3885 static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a)
3886 {
3887         schedule();
3888         return 0;
3889 }
3890
3891 void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
3892 {
3893         wait_on_atomic_t(&bg->nocow_writers,
3894                          btrfs_wait_nocow_writers_atomic_t,
3895                          TASK_UNINTERRUPTIBLE);
3896 }
3897
3898 static const char *alloc_name(u64 flags)
3899 {
3900         switch (flags) {
3901         case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
3902                 return "mixed";
3903         case BTRFS_BLOCK_GROUP_METADATA:
3904                 return "metadata";
3905         case BTRFS_BLOCK_GROUP_DATA:
3906                 return "data";
3907         case BTRFS_BLOCK_GROUP_SYSTEM:
3908                 return "system";
3909         default:
3910                 WARN_ON(1);
3911                 return "invalid-combination";
3912         };
3913 }
3914
3915 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3916                              u64 total_bytes, u64 bytes_used,
3917                              u64 bytes_readonly,
3918                              struct btrfs_space_info **space_info)
3919 {
3920         struct btrfs_space_info *found;
3921         int i;
3922         int factor;
3923         int ret;
3924
3925         if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3926                      BTRFS_BLOCK_GROUP_RAID10))
3927                 factor = 2;
3928         else
3929                 factor = 1;
3930
3931         found = __find_space_info(info, flags);
3932         if (found) {
3933                 spin_lock(&found->lock);
3934                 found->total_bytes += total_bytes;
3935                 found->disk_total += total_bytes * factor;
3936                 found->bytes_used += bytes_used;
3937                 found->disk_used += bytes_used * factor;
3938                 found->bytes_readonly += bytes_readonly;
3939                 if (total_bytes > 0)
3940                         found->full = 0;
3941                 space_info_add_new_bytes(info, found, total_bytes -
3942                                          bytes_used - bytes_readonly);
3943                 spin_unlock(&found->lock);
3944                 *space_info = found;
3945                 return 0;
3946         }
3947         found = kzalloc(sizeof(*found), GFP_NOFS);
3948         if (!found)
3949                 return -ENOMEM;
3950
3951         ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL);
3952         if (ret) {
3953                 kfree(found);
3954                 return ret;
3955         }
3956
3957         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3958                 INIT_LIST_HEAD(&found->block_groups[i]);
3959         init_rwsem(&found->groups_sem);
3960         spin_lock_init(&found->lock);
3961         found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
3962         found->total_bytes = total_bytes;
3963         found->disk_total = total_bytes * factor;
3964         found->bytes_used = bytes_used;
3965         found->disk_used = bytes_used * factor;
3966         found->bytes_pinned = 0;
3967         found->bytes_reserved = 0;
3968         found->bytes_readonly = bytes_readonly;
3969         found->bytes_may_use = 0;
3970         found->full = 0;
3971         found->max_extent_size = 0;
3972         found->force_alloc = CHUNK_ALLOC_NO_FORCE;
3973         found->chunk_alloc = 0;
3974         found->flush = 0;
3975         init_waitqueue_head(&found->wait);
3976         INIT_LIST_HEAD(&found->ro_bgs);
3977         INIT_LIST_HEAD(&found->tickets);
3978         INIT_LIST_HEAD(&found->priority_tickets);
3979
3980         ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
3981                                     info->space_info_kobj, "%s",
3982                                     alloc_name(found->flags));
3983         if (ret) {
3984                 kfree(found);
3985                 return ret;
3986         }
3987
3988         *space_info = found;
3989         list_add_rcu(&found->list, &info->space_info);
3990         if (flags & BTRFS_BLOCK_GROUP_DATA)
3991                 info->data_sinfo = found;
3992
3993         return ret;
3994 }
3995
3996 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3997 {
3998         u64 extra_flags = chunk_to_extended(flags) &
3999                                 BTRFS_EXTENDED_PROFILE_MASK;
4000
4001         write_seqlock(&fs_info->profiles_lock);
4002         if (flags & BTRFS_BLOCK_GROUP_DATA)
4003                 fs_info->avail_data_alloc_bits |= extra_flags;
4004         if (flags & BTRFS_BLOCK_GROUP_METADATA)
4005                 fs_info->avail_metadata_alloc_bits |= extra_flags;
4006         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
4007                 fs_info->avail_system_alloc_bits |= extra_flags;
4008         write_sequnlock(&fs_info->profiles_lock);
4009 }
4010
4011 /*
4012  * returns target flags in extended format or 0 if restripe for this
4013  * chunk_type is not in progress
4014  *
4015  * should be called with either volume_mutex or balance_lock held
4016  */
4017 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
4018 {
4019         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
4020         u64 target = 0;
4021
4022         if (!bctl)
4023                 return 0;
4024
4025         if (flags & BTRFS_BLOCK_GROUP_DATA &&
4026             bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4027                 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
4028         } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
4029                    bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4030                 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
4031         } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
4032                    bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4033                 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
4034         }
4035
4036         return target;
4037 }
4038
4039 /*
4040  * @flags: available profiles in extended format (see ctree.h)
4041  *
4042  * Returns reduced profile in chunk format.  If profile changing is in
4043  * progress (either running or paused) picks the target profile (if it's
4044  * already available), otherwise falls back to plain reducing.
4045  */
4046 static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
4047 {
4048         u64 num_devices = root->fs_info->fs_devices->rw_devices;
4049         u64 target;
4050         u64 raid_type;
4051         u64 allowed = 0;
4052
4053         /*
4054          * see if restripe for this chunk_type is in progress, if so
4055          * try to reduce to the target profile
4056          */
4057         spin_lock(&root->fs_info->balance_lock);
4058         target = get_restripe_target(root->fs_info, flags);
4059         if (target) {
4060                 /* pick target profile only if it's already available */
4061                 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
4062                         spin_unlock(&root->fs_info->balance_lock);
4063                         return extended_to_chunk(target);
4064                 }
4065         }
4066         spin_unlock(&root->fs_info->balance_lock);
4067
4068         /* First, mask out the RAID levels which aren't possible */
4069         for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
4070                 if (num_devices >= btrfs_raid_array[raid_type].devs_min)
4071                         allowed |= btrfs_raid_group[raid_type];
4072         }
4073         allowed &= flags;
4074
4075         if (allowed & BTRFS_BLOCK_GROUP_RAID6)
4076                 allowed = BTRFS_BLOCK_GROUP_RAID6;
4077         else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
4078                 allowed = BTRFS_BLOCK_GROUP_RAID5;
4079         else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
4080                 allowed = BTRFS_BLOCK_GROUP_RAID10;
4081         else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
4082                 allowed = BTRFS_BLOCK_GROUP_RAID1;
4083         else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
4084                 allowed = BTRFS_BLOCK_GROUP_RAID0;
4085
4086         flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
4087
4088         return extended_to_chunk(flags | allowed);
4089 }
4090
4091 static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
4092 {
4093         unsigned seq;
4094         u64 flags;
4095
4096         do {
4097                 flags = orig_flags;
4098                 seq = read_seqbegin(&root->fs_info->profiles_lock);
4099
4100                 if (flags & BTRFS_BLOCK_GROUP_DATA)
4101                         flags |= root->fs_info->avail_data_alloc_bits;
4102                 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
4103                         flags |= root->fs_info->avail_system_alloc_bits;
4104                 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
4105                         flags |= root->fs_info->avail_metadata_alloc_bits;
4106         } while (read_seqretry(&root->fs_info->profiles_lock, seq));
4107
4108         return btrfs_reduce_alloc_profile(root, flags);
4109 }
4110
4111 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
4112 {
4113         u64 flags;
4114         u64 ret;
4115
4116         if (data)
4117                 flags = BTRFS_BLOCK_GROUP_DATA;
4118         else if (root == root->fs_info->chunk_root)
4119                 flags = BTRFS_BLOCK_GROUP_SYSTEM;
4120         else
4121                 flags = BTRFS_BLOCK_GROUP_METADATA;
4122
4123         ret = get_alloc_profile(root, flags);
4124         return ret;
4125 }
4126
4127 int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
4128 {
4129         struct btrfs_space_info *data_sinfo;
4130         struct btrfs_root *root = BTRFS_I(inode)->root;
4131         struct btrfs_fs_info *fs_info = root->fs_info;
4132         u64 used;
4133         int ret = 0;
4134         int need_commit = 2;
4135         int have_pinned_space;
4136         int have_bg_delete_sem = 0;
4137         bool free_space_inode = btrfs_is_free_space_inode(inode);
4138
4139         /* make sure bytes are sectorsize aligned */
4140         bytes = ALIGN(bytes, root->sectorsize);
4141
4142         if (free_space_inode) {
4143                 need_commit = 0;
4144                 ASSERT(current->journal_info);
4145         }
4146
4147         /*
4148          * Here we shouldn't call down_read(bg_delete_sem) for free space inode,
4149          * there is lock order between bg_delete_sem and "wait current trans
4150          * finished". Meanwhile because we only do the data space reservation
4151          * for free space cache in the transaction context,
4152          * btrfs_delete_unused_bgs() will either have finished its job, or start
4153          * a new transaction waiting current transaction to complete, there will
4154          * be no unused block groups to be deleted, so it's safe to not call
4155          * down_read(bg_delete_sem).
4156          */
4157         data_sinfo = fs_info->data_sinfo;
4158         if (!data_sinfo) {
4159                 if (!free_space_inode) {
4160                         down_read(&root->fs_info->bg_delete_sem);
4161                         have_bg_delete_sem = 1;
4162                 }
4163                 goto alloc;
4164         }
4165
4166 again:
4167         /* make sure we have enough space to handle the data first */
4168         spin_lock(&data_sinfo->lock);
4169         used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
4170                 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
4171                 data_sinfo->bytes_may_use;
4172
4173         if (used + bytes > data_sinfo->total_bytes) {
4174                 struct btrfs_trans_handle *trans;
4175
4176                 /*
4177                  * We may need to allocate new chunk, so we should block
4178                  * btrfs_delete_unused_bgs()
4179                  */
4180                 if (!have_bg_delete_sem && !free_space_inode) {
4181                         spin_unlock(&data_sinfo->lock);
4182                         down_read(&root->fs_info->bg_delete_sem);
4183                         have_bg_delete_sem = 1;
4184                         goto again;
4185                 }
4186
4187                 /*
4188                  * if we don't have enough free bytes in this space then we need
4189                  * to alloc a new chunk.
4190                  */
4191                 if (!data_sinfo->full) {
4192                         u64 alloc_target;
4193
4194                         data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
4195                         spin_unlock(&data_sinfo->lock);
4196 alloc:
4197                         alloc_target = btrfs_get_alloc_profile(root, 1);
4198                         /*
4199                          * It is ugly that we don't call nolock join
4200                          * transaction for the free space inode case here.
4201                          * But it is safe because we only do the data space
4202                          * reservation for the free space cache in the
4203                          * transaction context, the common join transaction
4204                          * just increase the counter of the current transaction
4205                          * handler, doesn't try to acquire the trans_lock of
4206                          * the fs.
4207                          */
4208                         trans = btrfs_join_transaction(root);
4209                         if (IS_ERR(trans)) {
4210                                 ret = PTR_ERR(trans);
4211                                 goto out;
4212                         }
4213
4214                         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
4215                                              alloc_target,
4216                                              CHUNK_ALLOC_NO_FORCE);
4217                         btrfs_end_transaction(trans, root);
4218                         if (ret < 0) {
4219                                 if (ret != -ENOSPC)
4220                                         goto out;
4221                                 else {
4222                                         have_pinned_space = 1;
4223                                         goto commit_trans;
4224                                 }
4225                         }
4226
4227                         if (!data_sinfo)
4228                                 data_sinfo = fs_info->data_sinfo;
4229
4230                         goto again;
4231                 }
4232
4233                 /*
4234                  * If we don't have enough pinned space to deal with this
4235                  * allocation, and no removed chunk in current transaction,
4236                  * don't bother committing the transaction.
4237                  */
4238                 have_pinned_space = percpu_counter_compare(
4239                         &data_sinfo->total_bytes_pinned,
4240                         used + bytes - data_sinfo->total_bytes);
4241                 spin_unlock(&data_sinfo->lock);
4242
4243                 /* commit the current transaction and try again */
4244 commit_trans:
4245                 if (need_commit &&
4246                     !atomic_read(&root->fs_info->open_ioctl_trans)) {
4247                         need_commit--;
4248
4249                         if (need_commit > 0) {
4250                                 btrfs_start_delalloc_roots(fs_info, 0, -1);
4251                                 btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
4252                         }
4253
4254                         trans = btrfs_join_transaction(root);
4255                         if (IS_ERR(trans)) {
4256                                 ret = PTR_ERR(trans);
4257                                 goto out;
4258                         }
4259                         if (have_pinned_space >= 0 ||
4260                             test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
4261                                      &trans->transaction->flags) ||
4262                             need_commit > 0) {
4263                                 ret = btrfs_commit_transaction(trans, root);
4264                                 if (ret)
4265                                         goto out;
4266                                 /*
4267                                  * The cleaner kthread might still be doing iput
4268                                  * operations. Wait for it to finish so that
4269                                  * more space is released.
4270                                  */
4271                                 mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex);
4272                                 mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex);
4273                                 goto again;
4274                         } else {
4275                                 btrfs_end_transaction(trans, root);
4276                         }
4277                 }
4278
4279                 trace_btrfs_space_reservation(root->fs_info,
4280                                               "space_info:enospc",
4281                                               data_sinfo->flags, bytes, 1);
4282                 ret = -ENOSPC;
4283                 goto out;
4284         }
4285         data_sinfo->bytes_may_use += bytes;
4286         trace_btrfs_space_reservation(root->fs_info, "space_info",
4287                                       data_sinfo->flags, bytes, 1);
4288         spin_unlock(&data_sinfo->lock);
4289
4290 out:
4291         if (have_bg_delete_sem && !free_space_inode)
4292                 up_read(&root->fs_info->bg_delete_sem);
4293
4294         return ret;
4295 }
4296
4297 /*
4298  * New check_data_free_space() with ability for precious data reservation
4299  * Will replace old btrfs_check_data_free_space(), but for patch split,
4300  * add a new function first and then replace it.
4301  */
4302 int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
4303 {
4304         struct btrfs_root *root = BTRFS_I(inode)->root;
4305         int ret;
4306
4307         /* align the range */
4308         len = round_up(start + len, root->sectorsize) -
4309               round_down(start, root->sectorsize);
4310         start = round_down(start, root->sectorsize);
4311
4312         ret = btrfs_alloc_data_chunk_ondemand(inode, len);
4313         if (ret < 0)
4314                 return ret;
4315
4316         /*
4317          * Use new btrfs_qgroup_reserve_data to reserve precious data space
4318          *
4319          * TODO: Find a good method to avoid reserve data space for NOCOW
4320          * range, but don't impact performance on quota disable case.
4321          */
4322         ret = btrfs_qgroup_reserve_data(inode, start, len);
4323         if (ret < 0)
4324                 /* Qgroup reserve failed, need to cleanup reserved data space */
4325                 btrfs_free_reserved_data_space(inode, start, len);
4326         return ret;
4327 }
4328
4329 /*
4330  * Called if we need to clear a data reservation for this inode
4331  * Normally in a error case.
4332  *
4333  * This one will *NOT* use accurate qgroup reserved space API, just for case
4334  * which we can't sleep and is sure it won't affect qgroup reserved space.
4335  * Like clear_bit_hook().
4336  */
4337 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
4338                                             u64 len)
4339 {
4340         struct btrfs_root *root = BTRFS_I(inode)->root;
4341         struct btrfs_space_info *data_sinfo;
4342
4343         /* Make sure the range is aligned to sectorsize */
4344         len = round_up(start + len, root->sectorsize) -
4345               round_down(start, root->sectorsize);
4346         start = round_down(start, root->sectorsize);
4347
4348         data_sinfo = root->fs_info->data_sinfo;
4349         spin_lock(&data_sinfo->lock);
4350         if (WARN_ON(data_sinfo->bytes_may_use < len))
4351                 data_sinfo->bytes_may_use = 0;
4352         else
4353                 data_sinfo->bytes_may_use -= len;
4354         trace_btrfs_space_reservation(root->fs_info, "space_info",
4355                                       data_sinfo->flags, len, 0);
4356         spin_unlock(&data_sinfo->lock);
4357 }
4358
4359 /*
4360  * Called if we need to clear a data reservation for this inode
4361  * Normally in a error case.
4362  *
4363  * This one will handle the per-inode data rsv map for accurate reserved
4364  * space framework.
4365  */
4366 void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
4367 {
4368         btrfs_free_reserved_data_space_noquota(inode, start, len);
4369         btrfs_qgroup_free_data(inode, start, len);
4370 }
4371
4372 static void force_metadata_allocation(struct btrfs_fs_info *info)
4373 {
4374         struct list_head *head = &info->space_info;
4375         struct btrfs_space_info *found;
4376
4377         rcu_read_lock();
4378         list_for_each_entry_rcu(found, head, list) {
4379                 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
4380                         found->force_alloc = CHUNK_ALLOC_FORCE;
4381         }
4382         rcu_read_unlock();
4383 }
4384
4385 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
4386 {
4387         return (global->size << 1);
4388 }
4389
4390 static int should_alloc_chunk(struct btrfs_root *root,
4391                               struct btrfs_space_info *sinfo, int force)
4392 {
4393         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
4394         u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
4395         u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
4396         u64 thresh;
4397
4398         if (force == CHUNK_ALLOC_FORCE)
4399                 return 1;
4400
4401         /*
4402          * We need to take into account the global rsv because for all intents
4403          * and purposes it's used space.  Don't worry about locking the
4404          * global_rsv, it doesn't change except when the transaction commits.
4405          */
4406         if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
4407                 num_allocated += calc_global_rsv_need_space(global_rsv);
4408
4409         /*
4410          * in limited mode, we want to have some free space up to
4411          * about 1% of the FS size.
4412          */
4413         if (force == CHUNK_ALLOC_LIMITED) {
4414                 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
4415                 thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
4416
4417                 if (num_bytes - num_allocated < thresh)
4418                         return 1;
4419         }
4420
4421         if (num_allocated + SZ_2M < div_factor(num_bytes, 8))
4422                 return 0;
4423         return 1;
4424 }
4425
4426 static u64 get_profile_num_devs(struct btrfs_root *root, u64 type)
4427 {
4428         u64 num_dev;
4429
4430         if (type & (BTRFS_BLOCK_GROUP_RAID10 |
4431                     BTRFS_BLOCK_GROUP_RAID0 |
4432                     BTRFS_BLOCK_GROUP_RAID5 |
4433                     BTRFS_BLOCK_GROUP_RAID6))
4434                 num_dev = root->fs_info->fs_devices->rw_devices;
4435         else if (type & BTRFS_BLOCK_GROUP_RAID1)
4436                 num_dev = 2;
4437         else
4438                 num_dev = 1;    /* DUP or single */
4439
4440         return num_dev;
4441 }
4442
4443 /*
4444  * If @is_allocation is true, reserve space in the system space info necessary
4445  * for allocating a chunk, otherwise if it's false, reserve space necessary for
4446  * removing a chunk.
4447  */
4448 void check_system_chunk(struct btrfs_trans_handle *trans,
4449                         struct btrfs_root *root,
4450                         u64 type)
4451 {
4452         struct btrfs_space_info *info;
4453         u64 left;
4454         u64 thresh;
4455         int ret = 0;
4456         u64 num_devs;
4457
4458         /*
4459          * Needed because we can end up allocating a system chunk and for an
4460          * atomic and race free space reservation in the chunk block reserve.
4461          */
4462         ASSERT(mutex_is_locked(&root->fs_info->chunk_mutex));
4463
4464         info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4465         spin_lock(&info->lock);
4466         left = info->total_bytes - info->bytes_used - info->bytes_pinned -
4467                 info->bytes_reserved - info->bytes_readonly -
4468                 info->bytes_may_use;
4469         spin_unlock(&info->lock);
4470
4471         num_devs = get_profile_num_devs(root, type);
4472
4473         /* num_devs device items to update and 1 chunk item to add or remove */
4474         thresh = btrfs_calc_trunc_metadata_size(root, num_devs) +
4475                 btrfs_calc_trans_metadata_size(root, 1);
4476
4477         if (left < thresh && btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) {
4478                 btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
4479                         left, thresh, type);
4480                 dump_space_info(info, 0, 0);
4481         }
4482
4483         if (left < thresh) {
4484                 u64 flags;
4485
4486                 flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
4487                 /*
4488                  * Ignore failure to create system chunk. We might end up not
4489                  * needing it, as we might not need to COW all nodes/leafs from
4490                  * the paths we visit in the chunk tree (they were already COWed
4491                  * or created in the current transaction for example).
4492                  */
4493                 ret = btrfs_alloc_chunk(trans, root, flags);
4494         }
4495
4496         if (!ret) {
4497                 ret = btrfs_block_rsv_add(root->fs_info->chunk_root,
4498                                           &root->fs_info->chunk_block_rsv,
4499                                           thresh, BTRFS_RESERVE_NO_FLUSH);
4500                 if (!ret)
4501                         trans->chunk_bytes_reserved += thresh;
4502         }
4503 }
4504
4505 /*
4506  * If force is CHUNK_ALLOC_FORCE:
4507  *    - return 1 if it successfully allocates a chunk,
4508  *    - return errors including -ENOSPC otherwise.
4509  * If force is NOT CHUNK_ALLOC_FORCE:
4510  *    - return 0 if it doesn't need to allocate a new chunk,
4511  *    - return 1 if it successfully allocates a chunk,
4512  *    - return errors including -ENOSPC otherwise.
4513  */
4514 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
4515                           struct btrfs_root *extent_root, u64 flags, int force)
4516 {
4517         struct btrfs_space_info *space_info;
4518         struct btrfs_fs_info *fs_info = extent_root->fs_info;
4519         int wait_for_alloc = 0;
4520         int ret = 0;
4521
4522         /* Don't re-enter if we're already allocating a chunk */
4523         if (trans->allocating_chunk)
4524                 return -ENOSPC;
4525
4526         space_info = __find_space_info(extent_root->fs_info, flags);
4527         if (!space_info) {
4528                 ret = update_space_info(extent_root->fs_info, flags,
4529                                         0, 0, 0, &space_info);
4530                 BUG_ON(ret); /* -ENOMEM */
4531         }
4532         BUG_ON(!space_info); /* Logic error */
4533
4534 again:
4535         spin_lock(&space_info->lock);
4536         if (force < space_info->force_alloc)
4537                 force = space_info->force_alloc;
4538         if (space_info->full) {
4539                 if (should_alloc_chunk(extent_root, space_info, force))
4540                         ret = -ENOSPC;
4541                 else
4542                         ret = 0;
4543                 spin_unlock(&space_info->lock);
4544                 return ret;
4545         }
4546
4547         if (!should_alloc_chunk(extent_root, space_info, force)) {
4548                 spin_unlock(&space_info->lock);
4549                 return 0;
4550         } else if (space_info->chunk_alloc) {
4551                 wait_for_alloc = 1;
4552         } else {
4553                 space_info->chunk_alloc = 1;
4554         }
4555
4556         spin_unlock(&space_info->lock);
4557
4558         mutex_lock(&fs_info->chunk_mutex);
4559
4560         /*
4561          * The chunk_mutex is held throughout the entirety of a chunk
4562          * allocation, so once we've acquired the chunk_mutex we know that the
4563          * other guy is done and we need to recheck and see if we should
4564          * allocate.
4565          */
4566         if (wait_for_alloc) {
4567                 mutex_unlock(&fs_info->chunk_mutex);
4568                 wait_for_alloc = 0;
4569                 goto again;
4570         }
4571
4572         trans->allocating_chunk = true;
4573
4574         /*
4575          * If we have mixed data/metadata chunks we want to make sure we keep
4576          * allocating mixed chunks instead of individual chunks.
4577          */
4578         if (btrfs_mixed_space_info(space_info))
4579                 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
4580
4581         /*
4582          * if we're doing a data chunk, go ahead and make sure that
4583          * we keep a reasonable number of metadata chunks allocated in the
4584          * FS as well.
4585          */
4586         if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
4587                 fs_info->data_chunk_allocations++;
4588                 if (!(fs_info->data_chunk_allocations %
4589                       fs_info->metadata_ratio))
4590                         force_metadata_allocation(fs_info);
4591         }
4592
4593         /*
4594          * Check if we have enough space in SYSTEM chunk because we may need
4595          * to update devices.
4596          */
4597         check_system_chunk(trans, extent_root, flags);
4598
4599         ret = btrfs_alloc_chunk(trans, extent_root, flags);
4600         trans->allocating_chunk = false;
4601
4602         spin_lock(&space_info->lock);
4603         if (ret < 0 && ret != -ENOSPC)
4604                 goto out;
4605         if (ret)
4606                 space_info->full = 1;
4607         else
4608                 ret = 1;
4609
4610         space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
4611 out:
4612         space_info->chunk_alloc = 0;
4613         spin_unlock(&space_info->lock);
4614         mutex_unlock(&fs_info->chunk_mutex);
4615         /*
4616          * When we allocate a new chunk we reserve space in the chunk block
4617          * reserve to make sure we can COW nodes/leafs in the chunk tree or
4618          * add new nodes/leafs to it if we end up needing to do it when
4619          * inserting the chunk item and updating device items as part of the
4620          * second phase of chunk allocation, performed by
4621          * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
4622          * large number of new block groups to create in our transaction
4623          * handle's new_bgs list to avoid exhausting the chunk block reserve
4624          * in extreme cases - like having a single transaction create many new
4625          * block groups when starting to write out the free space caches of all
4626          * the block groups that were made dirty during the lifetime of the
4627          * transaction.
4628          */
4629         if (trans->can_flush_pending_bgs &&
4630             trans->chunk_bytes_reserved >= (u64)SZ_2M) {
4631                 btrfs_create_pending_block_groups(trans, extent_root);
4632                 btrfs_trans_release_chunk_metadata(trans);
4633         }
4634         return ret;
4635 }
4636
4637 static int can_overcommit(struct btrfs_root *root,
4638                           struct btrfs_space_info *space_info, u64 bytes,
4639                           enum btrfs_reserve_flush_enum flush)
4640 {
4641         struct btrfs_block_rsv *global_rsv;
4642         u64 profile;
4643         u64 space_size;
4644         u64 avail;
4645         u64 used;
4646
4647         /* Don't overcommit when in mixed mode. */
4648         if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
4649                 return 0;
4650
4651         BUG_ON(root->fs_info == NULL);
4652         global_rsv = &root->fs_info->global_block_rsv;
4653         profile = btrfs_get_alloc_profile(root, 0);
4654         used = space_info->bytes_used + space_info->bytes_reserved +
4655                 space_info->bytes_pinned + space_info->bytes_readonly;
4656
4657         /*
4658          * We only want to allow over committing if we have lots of actual space
4659          * free, but if we don't have enough space to handle the global reserve
4660          * space then we could end up having a real enospc problem when trying
4661          * to allocate a chunk or some other such important allocation.
4662          */
4663         spin_lock(&global_rsv->lock);
4664         space_size = calc_global_rsv_need_space(global_rsv);
4665         spin_unlock(&global_rsv->lock);
4666         if (used + space_size >= space_info->total_bytes)
4667                 return 0;
4668
4669         used += space_info->bytes_may_use;
4670
4671         spin_lock(&root->fs_info->free_chunk_lock);
4672         avail = root->fs_info->free_chunk_space;
4673         spin_unlock(&root->fs_info->free_chunk_lock);
4674
4675         /*
4676          * If we have dup, raid1 or raid10 then only half of the free
4677          * space is actually useable.  For raid56, the space info used
4678          * doesn't include the parity drive, so we don't have to
4679          * change the math
4680          */
4681         if (profile & (BTRFS_BLOCK_GROUP_DUP |
4682                        BTRFS_BLOCK_GROUP_RAID1 |
4683                        BTRFS_BLOCK_GROUP_RAID10))
4684                 avail >>= 1;
4685
4686         /*
4687          * If we aren't flushing all things, let us overcommit up to
4688          * 1/2th of the space. If we can flush, don't let us overcommit
4689          * too much, let it overcommit up to 1/8 of the space.
4690          */
4691         if (flush == BTRFS_RESERVE_FLUSH_ALL)
4692                 avail >>= 3;
4693         else
4694                 avail >>= 1;
4695
4696         if (used + bytes < space_info->total_bytes + avail)
4697                 return 1;
4698         return 0;
4699 }
4700
4701 static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
4702                                          unsigned long nr_pages, int nr_items)
4703 {
4704         struct super_block *sb = root->fs_info->sb;
4705
4706         if (down_read_trylock(&sb->s_umount)) {
4707                 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
4708                 up_read(&sb->s_umount);
4709         } else {
4710                 /*
4711                  * We needn't worry the filesystem going from r/w to r/o though
4712                  * we don't acquire ->s_umount mutex, because the filesystem
4713                  * should guarantee the delalloc inodes list be empty after
4714                  * the filesystem is readonly(all dirty pages are written to
4715                  * the disk).
4716                  */
4717                 btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
4718                 if (!current->journal_info)
4719                         btrfs_wait_ordered_roots(root->fs_info, nr_items,
4720                                                  0, (u64)-1);
4721         }
4722 }
4723
4724 static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim)
4725 {
4726         u64 bytes;
4727         int nr;
4728
4729         bytes = btrfs_calc_trans_metadata_size(root, 1);
4730         nr = (int)div64_u64(to_reclaim, bytes);
4731         if (!nr)
4732                 nr = 1;
4733         return nr;
4734 }
4735
4736 #define EXTENT_SIZE_PER_ITEM    SZ_256K
4737
4738 /*
4739  * shrink metadata reservation for delalloc
4740  */
4741 static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
4742                             bool wait_ordered)
4743 {
4744         struct btrfs_block_rsv *block_rsv;
4745         struct btrfs_space_info *space_info;
4746         struct btrfs_trans_handle *trans;
4747         u64 delalloc_bytes;
4748         u64 max_reclaim;
4749         long time_left;
4750         unsigned long nr_pages;
4751         int loops;
4752         int items;
4753         enum btrfs_reserve_flush_enum flush;
4754
4755         /* Calc the number of the pages we need flush for space reservation */
4756         items = calc_reclaim_items_nr(root, to_reclaim);
4757         to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM;
4758
4759         trans = (struct btrfs_trans_handle *)current->journal_info;
4760         block_rsv = &root->fs_info->delalloc_block_rsv;
4761         space_info = block_rsv->space_info;
4762
4763         delalloc_bytes = percpu_counter_sum_positive(
4764                                                 &root->fs_info->delalloc_bytes);
4765         if (delalloc_bytes == 0) {
4766                 if (trans)
4767                         return;
4768                 if (wait_ordered)
4769                         btrfs_wait_ordered_roots(root->fs_info, items,
4770                                                  0, (u64)-1);
4771                 return;
4772         }
4773
4774         loops = 0;
4775         while (delalloc_bytes && loops < 3) {
4776                 max_reclaim = min(delalloc_bytes, to_reclaim);
4777                 nr_pages = max_reclaim >> PAGE_SHIFT;
4778                 btrfs_writeback_inodes_sb_nr(root, nr_pages, items);
4779                 /*
4780                  * We need to wait for the async pages to actually start before
4781                  * we do anything.
4782                  */
4783                 max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages);
4784                 if (!max_reclaim)
4785                         goto skip_async;
4786
4787                 if (max_reclaim <= nr_pages)
4788                         max_reclaim = 0;
4789                 else
4790                         max_reclaim -= nr_pages;
4791
4792                 wait_event(root->fs_info->async_submit_wait,
4793                            atomic_read(&root->fs_info->async_delalloc_pages) <=
4794                            (int)max_reclaim);
4795 skip_async:
4796                 if (!trans)
4797                         flush = BTRFS_RESERVE_FLUSH_ALL;
4798                 else
4799                         flush = BTRFS_RESERVE_NO_FLUSH;
4800                 spin_lock(&space_info->lock);
4801                 if (can_overcommit(root, space_info, orig, flush)) {
4802                         spin_unlock(&space_info->lock);
4803                         break;
4804                 }
4805                 if (list_empty(&space_info->tickets) &&
4806                     list_empty(&space_info->priority_tickets)) {
4807                         spin_unlock(&space_info->lock);
4808                         break;
4809                 }
4810                 spin_unlock(&space_info->lock);
4811
4812                 loops++;
4813                 if (wait_ordered && !trans) {
4814                         btrfs_wait_ordered_roots(root->fs_info, items,
4815                                                  0, (u64)-1);
4816                 } else {
4817                         time_left = schedule_timeout_killable(1);
4818                         if (time_left)
4819                                 break;
4820                 }
4821                 delalloc_bytes = percpu_counter_sum_positive(
4822                                                 &root->fs_info->delalloc_bytes);
4823         }
4824 }
4825
4826 /**
4827  * maybe_commit_transaction - possibly commit the transaction if its ok to
4828  * @root - the root we're allocating for
4829  * @bytes - the number of bytes we want to reserve
4830  * @force - force the commit
4831  *
4832  * This will check to make sure that committing the transaction will actually
4833  * get us somewhere and then commit the transaction if it does.  Otherwise it
4834  * will return -ENOSPC.
4835  */
4836 static int may_commit_transaction(struct btrfs_root *root,
4837                                   struct btrfs_space_info *space_info,
4838                                   u64 bytes, int force)
4839 {
4840         struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
4841         struct btrfs_trans_handle *trans;
4842
4843         trans = (struct btrfs_trans_handle *)current->journal_info;
4844         if (trans)
4845                 return -EAGAIN;
4846
4847         if (force)
4848                 goto commit;
4849
4850         /* See if there is enough pinned space to make this reservation */
4851         if (percpu_counter_compare(&space_info->total_bytes_pinned,
4852                                    bytes) >= 0)
4853                 goto commit;
4854
4855         /*
4856          * See if there is some space in the delayed insertion reservation for
4857          * this reservation.
4858          */
4859         if (space_info != delayed_rsv->space_info)
4860                 return -ENOSPC;
4861
4862         spin_lock(&delayed_rsv->lock);
4863         if (percpu_counter_compare(&space_info->total_bytes_pinned,
4864                                    bytes - delayed_rsv->size) >= 0) {
4865                 spin_unlock(&delayed_rsv->lock);
4866                 return -ENOSPC;
4867         }
4868         spin_unlock(&delayed_rsv->lock);
4869
4870 commit:
4871         trans = btrfs_join_transaction(root);
4872         if (IS_ERR(trans))
4873                 return -ENOSPC;
4874
4875         return btrfs_commit_transaction(trans, root);
4876 }
4877
4878 struct reserve_ticket {
4879         u64 bytes;
4880         int error;
4881         struct list_head list;
4882         wait_queue_head_t wait;
4883 };
4884
4885 static int flush_space(struct btrfs_root *root,
4886                        struct btrfs_space_info *space_info, u64 num_bytes,
4887                        u64 orig_bytes, int state)
4888 {
4889         struct btrfs_trans_handle *trans;
4890         int nr;
4891         int ret = 0;
4892
4893         switch (state) {
4894         case FLUSH_DELAYED_ITEMS_NR:
4895         case FLUSH_DELAYED_ITEMS:
4896                 if (state == FLUSH_DELAYED_ITEMS_NR)
4897                         nr = calc_reclaim_items_nr(root, num_bytes) * 2;
4898                 else
4899                         nr = -1;
4900
4901                 trans = btrfs_join_transaction(root);
4902                 if (IS_ERR(trans)) {
4903                         ret = PTR_ERR(trans);
4904                         break;
4905                 }
4906                 ret = btrfs_run_delayed_items_nr(trans, root, nr);
4907                 btrfs_end_transaction(trans, root);
4908                 break;
4909         case FLUSH_DELALLOC:
4910         case FLUSH_DELALLOC_WAIT:
4911                 shrink_delalloc(root, num_bytes * 2, orig_bytes,
4912                                 state == FLUSH_DELALLOC_WAIT);
4913                 break;
4914         case ALLOC_CHUNK:
4915                 trans = btrfs_join_transaction(root);
4916                 if (IS_ERR(trans)) {
4917                         ret = PTR_ERR(trans);
4918                         break;
4919                 }
4920                 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
4921                                      btrfs_get_alloc_profile(root, 0),
4922                                      CHUNK_ALLOC_NO_FORCE);
4923                 btrfs_end_transaction(trans, root);
4924                 if (ret > 0 || ret == -ENOSPC)
4925                         ret = 0;
4926                 break;
4927         case COMMIT_TRANS:
4928                 ret = may_commit_transaction(root, space_info, orig_bytes, 0);
4929                 break;
4930         default:
4931                 ret = -ENOSPC;
4932                 break;
4933         }
4934
4935         trace_btrfs_flush_space(root->fs_info, space_info->flags, num_bytes,
4936                                 orig_bytes, state, ret);
4937         return ret;
4938 }
4939
4940 static inline u64
4941 btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
4942                                  struct btrfs_space_info *space_info)
4943 {
4944         struct reserve_ticket *ticket;
4945         u64 used;
4946         u64 expected;
4947         u64 to_reclaim = 0;
4948
4949         list_for_each_entry(ticket, &space_info->tickets, list)
4950                 to_reclaim += ticket->bytes;
4951         list_for_each_entry(ticket, &space_info->priority_tickets, list)
4952                 to_reclaim += ticket->bytes;
4953         if (to_reclaim)
4954                 return to_reclaim;
4955
4956         to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
4957         if (can_overcommit(root, space_info, to_reclaim,
4958                            BTRFS_RESERVE_FLUSH_ALL))
4959                 return 0;
4960
4961         used = space_info->bytes_used + space_info->bytes_reserved +
4962                space_info->bytes_pinned + space_info->bytes_readonly +
4963                space_info->bytes_may_use;
4964         if (can_overcommit(root, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL))
4965                 expected = div_factor_fine(space_info->total_bytes, 95);
4966         else
4967                 expected = div_factor_fine(space_info->total_bytes, 90);
4968
4969         if (used > expected)
4970                 to_reclaim = used - expected;
4971         else
4972                 to_reclaim = 0;
4973         to_reclaim = min(to_reclaim, space_info->bytes_may_use +
4974                                      space_info->bytes_reserved);
4975         return to_reclaim;
4976 }
4977
4978 static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
4979                                         struct btrfs_root *root, u64 used)
4980 {
4981         u64 thresh = div_factor_fine(space_info->total_bytes, 98);
4982
4983         /* If we're just plain full then async reclaim just slows us down. */
4984         if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
4985                 return 0;
4986
4987         if (!btrfs_calc_reclaim_metadata_size(root, space_info))
4988                 return 0;
4989
4990         return (used >= thresh && !btrfs_fs_closing(root->fs_info) &&
4991                 !test_bit(BTRFS_FS_STATE_REMOUNTING,
4992                           &root->fs_info->fs_state));
4993 }
4994
4995 static void wake_all_tickets(struct list_head *head)
4996 {
4997         struct reserve_ticket *ticket;
4998
4999         while (!list_empty(head)) {
5000                 ticket = list_first_entry(head, struct reserve_ticket, list);
5001                 list_del_init(&ticket->list);
5002                 ticket->error = -ENOSPC;
5003                 wake_up(&ticket->wait);
5004         }
5005 }
5006
5007 /*
5008  * This is for normal flushers, we can wait all goddamned day if we want to.  We
5009  * will loop and continuously try to flush as long as we are making progress.
5010  * We count progress as clearing off tickets each time we have to loop.
5011  */
5012 static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
5013 {
5014         struct btrfs_fs_info *fs_info;
5015         struct btrfs_space_info *space_info;
5016         u64 to_reclaim;
5017         int flush_state;
5018         int commit_cycles = 0;
5019         u64 last_tickets_id;
5020
5021         fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
5022         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5023
5024         spin_lock(&space_info->lock);
5025         to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
5026                                                       space_info);
5027         if (!to_reclaim) {
5028                 space_info->flush = 0;
5029                 spin_unlock(&space_info->lock);
5030                 return;
5031         }
5032         last_tickets_id = space_info->tickets_id;
5033         spin_unlock(&space_info->lock);
5034
5035         flush_state = FLUSH_DELAYED_ITEMS_NR;
5036         do {
5037                 struct reserve_ticket *ticket;
5038                 int ret;
5039
5040                 ret = flush_space(fs_info->fs_root, space_info, to_reclaim,
5041                             to_reclaim, flush_state);
5042                 spin_lock(&space_info->lock);
5043                 if (list_empty(&space_info->tickets)) {
5044                         space_info->flush = 0;
5045                         spin_unlock(&space_info->lock);
5046                         return;
5047                 }
5048                 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
5049                                                               space_info);
5050                 ticket = list_first_entry(&space_info->tickets,
5051                                           struct reserve_ticket, list);
5052                 if (last_tickets_id == space_info->tickets_id) {
5053                         flush_state++;
5054                 } else {
5055                         last_tickets_id = space_info->tickets_id;
5056                         flush_state = FLUSH_DELAYED_ITEMS_NR;
5057                         if (commit_cycles)
5058                                 commit_cycles--;
5059                 }
5060
5061                 if (flush_state > COMMIT_TRANS) {
5062                         commit_cycles++;
5063                         if (commit_cycles > 2) {
5064                                 wake_all_tickets(&space_info->tickets);
5065                                 space_info->flush = 0;
5066                         } else {
5067                                 flush_state = FLUSH_DELAYED_ITEMS_NR;
5068                         }
5069                 }
5070                 spin_unlock(&space_info->lock);
5071         } while (flush_state <= COMMIT_TRANS);
5072 }
5073
5074 void btrfs_init_async_reclaim_work(struct work_struct *work)
5075 {
5076         INIT_WORK(work, btrfs_async_reclaim_metadata_space);
5077 }
5078
5079 static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
5080                                             struct btrfs_space_info *space_info,
5081                                             struct reserve_ticket *ticket)
5082 {
5083         u64 to_reclaim;
5084         int flush_state = FLUSH_DELAYED_ITEMS_NR;
5085
5086         spin_lock(&space_info->lock);
5087         to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
5088                                                       space_info);
5089         if (!to_reclaim) {
5090                 spin_unlock(&space_info->lock);
5091                 return;
5092         }
5093         spin_unlock(&space_info->lock);
5094
5095         do {
5096                 flush_space(fs_info->fs_root, space_info, to_reclaim,
5097                             to_reclaim, flush_state);
5098                 flush_state++;
5099                 spin_lock(&space_info->lock);
5100                 if (ticket->bytes == 0) {
5101                         spin_unlock(&space_info->lock);
5102                         return;
5103                 }
5104                 spin_unlock(&space_info->lock);
5105
5106                 /*
5107                  * Priority flushers can't wait on delalloc without
5108                  * deadlocking.
5109                  */
5110                 if (flush_state == FLUSH_DELALLOC ||
5111                     flush_state == FLUSH_DELALLOC_WAIT)
5112                         flush_state = ALLOC_CHUNK;
5113         } while (flush_state < COMMIT_TRANS);
5114 }
5115
5116 static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
5117                                struct btrfs_space_info *space_info,
5118                                struct reserve_ticket *ticket, u64 orig_bytes)
5119
5120 {
5121         DEFINE_WAIT(wait);
5122         int ret = 0;
5123
5124         spin_lock(&space_info->lock);
5125         while (ticket->bytes > 0 && ticket->error == 0) {
5126                 ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
5127                 if (ret) {
5128                         ret = -EINTR;
5129                         break;
5130                 }
5131                 spin_unlock(&space_info->lock);
5132
5133                 schedule();
5134
5135                 finish_wait(&ticket->wait, &wait);
5136                 spin_lock(&space_info->lock);
5137         }
5138         if (!ret)
5139                 ret = ticket->error;
5140         if (!list_empty(&ticket->list))
5141                 list_del_init(&ticket->list);
5142         if (ticket->bytes && ticket->bytes < orig_bytes) {
5143                 u64 num_bytes = orig_bytes - ticket->bytes;
5144                 space_info->bytes_may_use -= num_bytes;
5145                 trace_btrfs_space_reservation(fs_info, "space_info",
5146                                               space_info->flags, num_bytes, 0);
5147         }
5148         spin_unlock(&space_info->lock);
5149
5150         return ret;
5151 }
5152
5153 /**
5154  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
5155  * @root - the root we're allocating for
5156  * @space_info - the space info we want to allocate from
5157  * @orig_bytes - the number of bytes we want
5158  * @flush - whether or not we can flush to make our reservation
5159  *
5160  * This will reserve orig_bytes number of bytes from the space info associated
5161  * with the block_rsv.  If there is not enough space it will make an attempt to
5162  * flush out space to make room.  It will do this by flushing delalloc if
5163  * possible or committing the transaction.  If flush is 0 then no attempts to
5164  * regain reservations will be made and this will fail if there is not enough
5165  * space already.
5166  */
5167 static int __reserve_metadata_bytes(struct btrfs_root *root,
5168                                     struct btrfs_space_info *space_info,
5169                                     u64 orig_bytes,
5170                                     enum btrfs_reserve_flush_enum flush)
5171 {
5172         struct reserve_ticket ticket;
5173         u64 used;
5174         int ret = 0;
5175
5176         ASSERT(orig_bytes);
5177         ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
5178
5179         spin_lock(&space_info->lock);
5180         ret = -ENOSPC;
5181         used = space_info->bytes_used + space_info->bytes_reserved +
5182                 space_info->bytes_pinned + space_info->bytes_readonly +
5183                 space_info->bytes_may_use;
5184
5185         /*
5186          * If we have enough space then hooray, make our reservation and carry
5187          * on.  If not see if we can overcommit, and if we can, hooray carry on.
5188          * If not things get more complicated.
5189          */
5190         if (used + orig_bytes <= space_info->total_bytes) {
5191                 space_info->bytes_may_use += orig_bytes;
5192                 trace_btrfs_space_reservation(root->fs_info, "space_info",
5193                                               space_info->flags, orig_bytes,
5194                                               1);
5195                 ret = 0;
5196         } else if (can_overcommit(root, space_info, orig_bytes, flush)) {
5197                 space_info->bytes_may_use += orig_bytes;
5198                 trace_btrfs_space_reservation(root->fs_info, "space_info",
5199                                               space_info->flags, orig_bytes,
5200                                               1);
5201                 ret = 0;
5202         }
5203
5204         /*
5205          * If we couldn't make a reservation then setup our reservation ticket
5206          * and kick the async worker if it's not already running.
5207          *
5208          * If we are a priority flusher then we just need to add our ticket to
5209          * the list and we will do our own flushing further down.
5210          */
5211         if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
5212                 ticket.bytes = orig_bytes;
5213                 ticket.error = 0;
5214                 init_waitqueue_head(&ticket.wait);
5215                 if (flush == BTRFS_RESERVE_FLUSH_ALL) {
5216                         list_add_tail(&ticket.list, &space_info->tickets);
5217                         if (!space_info->flush) {
5218                                 space_info->flush = 1;
5219                                 trace_btrfs_trigger_flush(root->fs_info,
5220                                                           space_info->flags,
5221                                                           orig_bytes, flush,
5222                                                           "enospc");
5223                                 queue_work(system_unbound_wq,
5224                                            &root->fs_info->async_reclaim_work);
5225                         }
5226                 } else {
5227                         list_add_tail(&ticket.list,
5228                                       &space_info->priority_tickets);
5229                 }
5230         } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
5231                 used += orig_bytes;
5232                 /*
5233                  * We will do the space reservation dance during log replay,
5234                  * which means we won't have fs_info->fs_root set, so don't do
5235                  * the async reclaim as we will panic.
5236                  */
5237                 if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags) &&
5238                     need_do_async_reclaim(space_info, root, used) &&
5239                     !work_busy(&root->fs_info->async_reclaim_work)) {
5240                         trace_btrfs_trigger_flush(root->fs_info,
5241                                                   space_info->flags,
5242                                                   orig_bytes, flush,
5243                                                   "preempt");
5244                         queue_work(system_unbound_wq,
5245                                    &root->fs_info->async_reclaim_work);
5246                 }
5247         }
5248         spin_unlock(&space_info->lock);
5249         if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
5250                 return ret;
5251
5252         if (flush == BTRFS_RESERVE_FLUSH_ALL)
5253                 return wait_reserve_ticket(root->fs_info, space_info, &ticket,
5254                                            orig_bytes);
5255
5256         ret = 0;
5257         priority_reclaim_metadata_space(root->fs_info, space_info, &ticket);
5258         spin_lock(&space_info->lock);
5259         if (ticket.bytes) {
5260                 if (ticket.bytes < orig_bytes) {
5261                         u64 num_bytes = orig_bytes - ticket.bytes;
5262                         space_info->bytes_may_use -= num_bytes;
5263                         trace_btrfs_space_reservation(root->fs_info,
5264                                         "space_info", space_info->flags,
5265                                         num_bytes, 0);
5266
5267                 }
5268                 list_del_init(&ticket.list);
5269                 ret = -ENOSPC;
5270         }
5271         spin_unlock(&space_info->lock);
5272         ASSERT(list_empty(&ticket.list));
5273         return ret;
5274 }
5275
5276 /**
5277  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
5278  * @root - the root we're allocating for
5279  * @block_rsv - the block_rsv we're allocating for
5280  * @orig_bytes - the number of bytes we want
5281  * @flush - whether or not we can flush to make our reservation
5282  *
5283  * This will reserve orgi_bytes number of bytes from the space info associated
5284  * with the block_rsv.  If there is not enough space it will make an attempt to
5285  * flush out space to make room.  It will do this by flushing delalloc if
5286  * possible or committing the transaction.  If flush is 0 then no attempts to
5287  * regain reservations will be made and this will fail if there is not enough
5288  * space already.
5289  */
5290 static int reserve_metadata_bytes(struct btrfs_root *root,
5291                                   struct btrfs_block_rsv *block_rsv,
5292                                   u64 orig_bytes,
5293                                   enum btrfs_reserve_flush_enum flush)
5294 {
5295         int ret;
5296
5297         ret = __reserve_metadata_bytes(root, block_rsv->space_info, orig_bytes,
5298                                        flush);
5299         if (ret == -ENOSPC &&
5300             unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
5301                 struct btrfs_block_rsv *global_rsv =
5302                         &root->fs_info->global_block_rsv;
5303
5304                 if (block_rsv != global_rsv &&
5305                     !block_rsv_use_bytes(global_rsv, orig_bytes))
5306                         ret = 0;
5307         }
5308         if (ret == -ENOSPC)
5309                 trace_btrfs_space_reservation(root->fs_info,
5310                                               "space_info:enospc",
5311                                               block_rsv->space_info->flags,
5312                                               orig_bytes, 1);
5313         return ret;
5314 }
5315
5316 static struct btrfs_block_rsv *get_block_rsv(
5317                                         const struct btrfs_trans_handle *trans,
5318                                         const struct btrfs_root *root)
5319 {
5320         struct btrfs_block_rsv *block_rsv = NULL;
5321
5322         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
5323             (root == root->fs_info->csum_root && trans->adding_csums) ||
5324              (root == root->fs_info->uuid_root))
5325                 block_rsv = trans->block_rsv;
5326
5327         if (!block_rsv)
5328                 block_rsv = root->block_rsv;
5329
5330         if (!block_rsv)
5331                 block_rsv = &root->fs_info->empty_block_rsv;
5332
5333         return block_rsv;
5334 }
5335
5336 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
5337                                u64 num_bytes)
5338 {
5339         int ret = -ENOSPC;
5340         spin_lock(&block_rsv->lock);
5341         if (block_rsv->reserved >= num_bytes) {
5342                 block_rsv->reserved -= num_bytes;
5343                 if (block_rsv->reserved < block_rsv->size)
5344                         block_rsv->full = 0;
5345                 ret = 0;
5346         }
5347         spin_unlock(&block_rsv->lock);
5348         return ret;
5349 }
5350
5351 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
5352                                 u64 num_bytes, int update_size)
5353 {
5354         spin_lock(&block_rsv->lock);
5355         block_rsv->reserved += num_bytes;
5356         if (update_size)
5357                 block_rsv->size += num_bytes;
5358         else if (block_rsv->reserved >= block_rsv->size)
5359                 block_rsv->full = 1;
5360         spin_unlock(&block_rsv->lock);
5361 }
5362
5363 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
5364                              struct btrfs_block_rsv *dest, u64 num_bytes,
5365                              int min_factor)
5366 {
5367         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5368         u64 min_bytes;
5369
5370         if (global_rsv->space_info != dest->space_info)
5371                 return -ENOSPC;
5372
5373         spin_lock(&global_rsv->lock);
5374         min_bytes = div_factor(global_rsv->size, min_factor);
5375         if (global_rsv->reserved < min_bytes + num_bytes) {
5376                 spin_unlock(&global_rsv->lock);
5377                 return -ENOSPC;
5378         }
5379         global_rsv->reserved -= num_bytes;
5380         if (global_rsv->reserved < global_rsv->size)
5381                 global_rsv->full = 0;
5382         spin_unlock(&global_rsv->lock);
5383
5384         block_rsv_add_bytes(dest, num_bytes, 1);
5385         return 0;
5386 }
5387
5388 /*
5389  * This is for space we already have accounted in space_info->bytes_may_use, so
5390  * basically when we're returning space from block_rsv's.
5391  */
5392 static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
5393                                      struct btrfs_space_info *space_info,
5394                                      u64 num_bytes)
5395 {
5396         struct reserve_ticket *ticket;
5397         struct list_head *head;
5398         u64 used;
5399         enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
5400         bool check_overcommit = false;
5401
5402         spin_lock(&space_info->lock);
5403         head = &space_info->priority_tickets;
5404
5405         /*
5406          * If we are over our limit then we need to check and see if we can
5407          * overcommit, and if we can't then we just need to free up our space
5408          * and not satisfy any requests.
5409          */
5410         used = space_info->bytes_used + space_info->bytes_reserved +
5411                 space_info->bytes_pinned + space_info->bytes_readonly +
5412                 space_info->bytes_may_use;
5413         if (used - num_bytes >= space_info->total_bytes)
5414                 check_overcommit = true;
5415 again:
5416         while (!list_empty(head) && num_bytes) {
5417                 ticket = list_first_entry(head, struct reserve_ticket,
5418                                           list);
5419                 /*
5420                  * We use 0 bytes because this space is already reserved, so
5421                  * adding the ticket space would be a double count.
5422                  */
5423                 if (check_overcommit &&
5424                     !can_overcommit(fs_info->extent_root, space_info, 0,
5425                                     flush))
5426                         break;
5427                 if (num_bytes >= ticket->bytes) {
5428                         list_del_init(&ticket->list);
5429                         num_bytes -= ticket->bytes;
5430                         ticket->bytes = 0;
5431                         space_info->tickets_id++;
5432                         wake_up(&ticket->wait);
5433                 } else {
5434                         ticket->bytes -= num_bytes;
5435                         num_bytes = 0;
5436                 }
5437         }
5438
5439         if (num_bytes && head == &space_info->priority_tickets) {
5440                 head = &space_info->tickets;
5441                 flush = BTRFS_RESERVE_FLUSH_ALL;
5442                 goto again;
5443         }
5444         space_info->bytes_may_use -= num_bytes;
5445         trace_btrfs_space_reservation(fs_info, "space_info",
5446                                       space_info->flags, num_bytes, 0);
5447         spin_unlock(&space_info->lock);
5448 }
5449
5450 /*
5451  * This is for newly allocated space that isn't accounted in
5452  * space_info->bytes_may_use yet.  So if we allocate a chunk or unpin an extent
5453  * we use this helper.
5454  */
5455 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
5456                                      struct btrfs_space_info *space_info,
5457                                      u64 num_bytes)
5458 {
5459         struct reserve_ticket *ticket;
5460         struct list_head *head = &space_info->priority_tickets;
5461
5462 again:
5463         while (!list_empty(head) && num_bytes) {
5464                 ticket = list_first_entry(head, struct reserve_ticket,
5465                                           list);
5466                 if (num_bytes >= ticket->bytes) {
5467                         trace_btrfs_space_reservation(fs_info, "space_info",
5468                                                       space_info->flags,
5469                                                       ticket->bytes, 1);
5470                         list_del_init(&ticket->list);
5471                         num_bytes -= ticket->bytes;
5472                         space_info->bytes_may_use += ticket->bytes;
5473                         ticket->bytes = 0;
5474                         space_info->tickets_id++;
5475                         wake_up(&ticket->wait);
5476                 } else {
5477                         trace_btrfs_space_reservation(fs_info, "space_info",
5478                                                       space_info->flags,
5479                                                       num_bytes, 1);
5480                         space_info->bytes_may_use += num_bytes;
5481                         ticket->bytes -= num_bytes;
5482                         num_bytes = 0;
5483                 }
5484         }
5485
5486         if (num_bytes && head == &space_info->priority_tickets) {
5487                 head = &space_info->tickets;
5488                 goto again;
5489         }
5490 }
5491
5492 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
5493                                     struct btrfs_block_rsv *block_rsv,
5494                                     struct btrfs_block_rsv *dest, u64 num_bytes)
5495 {
5496         struct btrfs_space_info *space_info = block_rsv->space_info;
5497
5498         spin_lock(&block_rsv->lock);
5499         if (num_bytes == (u64)-1)
5500                 num_bytes = block_rsv->size;
5501         block_rsv->size -= num_bytes;
5502         if (block_rsv->reserved >= block_rsv->size) {
5503                 num_bytes = block_rsv->reserved - block_rsv->size;
5504                 block_rsv->reserved = block_rsv->size;
5505                 block_rsv->full = 1;
5506         } else {
5507                 num_bytes = 0;
5508         }
5509         spin_unlock(&block_rsv->lock);
5510
5511         if (num_bytes > 0) {
5512                 if (dest) {
5513                         spin_lock(&dest->lock);
5514                         if (!dest->full) {
5515                                 u64 bytes_to_add;
5516
5517                                 bytes_to_add = dest->size - dest->reserved;
5518                                 bytes_to_add = min(num_bytes, bytes_to_add);
5519                                 dest->reserved += bytes_to_add;
5520                                 if (dest->reserved >= dest->size)
5521                                         dest->full = 1;
5522                                 num_bytes -= bytes_to_add;
5523                         }
5524                         spin_unlock(&dest->lock);
5525                 }
5526                 if (num_bytes)
5527                         space_info_add_old_bytes(fs_info, space_info,
5528                                                  num_bytes);
5529         }
5530 }
5531
5532 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
5533                             struct btrfs_block_rsv *dst, u64 num_bytes,
5534                             int update_size)
5535 {
5536         int ret;
5537
5538         ret = block_rsv_use_bytes(src, num_bytes);
5539         if (ret)
5540                 return ret;
5541
5542         block_rsv_add_bytes(dst, num_bytes, update_size);
5543         return 0;
5544 }
5545
5546 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
5547 {
5548         memset(rsv, 0, sizeof(*rsv));
5549         spin_lock_init(&rsv->lock);
5550         rsv->type = type;
5551 }
5552
5553 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
5554                                               unsigned short type)
5555 {
5556         struct btrfs_block_rsv *block_rsv;
5557         struct btrfs_fs_info *fs_info = root->fs_info;
5558
5559         block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
5560         if (!block_rsv)
5561                 return NULL;
5562
5563         btrfs_init_block_rsv(block_rsv, type);
5564         block_rsv->space_info = __find_space_info(fs_info,
5565                                                   BTRFS_BLOCK_GROUP_METADATA);
5566         return block_rsv;
5567 }
5568
5569 void btrfs_free_block_rsv(struct btrfs_root *root,
5570                           struct btrfs_block_rsv *rsv)
5571 {
5572         if (!rsv)
5573                 return;
5574         btrfs_block_rsv_release(root, rsv, (u64)-1);
5575         kfree(rsv);
5576 }
5577
/* Free @rsv without releasing any bytes it may still hold. */
void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv)
{
        kfree(rsv);
}
5582
5583 int btrfs_block_rsv_add(struct btrfs_root *root,
5584                         struct btrfs_block_rsv *block_rsv, u64 num_bytes,
5585                         enum btrfs_reserve_flush_enum flush)
5586 {
5587         int ret;
5588
5589         if (num_bytes == 0)
5590                 return 0;
5591
5592         ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5593         if (!ret) {
5594                 block_rsv_add_bytes(block_rsv, num_bytes, 1);
5595                 return 0;
5596         }
5597
5598         return ret;
5599 }
5600
5601 int btrfs_block_rsv_check(struct btrfs_root *root,
5602                           struct btrfs_block_rsv *block_rsv, int min_factor)
5603 {
5604         u64 num_bytes = 0;
5605         int ret = -ENOSPC;
5606
5607         if (!block_rsv)
5608                 return 0;
5609
5610         spin_lock(&block_rsv->lock);
5611         num_bytes = div_factor(block_rsv->size, min_factor);
5612         if (block_rsv->reserved >= num_bytes)
5613                 ret = 0;
5614         spin_unlock(&block_rsv->lock);
5615
5616         return ret;
5617 }
5618
5619 int btrfs_block_rsv_refill(struct btrfs_root *root,
5620                            struct btrfs_block_rsv *block_rsv, u64 min_reserved,
5621                            enum btrfs_reserve_flush_enum flush)
5622 {
5623         u64 num_bytes = 0;
5624         int ret = -ENOSPC;
5625
5626         if (!block_rsv)
5627                 return 0;
5628
5629         spin_lock(&block_rsv->lock);
5630         num_bytes = min_reserved;
5631         if (block_rsv->reserved >= num_bytes)
5632                 ret = 0;
5633         else
5634                 num_bytes -= block_rsv->reserved;
5635         spin_unlock(&block_rsv->lock);
5636
5637         if (!ret)
5638                 return 0;
5639
5640         ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5641         if (!ret) {
5642                 block_rsv_add_bytes(block_rsv, num_bytes, 0);
5643                 return 0;
5644         }
5645
5646         return ret;
5647 }
5648
5649 void btrfs_block_rsv_release(struct btrfs_root *root,
5650                              struct btrfs_block_rsv *block_rsv,
5651                              u64 num_bytes)
5652 {
5653         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
5654         if (global_rsv == block_rsv ||
5655             block_rsv->space_info != global_rsv->space_info)
5656                 global_rsv = NULL;
5657         block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
5658                                 num_bytes);
5659 }
5660
5661 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
5662 {
5663         struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
5664         struct btrfs_space_info *sinfo = block_rsv->space_info;
5665         u64 num_bytes;
5666
5667         /*
5668          * The global block rsv is based on the size of the extent tree, the
5669          * checksum tree and the root tree.  If the fs is empty we want to set
5670          * it to a minimal amount for safety.
5671          */
5672         num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
5673                 btrfs_root_used(&fs_info->csum_root->root_item) +
5674                 btrfs_root_used(&fs_info->tree_root->root_item);
5675         num_bytes = max_t(u64, num_bytes, SZ_16M);
5676
5677         spin_lock(&sinfo->lock);
5678         spin_lock(&block_rsv->lock);
5679
5680         block_rsv->size = min_t(u64, num_bytes, SZ_512M);
5681
5682         if (block_rsv->reserved < block_rsv->size) {
5683                 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
5684                         sinfo->bytes_reserved + sinfo->bytes_readonly +
5685                         sinfo->bytes_may_use;
5686                 if (sinfo->total_bytes > num_bytes) {
5687                         num_bytes = sinfo->total_bytes - num_bytes;
5688                         num_bytes = min(num_bytes,
5689                                         block_rsv->size - block_rsv->reserved);
5690                         block_rsv->reserved += num_bytes;
5691                         sinfo->bytes_may_use += num_bytes;
5692                         trace_btrfs_space_reservation(fs_info, "space_info",
5693                                                       sinfo->flags, num_bytes,
5694                                                       1);
5695                 }
5696         } else if (block_rsv->reserved > block_rsv->size) {
5697                 num_bytes = block_rsv->reserved - block_rsv->size;
5698                 sinfo->bytes_may_use -= num_bytes;
5699                 trace_btrfs_space_reservation(fs_info, "space_info",
5700                                       sinfo->flags, num_bytes, 0);
5701                 block_rsv->reserved = block_rsv->size;
5702         }
5703
5704         if (block_rsv->reserved == block_rsv->size)
5705                 block_rsv->full = 1;
5706         else
5707                 block_rsv->full = 0;
5708
5709         spin_unlock(&block_rsv->lock);
5710         spin_unlock(&sinfo->lock);
5711 }
5712
5713 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
5714 {
5715         struct btrfs_space_info *space_info;
5716
5717         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
5718         fs_info->chunk_block_rsv.space_info = space_info;
5719
5720         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5721         fs_info->global_block_rsv.space_info = space_info;
5722         fs_info->delalloc_block_rsv.space_info = space_info;
5723         fs_info->trans_block_rsv.space_info = space_info;
5724         fs_info->empty_block_rsv.space_info = space_info;
5725         fs_info->delayed_block_rsv.space_info = space_info;
5726
5727         fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
5728         fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
5729         fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
5730         fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
5731         if (fs_info->quota_root)
5732                 fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
5733         fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
5734
5735         update_global_block_rsv(fs_info);
5736 }
5737
5738 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
5739 {
5740         block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
5741                                 (u64)-1);
5742         WARN_ON(fs_info->delalloc_block_rsv.size > 0);
5743         WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
5744         WARN_ON(fs_info->trans_block_rsv.size > 0);
5745         WARN_ON(fs_info->trans_block_rsv.reserved > 0);
5746         WARN_ON(fs_info->chunk_block_rsv.size > 0);
5747         WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
5748         WARN_ON(fs_info->delayed_block_rsv.size > 0);
5749         WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
5750 }
5751
5752 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
5753                                   struct btrfs_root *root)
5754 {
5755         if (!trans->block_rsv)
5756                 return;
5757
5758         if (!trans->bytes_reserved)
5759                 return;
5760
5761         trace_btrfs_space_reservation(root->fs_info, "transaction",
5762                                       trans->transid, trans->bytes_reserved, 0);
5763         btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
5764         trans->bytes_reserved = 0;
5765 }
5766
5767 /*
5768  * To be called after all the new block groups attached to the transaction
5769  * handle have been created (btrfs_create_pending_block_groups()).
5770  */
5771 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
5772 {
5773         struct btrfs_fs_info *fs_info = trans->fs_info;
5774
5775         if (!trans->chunk_bytes_reserved)
5776                 return;
5777
5778         WARN_ON_ONCE(!list_empty(&trans->new_bgs));
5779
5780         block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
5781                                 trans->chunk_bytes_reserved);
5782         trans->chunk_bytes_reserved = 0;
5783 }
5784
5785 /* Can only return 0 or -ENOSPC */
5786 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
5787                                   struct inode *inode)
5788 {
5789         struct btrfs_root *root = BTRFS_I(inode)->root;
5790         /*
5791          * We always use trans->block_rsv here as we will have reserved space
5792          * for our orphan when starting the transaction, using get_block_rsv()
5793          * here will sometimes make us choose the wrong block rsv as we could be
5794          * doing a reloc inode for a non refcounted root.
5795          */
5796         struct btrfs_block_rsv *src_rsv = trans->block_rsv;
5797         struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
5798
5799         /*
5800          * We need to hold space in order to delete our orphan item once we've
5801          * added it, so this takes the reservation so we can release it later
5802          * when we are truly done with the orphan item.
5803          */
5804         u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
5805         trace_btrfs_space_reservation(root->fs_info, "orphan",
5806                                       btrfs_ino(inode), num_bytes, 1);
5807         return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
5808 }
5809
5810 void btrfs_orphan_release_metadata(struct inode *inode)
5811 {
5812         struct btrfs_root *root = BTRFS_I(inode)->root;
5813         u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
5814         trace_btrfs_space_reservation(root->fs_info, "orphan",
5815                                       btrfs_ino(inode), num_bytes, 0);
5816         btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
5817 }
5818
5819 /*
5820  * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
5821  * root: the root of the parent directory
5822  * rsv: block reservation
5823  * items: the number of items that we need do reservation
5824  * qgroup_reserved: used to return the reserved size in qgroup
5825  *
5826  * This function is used to reserve the space for snapshot/subvolume
5827  * creation and deletion. Those operations are different with the
5828  * common file/directory operations, they change two fs/file trees
5829  * and root tree, the number of items that the qgroup reserves is
5830  * different with the free space reservation. So we can not use
5831  * the space reservation mechanism in start_transaction().
5832  */
5833 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
5834                                      struct btrfs_block_rsv *rsv,
5835                                      int items,
5836                                      u64 *qgroup_reserved,
5837                                      bool use_global_rsv)
5838 {
5839         u64 num_bytes;
5840         int ret;
5841         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
5842
5843         if (test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags)) {
5844                 /* One for parent inode, two for dir entries */
5845                 num_bytes = 3 * root->nodesize;
5846                 ret = btrfs_qgroup_reserve_meta(root, num_bytes);
5847                 if (ret)
5848                         return ret;
5849         } else {
5850                 num_bytes = 0;
5851         }
5852
5853         *qgroup_reserved = num_bytes;
5854
5855         num_bytes = btrfs_calc_trans_metadata_size(root, items);
5856         rsv->space_info = __find_space_info(root->fs_info,
5857                                             BTRFS_BLOCK_GROUP_METADATA);
5858         ret = btrfs_block_rsv_add(root, rsv, num_bytes,
5859                                   BTRFS_RESERVE_FLUSH_ALL);
5860
5861         if (ret == -ENOSPC && use_global_rsv)
5862                 ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
5863
5864         if (ret && *qgroup_reserved)
5865                 btrfs_qgroup_free_meta(root, *qgroup_reserved);
5866
5867         return ret;
5868 }
5869
5870 void btrfs_subvolume_release_metadata(struct btrfs_root *root,
5871                                       struct btrfs_block_rsv *rsv,
5872                                       u64 qgroup_reserved)
5873 {
5874         btrfs_block_rsv_release(root, rsv, (u64)-1);
5875 }
5876
5877 /**
5878  * drop_outstanding_extent - drop an outstanding extent
5879  * @inode: the inode we're dropping the extent for
5880  * @num_bytes: the number of bytes we're releasing.
5881  *
5882  * This is called when we are freeing up an outstanding extent, either called
5883  * after an error or after an extent is written.  This will return the number of
5884  * reserved extents that need to be freed.  This must be called with
5885  * BTRFS_I(inode)->lock held.
5886  */
5887 static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes)
5888 {
5889         unsigned drop_inode_space = 0;
5890         unsigned dropped_extents = 0;
5891         unsigned num_extents = 0;
5892
5893         num_extents = (unsigned)div64_u64(num_bytes +
5894                                           BTRFS_MAX_EXTENT_SIZE - 1,
5895                                           BTRFS_MAX_EXTENT_SIZE);
5896         ASSERT(num_extents);
5897         ASSERT(BTRFS_I(inode)->outstanding_extents >= num_extents);
5898         BTRFS_I(inode)->outstanding_extents -= num_extents;
5899
5900         if (BTRFS_I(inode)->outstanding_extents == 0 &&
5901             test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
5902                                &BTRFS_I(inode)->runtime_flags))
5903                 drop_inode_space = 1;
5904
5905         /*
5906          * If we have more or the same amount of outstanding extents than we have
5907          * reserved then we need to leave the reserved extents count alone.
5908          */
5909         if (BTRFS_I(inode)->outstanding_extents >=
5910             BTRFS_I(inode)->reserved_extents)
5911                 return drop_inode_space;
5912
5913         dropped_extents = BTRFS_I(inode)->reserved_extents -
5914                 BTRFS_I(inode)->outstanding_extents;
5915         BTRFS_I(inode)->reserved_extents -= dropped_extents;
5916         return dropped_extents + drop_inode_space;
5917 }
5918
5919 /**
5920  * calc_csum_metadata_size - return the amount of metadata space that must be
5921  *      reserved/freed for the given bytes.
5922  * @inode: the inode we're manipulating
5923  * @num_bytes: the number of bytes in question
5924  * @reserve: 1 if we are reserving space, 0 if we are freeing space
5925  *
5926  * This adjusts the number of csum_bytes in the inode and then returns the
5927  * correct amount of metadata that must either be reserved or freed.  We
5928  * calculate how many checksums we can fit into one leaf and then divide the
5929  * number of bytes that will need to be checksumed by this value to figure out
5930  * how many checksums will be required.  If we are adding bytes then the number
5931  * may go up and we will return the number of additional bytes that must be
5932  * reserved.  If it is going down we will return the number of bytes that must
5933  * be freed.
5934  *
5935  * This must be called with BTRFS_I(inode)->lock held.
5936  */
5937 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
5938                                    int reserve)
5939 {
5940         struct btrfs_root *root = BTRFS_I(inode)->root;
5941         u64 old_csums, num_csums;
5942
5943         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
5944             BTRFS_I(inode)->csum_bytes == 0)
5945                 return 0;
5946
5947         old_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
5948         if (reserve)
5949                 BTRFS_I(inode)->csum_bytes += num_bytes;
5950         else
5951                 BTRFS_I(inode)->csum_bytes -= num_bytes;
5952         num_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
5953
5954         /* No change, no need to reserve more */
5955         if (old_csums == num_csums)
5956                 return 0;
5957
5958         if (reserve)
5959                 return btrfs_calc_trans_metadata_size(root,
5960                                                       num_csums - old_csums);
5961
5962         return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
5963 }
5964
5965 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
5966 {
5967         struct btrfs_root *root = BTRFS_I(inode)->root;
5968         struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
5969         u64 to_reserve = 0;
5970         u64 csum_bytes;
5971         unsigned nr_extents = 0;
5972         enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
5973         int ret = 0;
5974         bool delalloc_lock = true;
5975         u64 to_free = 0;
5976         unsigned dropped;
5977         bool release_extra = false;
5978
5979         /* If we are a free space inode we need to not flush since we will be in
5980          * the middle of a transaction commit.  We also don't need the delalloc
5981          * mutex since we won't race with anybody.  We need this mostly to make
5982          * lockdep shut its filthy mouth.
5983          *
5984          * If we have a transaction open (can happen if we call truncate_block
5985          * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
5986          */
5987         if (btrfs_is_free_space_inode(inode)) {
5988                 flush = BTRFS_RESERVE_NO_FLUSH;
5989                 delalloc_lock = false;
5990         } else if (current->journal_info) {
5991                 flush = BTRFS_RESERVE_FLUSH_LIMIT;
5992         }
5993
5994         if (flush != BTRFS_RESERVE_NO_FLUSH &&
5995             btrfs_transaction_in_commit(root->fs_info))
5996                 schedule_timeout(1);
5997
5998         if (delalloc_lock)
5999                 mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
6000
6001         num_bytes = ALIGN(num_bytes, root->sectorsize);
6002
6003         spin_lock(&BTRFS_I(inode)->lock);
6004         nr_extents = (unsigned)div64_u64(num_bytes +
6005                                          BTRFS_MAX_EXTENT_SIZE - 1,
6006                                          BTRFS_MAX_EXTENT_SIZE);
6007         BTRFS_I(inode)->outstanding_extents += nr_extents;
6008
6009         nr_extents = 0;
6010         if (BTRFS_I(inode)->outstanding_extents >
6011             BTRFS_I(inode)->reserved_extents)
6012                 nr_extents += BTRFS_I(inode)->outstanding_extents -
6013                         BTRFS_I(inode)->reserved_extents;
6014
6015         /* We always want to reserve a slot for updating the inode. */
6016         to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents + 1);
6017         to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
6018         csum_bytes = BTRFS_I(inode)->csum_bytes;
6019         spin_unlock(&BTRFS_I(inode)->lock);
6020
6021         if (test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags)) {
6022                 ret = btrfs_qgroup_reserve_meta(root,
6023                                 nr_extents * root->nodesize);
6024                 if (ret)
6025                         goto out_fail;
6026         }
6027
6028         ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush);
6029         if (unlikely(ret)) {
6030                 btrfs_qgroup_free_meta(root, nr_extents * root->nodesize);
6031                 goto out_fail;
6032         }
6033
6034         spin_lock(&BTRFS_I(inode)->lock);
6035         if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
6036                              &BTRFS_I(inode)->runtime_flags)) {
6037                 to_reserve -= btrfs_calc_trans_metadata_size(root, 1);
6038                 release_extra = true;
6039         }
6040         BTRFS_I(inode)->reserved_extents += nr_extents;
6041         spin_unlock(&BTRFS_I(inode)->lock);
6042
6043         if (delalloc_lock)
6044                 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
6045
6046         if (to_reserve)
6047                 trace_btrfs_space_reservation(root->fs_info, "delalloc",
6048                                               btrfs_ino(inode), to_reserve, 1);
6049         if (release_extra)
6050                 btrfs_block_rsv_release(root, block_rsv,
6051                                         btrfs_calc_trans_metadata_size(root,
6052                                                                        1));
6053         return 0;
6054
6055 out_fail:
6056         spin_lock(&BTRFS_I(inode)->lock);
6057         dropped = drop_outstanding_extent(inode, num_bytes);
6058         /*
6059          * If the inodes csum_bytes is the same as the original
6060          * csum_bytes then we know we haven't raced with any free()ers
6061          * so we can just reduce our inodes csum bytes and carry on.
6062          */
6063         if (BTRFS_I(inode)->csum_bytes == csum_bytes) {
6064                 calc_csum_metadata_size(inode, num_bytes, 0);
6065         } else {
6066                 u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes;
6067                 u64 bytes;
6068
6069                 /*
6070                  * This is tricky, but first we need to figure out how much we
6071                  * freed from any free-ers that occurred during this
6072                  * reservation, so we reset ->csum_bytes to the csum_bytes
6073                  * before we dropped our lock, and then call the free for the
6074                  * number of bytes that were freed while we were trying our
6075                  * reservation.
6076                  */
6077                 bytes = csum_bytes - BTRFS_I(inode)->csum_bytes;
6078                 BTRFS_I(inode)->csum_bytes = csum_bytes;
6079                 to_free = calc_csum_metadata_size(inode, bytes, 0);
6080
6081
6082                 /*
6083                  * Now we need to see how much we would have freed had we not
6084                  * been making this reservation and our ->csum_bytes were not
6085                  * artificially inflated.
6086                  */
6087                 BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes;
6088                 bytes = csum_bytes - orig_csum_bytes;
6089                 bytes = calc_csum_metadata_size(inode, bytes, 0);
6090
6091                 /*
6092                  * Now reset ->csum_bytes to what it should be.  If bytes is
6093                  * more than to_free then we would have freed more space had we
6094                  * not had an artificially high ->csum_bytes, so we need to free
6095                  * the remainder.  If bytes is the same or less then we don't
6096                  * need to do anything, the other free-ers did the correct
6097                  * thing.
6098                  */
6099                 BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes;
6100                 if (bytes > to_free)
6101                         to_free = bytes - to_free;
6102                 else
6103                         to_free = 0;
6104         }
6105         spin_unlock(&BTRFS_I(inode)->lock);
6106         if (dropped)
6107                 to_free += btrfs_calc_trans_metadata_size(root, dropped);
6108
6109         if (to_free) {
6110                 btrfs_block_rsv_release(root, block_rsv, to_free);
6111                 trace_btrfs_space_reservation(root->fs_info, "delalloc",
6112                                               btrfs_ino(inode), to_free, 0);
6113         }
6114         if (delalloc_lock)
6115                 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
6116         return ret;
6117 }
6118
6119 /**
6120  * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
6121  * @inode: the inode to release the reservation for
6122  * @num_bytes: the number of bytes we're releasing
6123  *
6124  * This will release the metadata reservation for an inode.  This can be called
6125  * once we complete IO for a given set of bytes to release their metadata
6126  * reservations.
6127  */
6128 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
6129 {
6130         struct btrfs_root *root = BTRFS_I(inode)->root;
6131         u64 to_free = 0;
6132         unsigned dropped;
6133
6134         num_bytes = ALIGN(num_bytes, root->sectorsize);
6135         spin_lock(&BTRFS_I(inode)->lock);
6136         dropped = drop_outstanding_extent(inode, num_bytes);
6137
6138         if (num_bytes)
6139                 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
6140         spin_unlock(&BTRFS_I(inode)->lock);
6141         if (dropped > 0)
6142                 to_free += btrfs_calc_trans_metadata_size(root, dropped);
6143
6144         if (btrfs_is_testing(root->fs_info))
6145                 return;
6146
6147         trace_btrfs_space_reservation(root->fs_info, "delalloc",
6148                                       btrfs_ino(inode), to_free, 0);
6149
6150         btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
6151                                 to_free);
6152 }
6153
6154 /**
6155  * btrfs_delalloc_reserve_space - reserve data and metadata space for
6156  * delalloc
6157  * @inode: inode we're writing to
6158  * @start: start range we are writing to
6159  * @len: how long the range we are writing to
6160  *
6161  * TODO: This function will finally replace old btrfs_delalloc_reserve_space()
6162  *
6163  * This will do the following things
6164  *
6165  * o reserve space in data space info for num bytes
6166  *   and reserve precious corresponding qgroup space
6167  *   (Done in check_data_free_space)
6168  *
6169  * o reserve space for metadata space, based on the number of outstanding
6170  *   extents and how much csums will be needed
6171  *   also reserve metadata space in a per root over-reserve method.
6172  * o add to the inodes->delalloc_bytes
6173  * o add it to the fs_info's delalloc inodes list.
6174  *   (Above 3 all done in delalloc_reserve_metadata)
6175  *
6176  * Return 0 for success
6177  * Return <0 for error(-ENOSPC or -EQUOT)
6178  */
6179 int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
6180 {
6181         int ret;
6182
6183         ret = btrfs_check_data_free_space(inode, start, len);
6184         if (ret < 0)
6185                 return ret;
6186         ret = btrfs_delalloc_reserve_metadata(inode, len);
6187         if (ret < 0)
6188                 btrfs_free_reserved_data_space(inode, start, len);
6189         return ret;
6190 }
6191
6192 /**
6193  * btrfs_delalloc_release_space - release data and metadata space for delalloc
6194  * @inode: inode we're releasing space for
6195  * @start: start position of the space already reserved
6196  * @len: the len of the space already reserved
6197  *
6198  * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
6199  * called in the case that we don't need the metadata AND data reservations
6200  * anymore.  So if there is an error or we insert an inline extent.
6201  *
6202  * This function will release the metadata space that was not used and will
6203  * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
6204  * list if there are no delalloc bytes left.
6205  * Also it will handle the qgroup reserved space.
6206  */
6207 void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len)
6208 {
6209         btrfs_delalloc_release_metadata(inode, len);
6210         btrfs_free_reserved_data_space(inode, start, len);
6211 }
6212
6213 static int update_block_group(struct btrfs_trans_handle *trans,
6214                               struct btrfs_root *root, u64 bytenr,
6215                               u64 num_bytes, int alloc)
6216 {
6217         struct btrfs_block_group_cache *cache = NULL;
6218         struct btrfs_fs_info *info = root->fs_info;
6219         u64 total = num_bytes;
6220         u64 old_val;
6221         u64 byte_in_group;
6222         int factor;
6223
6224         /* block accounting for super block */
6225         spin_lock(&info->delalloc_root_lock);
6226         old_val = btrfs_super_bytes_used(info->super_copy);
6227         if (alloc)
6228                 old_val += num_bytes;
6229         else
6230                 old_val -= num_bytes;
6231         btrfs_set_super_bytes_used(info->super_copy, old_val);
6232         spin_unlock(&info->delalloc_root_lock);
6233
6234         while (total) {
6235                 cache = btrfs_lookup_block_group(info, bytenr);
6236                 if (!cache)
6237                         return -ENOENT;
6238                 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
6239                                     BTRFS_BLOCK_GROUP_RAID1 |
6240                                     BTRFS_BLOCK_GROUP_RAID10))
6241                         factor = 2;
6242                 else
6243                         factor = 1;
6244                 /*
6245                  * If this block group has free space cache written out, we
6246                  * need to make sure to load it if we are removing space.  This
6247                  * is because we need the unpinning stage to actually add the
6248                  * space back to the block group, otherwise we will leak space.
6249                  */
6250                 if (!alloc && cache->cached == BTRFS_CACHE_NO)
6251                         cache_block_group(cache, 1);
6252
6253                 byte_in_group = bytenr - cache->key.objectid;
6254                 WARN_ON(byte_in_group > cache->key.offset);
6255
6256                 spin_lock(&cache->space_info->lock);
6257                 spin_lock(&cache->lock);
6258
6259                 if (btrfs_test_opt(root->fs_info, SPACE_CACHE) &&
6260                     cache->disk_cache_state < BTRFS_DC_CLEAR)
6261                         cache->disk_cache_state = BTRFS_DC_CLEAR;
6262
6263                 old_val = btrfs_block_group_used(&cache->item);
6264                 num_bytes = min(total, cache->key.offset - byte_in_group);
6265                 if (alloc) {
6266                         old_val += num_bytes;
6267                         btrfs_set_block_group_used(&cache->item, old_val);
6268                         cache->reserved -= num_bytes;
6269                         cache->space_info->bytes_reserved -= num_bytes;
6270                         cache->space_info->bytes_used += num_bytes;
6271                         cache->space_info->disk_used += num_bytes * factor;
6272                         spin_unlock(&cache->lock);
6273                         spin_unlock(&cache->space_info->lock);
6274                 } else {
6275                         old_val -= num_bytes;
6276                         btrfs_set_block_group_used(&cache->item, old_val);
6277                         cache->pinned += num_bytes;
6278                         cache->space_info->bytes_pinned += num_bytes;
6279                         cache->space_info->bytes_used -= num_bytes;
6280                         cache->space_info->disk_used -= num_bytes * factor;
6281                         spin_unlock(&cache->lock);
6282                         spin_unlock(&cache->space_info->lock);
6283
6284                         trace_btrfs_space_reservation(root->fs_info, "pinned",
6285                                                       cache->space_info->flags,
6286                                                       num_bytes, 1);
6287                         set_extent_dirty(info->pinned_extents,
6288                                          bytenr, bytenr + num_bytes - 1,
6289                                          GFP_NOFS | __GFP_NOFAIL);
6290                 }
6291
6292                 spin_lock(&trans->transaction->dirty_bgs_lock);
6293                 if (list_empty(&cache->dirty_list)) {
6294                         list_add_tail(&cache->dirty_list,
6295                                       &trans->transaction->dirty_bgs);
6296                                 trans->transaction->num_dirty_bgs++;
6297                         btrfs_get_block_group(cache);
6298                 }
6299                 spin_unlock(&trans->transaction->dirty_bgs_lock);
6300
6301                 /*
6302                  * No longer have used bytes in this block group, queue it for
6303                  * deletion. We do this after adding the block group to the
6304                  * dirty list to avoid races between cleaner kthread and space
6305                  * cache writeout.
6306                  */
6307                 if (!alloc && old_val == 0) {
6308                         spin_lock(&info->unused_bgs_lock);
6309                         if (list_empty(&cache->bg_list)) {
6310                                 btrfs_get_block_group(cache);
6311                                 list_add_tail(&cache->bg_list,
6312                                               &info->unused_bgs);
6313                         }
6314                         spin_unlock(&info->unused_bgs_lock);
6315                 }
6316
6317                 btrfs_put_block_group(cache);
6318                 total -= num_bytes;
6319                 bytenr += num_bytes;
6320         }
6321         return 0;
6322 }
6323
6324 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
6325 {
6326         struct btrfs_block_group_cache *cache;
6327         u64 bytenr;
6328
6329         spin_lock(&root->fs_info->block_group_cache_lock);
6330         bytenr = root->fs_info->first_logical_byte;
6331         spin_unlock(&root->fs_info->block_group_cache_lock);
6332
6333         if (bytenr < (u64)-1)
6334                 return bytenr;
6335
6336         cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
6337         if (!cache)
6338                 return 0;
6339
6340         bytenr = cache->key.objectid;
6341         btrfs_put_block_group(cache);
6342
6343         return bytenr;
6344 }
6345
6346 static int pin_down_extent(struct btrfs_root *root,
6347                            struct btrfs_block_group_cache *cache,
6348                            u64 bytenr, u64 num_bytes, int reserved)
6349 {
6350         spin_lock(&cache->space_info->lock);
6351         spin_lock(&cache->lock);
6352         cache->pinned += num_bytes;
6353         cache->space_info->bytes_pinned += num_bytes;
6354         if (reserved) {
6355                 cache->reserved -= num_bytes;
6356                 cache->space_info->bytes_reserved -= num_bytes;
6357         }
6358         spin_unlock(&cache->lock);
6359         spin_unlock(&cache->space_info->lock);
6360
6361         trace_btrfs_space_reservation(root->fs_info, "pinned",
6362                                       cache->space_info->flags, num_bytes, 1);
6363         set_extent_dirty(root->fs_info->pinned_extents, bytenr,
6364                          bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
6365         return 0;
6366 }
6367
6368 /*
6369  * this function must be called within transaction
6370  */
6371 int btrfs_pin_extent(struct btrfs_root *root,
6372                      u64 bytenr, u64 num_bytes, int reserved)
6373 {
6374         struct btrfs_block_group_cache *cache;
6375
6376         cache = btrfs_lookup_block_group(root->fs_info, bytenr);
6377         BUG_ON(!cache); /* Logic error */
6378
6379         pin_down_extent(root, cache, bytenr, num_bytes, reserved);
6380
6381         btrfs_put_block_group(cache);
6382         return 0;
6383 }
6384
6385 /*
6386  * this function must be called within transaction
6387  */
6388 int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
6389                                     u64 bytenr, u64 num_bytes)
6390 {
6391         struct btrfs_block_group_cache *cache;
6392         int ret;
6393
6394         cache = btrfs_lookup_block_group(root->fs_info, bytenr);
6395         if (!cache)
6396                 return -EINVAL;
6397
6398         /*
6399          * pull in the free space cache (if any) so that our pin
6400          * removes the free space from the cache.  We have load_only set
6401          * to one because the slow code to read in the free extents does check
6402          * the pinned extents.
6403          */
6404         cache_block_group(cache, 1);
6405
6406         pin_down_extent(root, cache, bytenr, num_bytes, 0);
6407
6408         /* remove us from the free space cache (if we're there at all) */
6409         ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
6410         btrfs_put_block_group(cache);
6411         return ret;
6412 }
6413
6414 static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes)
6415 {
6416         int ret;
6417         struct btrfs_block_group_cache *block_group;
6418         struct btrfs_caching_control *caching_ctl;
6419
6420         block_group = btrfs_lookup_block_group(root->fs_info, start);
6421         if (!block_group)
6422                 return -EINVAL;
6423
6424         cache_block_group(block_group, 0);
6425         caching_ctl = get_caching_control(block_group);
6426
6427         if (!caching_ctl) {
6428                 /* Logic error */
6429                 BUG_ON(!block_group_cache_done(block_group));
6430                 ret = btrfs_remove_free_space(block_group, start, num_bytes);
6431         } else {
6432                 mutex_lock(&caching_ctl->mutex);
6433
6434                 if (start >= caching_ctl->progress) {
6435                         ret = add_excluded_extent(root, start, num_bytes);
6436                 } else if (start + num_bytes <= caching_ctl->progress) {
6437                         ret = btrfs_remove_free_space(block_group,
6438                                                       start, num_bytes);
6439                 } else {
6440                         num_bytes = caching_ctl->progress - start;
6441                         ret = btrfs_remove_free_space(block_group,
6442                                                       start, num_bytes);
6443                         if (ret)
6444                                 goto out_lock;
6445
6446                         num_bytes = (start + num_bytes) -
6447                                 caching_ctl->progress;
6448                         start = caching_ctl->progress;
6449                         ret = add_excluded_extent(root, start, num_bytes);
6450                 }
6451 out_lock:
6452                 mutex_unlock(&caching_ctl->mutex);
6453                 put_caching_control(caching_ctl);
6454         }
6455         btrfs_put_block_group(block_group);
6456         return ret;
6457 }
6458
6459 int btrfs_exclude_logged_extents(struct btrfs_root *log,
6460                                  struct extent_buffer *eb)
6461 {
6462         struct btrfs_file_extent_item *item;
6463         struct btrfs_key key;
6464         int found_type;
6465         int i;
6466
6467         if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS))
6468                 return 0;
6469
6470         for (i = 0; i < btrfs_header_nritems(eb); i++) {
6471                 btrfs_item_key_to_cpu(eb, &key, i);
6472                 if (key.type != BTRFS_EXTENT_DATA_KEY)
6473                         continue;
6474                 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
6475                 found_type = btrfs_file_extent_type(eb, item);
6476                 if (found_type == BTRFS_FILE_EXTENT_INLINE)
6477                         continue;
6478                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
6479                         continue;
6480                 key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
6481                 key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
6482                 __exclude_logged_extent(log, key.objectid, key.offset);
6483         }
6484
6485         return 0;
6486 }
6487
6488 static void
6489 btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
6490 {
6491         atomic_inc(&bg->reservations);
6492 }
6493
6494 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
6495                                         const u64 start)
6496 {
6497         struct btrfs_block_group_cache *bg;
6498
6499         bg = btrfs_lookup_block_group(fs_info, start);
6500         ASSERT(bg);
6501         if (atomic_dec_and_test(&bg->reservations))
6502                 wake_up_atomic_t(&bg->reservations);
6503         btrfs_put_block_group(bg);
6504 }
6505
6506 static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a)
6507 {
6508         schedule();
6509         return 0;
6510 }
6511
6512 void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
6513 {
6514         struct btrfs_space_info *space_info = bg->space_info;
6515
6516         ASSERT(bg->ro);
6517
6518         if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
6519                 return;
6520
6521         /*
6522          * Our block group is read only but before we set it to read only,
6523          * some task might have had allocated an extent from it already, but it
6524          * has not yet created a respective ordered extent (and added it to a
6525          * root's list of ordered extents).
6526          * Therefore wait for any task currently allocating extents, since the
6527          * block group's reservations counter is incremented while a read lock
6528          * on the groups' semaphore is held and decremented after releasing
6529          * the read access on that semaphore and creating the ordered extent.
6530          */
6531         down_write(&space_info->groups_sem);
6532         up_write(&space_info->groups_sem);
6533
6534         wait_on_atomic_t(&bg->reservations,
6535                          btrfs_wait_bg_reservations_atomic_t,
6536                          TASK_UNINTERRUPTIBLE);
6537 }
6538
6539 /**
6540  * btrfs_add_reserved_bytes - update the block_group and space info counters
6541  * @cache:      The cache we are manipulating
6542  * @ram_bytes:  The number of bytes of file content, and will be same to
6543  *              @num_bytes except for the compress path.
6544  * @num_bytes:  The number of bytes in question
6545  * @delalloc:   The blocks are allocated for the delalloc write
6546  *
6547  * This is called by the allocator when it reserves space. Metadata
6548  * reservations should be called with RESERVE_ALLOC so we do the proper
6549  * ENOSPC accounting.  For data we handle the reservation through clearing the
6550  * delalloc bits in the io_tree.  We have to do this since we could end up
6551  * allocating less disk space for the amount of data we have reserved in the
6552  * case of compression.
6553  *
6554  * If this is a reservation and the block group has become read only we cannot
6555  * make the reservation and return -EAGAIN, otherwise this function always
6556  * succeeds.
6557  */
6558 static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
6559                                     u64 ram_bytes, u64 num_bytes, int delalloc)
6560 {
6561         struct btrfs_space_info *space_info = cache->space_info;
6562         int ret = 0;
6563
6564         spin_lock(&space_info->lock);
6565         spin_lock(&cache->lock);
6566         if (cache->ro) {
6567                 ret = -EAGAIN;
6568         } else {
6569                 cache->reserved += num_bytes;
6570                 space_info->bytes_reserved += num_bytes;
6571
6572                 trace_btrfs_space_reservation(cache->fs_info,
6573                                 "space_info", space_info->flags,
6574                                 ram_bytes, 0);
6575                 space_info->bytes_may_use -= ram_bytes;
6576                 if (delalloc)
6577                         cache->delalloc_bytes += num_bytes;
6578         }
6579         spin_unlock(&cache->lock);
6580         spin_unlock(&space_info->lock);
6581         return ret;
6582 }
6583
6584 /**
6585  * btrfs_free_reserved_bytes - update the block_group and space info counters
6586  * @cache:      The cache we are manipulating
6587  * @num_bytes:  The number of bytes in question
6588  * @delalloc:   The blocks are allocated for the delalloc write
6589  *
6590  * This is called by somebody who is freeing space that was never actually used
6591  * on disk.  For example if you reserve some space for a new leaf in transaction
6592  * A and before transaction A commits you free that leaf, you call this with
6593  * reserve set to 0 in order to clear the reservation.
6594  */
6595
6596 static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
6597                                      u64 num_bytes, int delalloc)
6598 {
6599         struct btrfs_space_info *space_info = cache->space_info;
6600         int ret = 0;
6601
6602         spin_lock(&space_info->lock);
6603         spin_lock(&cache->lock);
6604         if (cache->ro)
6605                 space_info->bytes_readonly += num_bytes;
6606         cache->reserved -= num_bytes;
6607         space_info->bytes_reserved -= num_bytes;
6608
6609         if (delalloc)
6610                 cache->delalloc_bytes -= num_bytes;
6611         spin_unlock(&cache->lock);
6612         spin_unlock(&space_info->lock);
6613         return ret;
6614 }
6615 void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
6616                                 struct btrfs_root *root)
6617 {
6618         struct btrfs_fs_info *fs_info = root->fs_info;
6619         struct btrfs_caching_control *next;
6620         struct btrfs_caching_control *caching_ctl;
6621         struct btrfs_block_group_cache *cache;
6622
6623         down_write(&fs_info->commit_root_sem);
6624
6625         list_for_each_entry_safe(caching_ctl, next,
6626                                  &fs_info->caching_block_groups, list) {
6627                 cache = caching_ctl->block_group;
6628                 if (block_group_cache_done(cache)) {
6629                         cache->last_byte_to_unpin = (u64)-1;
6630                         list_del_init(&caching_ctl->list);
6631                         put_caching_control(caching_ctl);
6632                 } else {
6633                         cache->last_byte_to_unpin = caching_ctl->progress;
6634                 }
6635         }
6636
6637         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6638                 fs_info->pinned_extents = &fs_info->freed_extents[1];
6639         else
6640                 fs_info->pinned_extents = &fs_info->freed_extents[0];
6641
6642         up_write(&fs_info->commit_root_sem);
6643
6644         update_global_block_rsv(fs_info);
6645 }
6646
6647 /*
6648  * Returns the free cluster for the given space info and sets empty_cluster to
6649  * what it should be based on the mount options.
6650  */
6651 static struct btrfs_free_cluster *
6652 fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info,
6653                    u64 *empty_cluster)
6654 {
6655         struct btrfs_free_cluster *ret = NULL;
6656         bool ssd = btrfs_test_opt(root->fs_info, SSD);
6657
6658         *empty_cluster = 0;
6659         if (btrfs_mixed_space_info(space_info))
6660                 return ret;
6661
6662         if (ssd)
6663                 *empty_cluster = SZ_2M;
6664         if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
6665                 ret = &root->fs_info->meta_alloc_cluster;
6666                 if (!ssd)
6667                         *empty_cluster = SZ_64K;
6668         } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) {
6669                 ret = &root->fs_info->data_alloc_cluster;
6670         }
6671
6672         return ret;
6673 }
6674
6675 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
6676                               const bool return_free_space)
6677 {
6678         struct btrfs_fs_info *fs_info = root->fs_info;
6679         struct btrfs_block_group_cache *cache = NULL;
6680         struct btrfs_space_info *space_info;
6681         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
6682         struct btrfs_free_cluster *cluster = NULL;
6683         u64 len;
6684         u64 total_unpinned = 0;
6685         u64 empty_cluster = 0;
6686         bool readonly;
6687
6688         while (start <= end) {
6689                 readonly = false;
6690                 if (!cache ||
6691                     start >= cache->key.objectid + cache->key.offset) {
6692                         if (cache)
6693                                 btrfs_put_block_group(cache);
6694                         total_unpinned = 0;
6695                         cache = btrfs_lookup_block_group(fs_info, start);
6696                         BUG_ON(!cache); /* Logic error */
6697
6698                         cluster = fetch_cluster_info(root,
6699                                                      cache->space_info,
6700                                                      &empty_cluster);
6701                         empty_cluster <<= 1;
6702                 }
6703
6704                 len = cache->key.objectid + cache->key.offset - start;
6705                 len = min(len, end + 1 - start);
6706
6707                 if (start < cache->last_byte_to_unpin) {
6708                         len = min(len, cache->last_byte_to_unpin - start);
6709                         if (return_free_space)
6710                                 btrfs_add_free_space(cache, start, len);
6711                 }
6712
6713                 start += len;
6714                 total_unpinned += len;
6715                 space_info = cache->space_info;
6716
6717                 /*
6718                  * If this space cluster has been marked as fragmented and we've
6719                  * unpinned enough in this block group to potentially allow a
6720                  * cluster to be created inside of it go ahead and clear the
6721                  * fragmented check.
6722                  */
6723                 if (cluster && cluster->fragmented &&
6724                     total_unpinned > empty_cluster) {
6725                         spin_lock(&cluster->lock);
6726                         cluster->fragmented = 0;
6727                         spin_unlock(&cluster->lock);
6728                 }
6729
6730                 spin_lock(&space_info->lock);
6731                 spin_lock(&cache->lock);
6732                 cache->pinned -= len;
6733                 space_info->bytes_pinned -= len;
6734
6735                 trace_btrfs_space_reservation(fs_info, "pinned",
6736                                               space_info->flags, len, 0);
6737                 space_info->max_extent_size = 0;
6738                 percpu_counter_add(&space_info->total_bytes_pinned, -len);
6739                 if (cache->ro) {
6740                         space_info->bytes_readonly += len;
6741                         readonly = true;
6742                 }
6743                 spin_unlock(&cache->lock);
6744                 if (!readonly && return_free_space &&
6745                     global_rsv->space_info == space_info) {
6746                         u64 to_add = len;
6747                         WARN_ON(!return_free_space);
6748                         spin_lock(&global_rsv->lock);
6749                         if (!global_rsv->full) {
6750                                 to_add = min(len, global_rsv->size -
6751                                              global_rsv->reserved);
6752                                 global_rsv->reserved += to_add;
6753                                 space_info->bytes_may_use += to_add;
6754                                 if (global_rsv->reserved >= global_rsv->size)
6755                                         global_rsv->full = 1;
6756                                 trace_btrfs_space_reservation(fs_info,
6757                                                               "space_info",
6758                                                               space_info->flags,
6759                                                               to_add, 1);
6760                                 len -= to_add;
6761                         }
6762                         spin_unlock(&global_rsv->lock);
6763                         /* Add to any tickets we may have */
6764                         if (len)
6765                                 space_info_add_new_bytes(fs_info, space_info,
6766                                                          len);
6767                 }
6768                 spin_unlock(&space_info->lock);
6769         }
6770
6771         if (cache)
6772                 btrfs_put_block_group(cache);
6773         return 0;
6774 }
6775
6776 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
6777                                struct btrfs_root *root)
6778 {
6779         struct btrfs_fs_info *fs_info = root->fs_info;
6780         struct btrfs_block_group_cache *block_group, *tmp;
6781         struct list_head *deleted_bgs;
6782         struct extent_io_tree *unpin;
6783         u64 start;
6784         u64 end;
6785         int ret;
6786
6787         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6788                 unpin = &fs_info->freed_extents[1];
6789         else
6790                 unpin = &fs_info->freed_extents[0];
6791
6792         while (!trans->aborted) {
6793                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
6794                 ret = find_first_extent_bit(unpin, 0, &start, &end,
6795                                             EXTENT_DIRTY, NULL);
6796                 if (ret) {
6797                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6798                         break;
6799                 }
6800
6801                 if (btrfs_test_opt(root->fs_info, DISCARD))
6802                         ret = btrfs_discard_extent(root, start,
6803                                                    end + 1 - start, NULL);
6804
6805                 clear_extent_dirty(unpin, start, end);
6806                 unpin_extent_range(root, start, end, true);
6807                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6808                 cond_resched();
6809         }
6810
6811         /*
6812          * Transaction is finished.  We don't need the lock anymore.  We
6813          * do need to clean up the block groups in case of a transaction
6814          * abort.
6815          */
6816         deleted_bgs = &trans->transaction->deleted_bgs;
6817         list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
6818                 u64 trimmed = 0;
6819
6820                 ret = -EROFS;
6821                 if (!trans->aborted)
6822                         ret = btrfs_discard_extent(root,
6823                                                    block_group->key.objectid,
6824                                                    block_group->key.offset,
6825                                                    &trimmed);
6826
6827                 list_del_init(&block_group->bg_list);
6828                 btrfs_put_block_group_trimming(block_group);
6829                 btrfs_put_block_group(block_group);
6830
6831                 if (ret) {
6832                         const char *errstr = btrfs_decode_error(ret);
6833                         btrfs_warn(fs_info,
6834                                    "Discard failed while removing blockgroup: errno=%d %s\n",
6835                                    ret, errstr);
6836                 }
6837         }
6838
6839         return 0;
6840 }
6841
6842 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
6843                              u64 owner, u64 root_objectid)
6844 {
6845         struct btrfs_space_info *space_info;
6846         u64 flags;
6847
6848         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
6849                 if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
6850                         flags = BTRFS_BLOCK_GROUP_SYSTEM;
6851                 else
6852                         flags = BTRFS_BLOCK_GROUP_METADATA;
6853         } else {
6854                 flags = BTRFS_BLOCK_GROUP_DATA;
6855         }
6856
6857         space_info = __find_space_info(fs_info, flags);
6858         BUG_ON(!space_info); /* Logic bug */
6859         percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
6860 }
6861
6862
/*
 * Drop @refs_to_drop references from the extent described by @node
 * (node->bytenr / node->num_bytes) in the extent tree.
 *
 * The matching back reference is looked up first, then the extent item
 * itself is located (either a few slots before the backref in the same leaf
 * or via a fresh search).  If references remain afterwards the count is
 * updated and the backref removed.  If this was the last reference the
 * extent item (and a separate backref item, if any) is deleted, checksums
 * are removed for data extents, the range is added to the free space tree
 * and the block group accounting is updated.
 *
 * @parent, @root_objectid, @owner_objectid and @owner_offset identify the
 * back reference.  An @owner_objectid below BTRFS_FIRST_FREE_OBJECTID means
 * a tree block, in which case @refs_to_drop must be 1.  @extent_op, when
 * set, is applied to the extent item if references remain.
 *
 * Returns -ENOMEM if no path can be allocated.  Every other failure in this
 * function aborts the transaction and returns the error code.
 */
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                struct btrfs_delayed_ref_node *node, u64 parent,
                                u64 root_objectid, u64 owner_objectid,
                                u64 owner_offset, int refs_to_drop,
                                struct btrfs_delayed_extent_op *extent_op)
{
        struct btrfs_key key;
        struct btrfs_path *path;
        struct btrfs_fs_info *info = root->fs_info;
        struct btrfs_root *extent_root = info->extent_root;
        struct extent_buffer *leaf;
        struct btrfs_extent_item *ei;
        struct btrfs_extent_inline_ref *iref;
        int ret;
        int is_data;
        int extent_slot = 0;
        int found_extent = 0;
        int num_to_del = 1;
        u32 item_size;
        u64 refs;
        u64 bytenr = node->bytenr;
        u64 num_bytes = node->num_bytes;
        int last_ref = 0;
        bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
                                                 SKINNY_METADATA);

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        path->reada = READA_FORWARD;
        path->leave_spinning = 1;

        is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
        BUG_ON(!is_data && refs_to_drop != 1);

        /* Skinny metadata items are only considered for tree blocks. */
        if (is_data)
                skinny_metadata = 0;

        ret = lookup_extent_backref(trans, extent_root, path, &iref,
                                    bytenr, num_bytes, parent,
                                    root_objectid, owner_objectid,
                                    owner_offset);
        if (ret == 0) {
                /*
                 * Walk backwards from the backref slot looking for the
                 * extent item of this bytenr; give up after a few slots.
                 */
                extent_slot = path->slots[0];
                while (extent_slot >= 0) {
                        btrfs_item_key_to_cpu(path->nodes[0], &key,
                                              extent_slot);
                        if (key.objectid != bytenr)
                                break;
                        if (key.type == BTRFS_EXTENT_ITEM_KEY &&
                            key.offset == num_bytes) {
                                found_extent = 1;
                                break;
                        }
                        if (key.type == BTRFS_METADATA_ITEM_KEY &&
                            key.offset == owner_objectid) {
                                found_extent = 1;
                                break;
                        }
                        if (path->slots[0] - extent_slot > 5)
                                break;
                        extent_slot--;
                }
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
                item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
                if (found_extent && item_size < sizeof(*ei))
                        found_extent = 0;
#endif
                if (!found_extent) {
                        /*
                         * The extent item is not nearby: remove the backref
                         * now, then search for the extent item separately.
                         */
                        BUG_ON(iref);
                        ret = remove_extent_backref(trans, extent_root, path,
                                                    NULL, refs_to_drop,
                                                    is_data, &last_ref);
                        if (ret) {
                                btrfs_abort_transaction(trans, ret);
                                goto out;
                        }
                        btrfs_release_path(path);
                        path->leave_spinning = 1;

                        key.objectid = bytenr;
                        key.type = BTRFS_EXTENT_ITEM_KEY;
                        key.offset = num_bytes;

                        if (!is_data && skinny_metadata) {
                                key.type = BTRFS_METADATA_ITEM_KEY;
                                key.offset = owner_objectid;
                        }

                        ret = btrfs_search_slot(trans, extent_root,
                                                &key, path, -1, 1);
                        if (ret > 0 && skinny_metadata && path->slots[0]) {
                                /*
                                 * Couldn't find our skinny metadata item,
                                 * see if we have ye olde extent item.
                                 */
                                path->slots[0]--;
                                btrfs_item_key_to_cpu(path->nodes[0], &key,
                                                      path->slots[0]);
                                if (key.objectid == bytenr &&
                                    key.type == BTRFS_EXTENT_ITEM_KEY &&
                                    key.offset == num_bytes)
                                        ret = 0;
                        }

                        /* Still not found: retry with a plain extent item key. */
                        if (ret > 0 && skinny_metadata) {
                                skinny_metadata = false;
                                key.objectid = bytenr;
                                key.type = BTRFS_EXTENT_ITEM_KEY;
                                key.offset = num_bytes;
                                btrfs_release_path(path);
                                ret = btrfs_search_slot(trans, extent_root,
                                                        &key, path, -1, 1);
                        }

                        if (ret) {
                                btrfs_err(info, "umm, got %d back from search, was looking for %llu",
                                        ret, bytenr);
                                if (ret > 0)
                                        btrfs_print_leaf(extent_root,
                                                         path->nodes[0]);
                        }
                        /* NOTE(review): only ret < 0 bails out here; ret > 0 falls through. */
                        if (ret < 0) {
                                btrfs_abort_transaction(trans, ret);
                                goto out;
                        }
                        extent_slot = path->slots[0];
                }
        } else if (WARN_ON(ret == -ENOENT)) {
                btrfs_print_leaf(extent_root, path->nodes[0]);
                btrfs_err(info,
                        "unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
                        bytenr, parent, root_objectid, owner_objectid,
                        owner_offset);
                btrfs_abort_transaction(trans, ret);
                goto out;
        } else {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        leaf = path->nodes[0];
        item_size = btrfs_item_size_nr(leaf, extent_slot);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
        /* Item smaller than the current extent item: convert it, then look it up again. */
        if (item_size < sizeof(*ei)) {
                BUG_ON(found_extent || extent_slot != path->slots[0]);
                ret = convert_extent_item_v0(trans, extent_root, path,
                                             owner_objectid, 0);
                if (ret < 0) {
                        btrfs_abort_transaction(trans, ret);
                        goto out;
                }

                btrfs_release_path(path);
                path->leave_spinning = 1;

                key.objectid = bytenr;
                key.type = BTRFS_EXTENT_ITEM_KEY;
                key.offset = num_bytes;

                ret = btrfs_search_slot(trans, extent_root, &key, path,
                                        -1, 1);
                if (ret) {
                        btrfs_err(info, "umm, got %d back from search, was looking for %llu",
                                ret, bytenr);
                        btrfs_print_leaf(extent_root, path->nodes[0]);
                }
                if (ret < 0) {
                        btrfs_abort_transaction(trans, ret);
                        goto out;
                }

                extent_slot = path->slots[0];
                leaf = path->nodes[0];
                item_size = btrfs_item_size_nr(leaf, extent_slot);
        }
#endif
        BUG_ON(item_size < sizeof(*ei));
        ei = btrfs_item_ptr(leaf, extent_slot,
                            struct btrfs_extent_item);
        /*
         * For a tree block stored as a regular extent item, the tree block
         * info follows the extent item; warn if its level does not match
         * owner_objectid.
         */
        if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
            key.type == BTRFS_EXTENT_ITEM_KEY) {
                struct btrfs_tree_block_info *bi;
                BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
                bi = (struct btrfs_tree_block_info *)(ei + 1);
                WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
        }

        refs = btrfs_extent_refs(leaf, ei);
        if (refs < refs_to_drop) {
                btrfs_err(info, "trying to drop %d refs but we only have %Lu "
                          "for bytenr %Lu", refs_to_drop, refs, bytenr);
                ret = -EINVAL;
                btrfs_abort_transaction(trans, ret);
                goto out;
        }
        refs -= refs_to_drop;

        if (refs > 0) {
                /* References remain: keep the extent item, drop the backref. */
                if (extent_op)
                        __run_delayed_extent_op(extent_op, leaf, ei);
                /*
                 * In the case of inline back ref, reference count will
                 * be updated by remove_extent_backref
                 */
                if (iref) {
                        BUG_ON(!found_extent);
                } else {
                        btrfs_set_extent_refs(leaf, ei, refs);
                        btrfs_mark_buffer_dirty(leaf);
                }
                if (found_extent) {
                        ret = remove_extent_backref(trans, extent_root, path,
                                                    iref, refs_to_drop,
                                                    is_data, &last_ref);
                        if (ret) {
                                btrfs_abort_transaction(trans, ret);
                                goto out;
                        }
                }
                /* The extent stays allocated: subtract num_bytes from the pinned total. */
                add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid,
                                 root_objectid);
        } else {
                /* Last reference: delete the extent item (plus backref item). */
                if (found_extent) {
                        BUG_ON(is_data && refs_to_drop !=
                               extent_data_ref_count(path, iref));
                        if (iref) {
                                BUG_ON(path->slots[0] != extent_slot);
                        } else {
                                /*
                                 * Separate backref item directly after the
                                 * extent item: delete both in one go.
                                 */
                                BUG_ON(path->slots[0] != extent_slot + 1);
                                path->slots[0] = extent_slot;
                                num_to_del = 2;
                        }
                }

                last_ref = 1;
                ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
                                      num_to_del);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        goto out;
                }
                btrfs_release_path(path);

                if (is_data) {
                        ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
                        if (ret) {
                                btrfs_abort_transaction(trans, ret);
                                goto out;
                        }
                }

                ret = add_to_free_space_tree(trans, root->fs_info, bytenr,
                                             num_bytes);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        goto out;
                }

                ret = update_block_group(trans, root, bytenr, num_bytes, 0);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        goto out;
                }
        }
        btrfs_release_path(path);

out:
        btrfs_free_path(path);
        return ret;
}
7136
/*
 * When we free a block, it is possible (and likely) that we free the last
 * delayed ref for that extent as well.  This searches the delayed ref tree for
 * a given extent, and if there are no other delayed refs to be processed, it
 * removes it from the tree.
 *
 * Returns 1 only when the head was removed and had must_insert_reserved set.
 * Returns 0 in every other case: no head for @bytenr, the head still has
 * refs queued, it carries an extent op without must_insert_reserved, its
 * mutex could not be taken without blocking, or it was removed but
 * must_insert_reserved was not set.
 */
static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
                                      struct btrfs_root *root, u64 bytenr)
{
        struct btrfs_delayed_ref_head *head;
        struct btrfs_delayed_ref_root *delayed_refs;
        int ret = 0;

        delayed_refs = &trans->transaction->delayed_refs;
        spin_lock(&delayed_refs->lock);
        head = btrfs_find_delayed_ref_head(trans, bytenr);
        if (!head)
                goto out_delayed_unlock;

        spin_lock(&head->lock);
        /* Other refs are still queued on this head: leave it alone. */
        if (!list_empty(&head->ref_list))
                goto out;

        if (head->extent_op) {
                /* A pending extent op is only discarded when must_insert_reserved is set. */
                if (!head->must_insert_reserved)
                        goto out;
                btrfs_free_delayed_extent_op(head->extent_op);
                head->extent_op = NULL;
        }

        /*
         * waiting for the lock here would deadlock.  If someone else has it
         * locked they are already in the process of dropping it anyway
         */
        if (!mutex_trylock(&head->mutex))
                goto out;

        /*
         * at this point we have a head with no other entries.  Go
         * ahead and process it.
         */
        head->node.in_tree = 0;
        rb_erase(&head->href_node, &delayed_refs->href_root);

        atomic_dec(&delayed_refs->num_entries);

        /*
         * we don't take a ref on the node because we're removing it from the
         * tree, so we just steal the ref the tree was holding.
         */
        delayed_refs->num_heads--;
        if (head->processing == 0)
                delayed_refs->num_heads_ready--;
        head->processing = 0;
        spin_unlock(&head->lock);
        spin_unlock(&delayed_refs->lock);

        BUG_ON(head->extent_op);
        if (head->must_insert_reserved)
                ret = 1;

        mutex_unlock(&head->mutex);
        /* Drop the reference taken over from the tree. */
        btrfs_put_delayed_ref(&head->node);
        return ret;
out:
        spin_unlock(&head->lock);

out_delayed_unlock:
        spin_unlock(&delayed_refs->lock);
        return 0;
}
7208
7209 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
7210                            struct btrfs_root *root,
7211                            struct extent_buffer *buf,
7212                            u64 parent, int last_ref)
7213 {
7214         int pin = 1;
7215         int ret;
7216
7217         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
7218                 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
7219                                         buf->start, buf->len,
7220                                         parent, root->root_key.objectid,
7221                                         btrfs_header_level(buf),
7222                                         BTRFS_DROP_DELAYED_REF, NULL);
7223                 BUG_ON(ret); /* -ENOMEM */
7224         }
7225
7226         if (!last_ref)
7227                 return;
7228
7229         if (btrfs_header_generation(buf) == trans->transid) {
7230                 struct btrfs_block_group_cache *cache;
7231
7232                 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
7233                         ret = check_ref_cleanup(trans, root, buf->start);
7234                         if (!ret)
7235                                 goto out;
7236                 }
7237
7238                 cache = btrfs_lookup_block_group(root->fs_info, buf->start);
7239
7240                 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
7241                         pin_down_extent(root, cache, buf->start, buf->len, 1);
7242                         btrfs_put_block_group(cache);
7243                         goto out;
7244                 }
7245
7246                 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
7247
7248                 btrfs_add_free_space(cache, buf->start, buf->len);
7249                 btrfs_free_reserved_bytes(cache, buf->len, 0);
7250                 btrfs_put_block_group(cache);
7251                 trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
7252                 pin = 0;
7253         }
7254 out:
7255         if (pin)
7256                 add_pinned_bytes(root->fs_info, buf->len,
7257                                  btrfs_header_level(buf),
7258                                  root->root_key.objectid);
7259
7260         /*
7261          * Deleting the buffer, clear the corrupt flag since it doesn't matter
7262          * anymore.
7263          */
7264         clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
7265 }
7266
7267 /* Can return -ENOMEM */
7268 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
7269                       u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
7270                       u64 owner, u64 offset)
7271 {
7272         int ret;
7273         struct btrfs_fs_info *fs_info = root->fs_info;
7274
7275         if (btrfs_is_testing(fs_info))
7276                 return 0;
7277
7278         add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
7279
7280         /*
7281          * tree log blocks never actually go into the extent allocation
7282          * tree, just update pinning info and exit early.
7283          */
7284         if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
7285                 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
7286                 /* unlocks the pinned mutex */
7287                 btrfs_pin_extent(root, bytenr, num_bytes, 1);
7288                 ret = 0;
7289         } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
7290                 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
7291                                         num_bytes,
7292                                         parent, root_objectid, (int)owner,
7293                                         BTRFS_DROP_DELAYED_REF, NULL);
7294         } else {
7295                 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
7296                                                 num_bytes,
7297                                                 parent, root_objectid, owner,
7298                                                 offset, 0,
7299                                                 BTRFS_DROP_DELAYED_REF, NULL);
7300         }
7301         return ret;
7302 }
7303
7304 /*
7305  * when we wait for progress in the block group caching, its because
7306  * our allocation attempt failed at least once.  So, we must sleep
7307  * and let some progress happen before we try again.
7308  *
7309  * This function will sleep at least once waiting for new free space to
7310  * show up, and then it will check the block group free space numbers
7311  * for our min num_bytes.  Another option is to have it go ahead
7312  * and look in the rbtree for a free extent of a given size, but this
7313  * is a good start.
7314  *
7315  * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
7316  * any of the information in this block group.
7317  */
7318 static noinline void
7319 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
7320                                 u64 num_bytes)
7321 {
7322         struct btrfs_caching_control *caching_ctl;
7323
7324         caching_ctl = get_caching_control(cache);
7325         if (!caching_ctl)
7326                 return;
7327
7328         wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
7329                    (cache->free_space_ctl->free_space >= num_bytes));
7330
7331         put_caching_control(caching_ctl);
7332 }
7333
7334 static noinline int
7335 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
7336 {
7337         struct btrfs_caching_control *caching_ctl;
7338         int ret = 0;
7339
7340         caching_ctl = get_caching_control(cache);
7341         if (!caching_ctl)
7342                 return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
7343
7344         wait_event(caching_ctl->wait, block_group_cache_done(cache));
7345         if (cache->cached == BTRFS_CACHE_ERROR)
7346                 ret = -EIO;
7347         put_caching_control(caching_ctl);
7348         return ret;
7349 }
7350
7351 int __get_raid_index(u64 flags)
7352 {
7353         if (flags & BTRFS_BLOCK_GROUP_RAID10)
7354                 return BTRFS_RAID_RAID10;
7355         else if (flags & BTRFS_BLOCK_GROUP_RAID1)
7356                 return BTRFS_RAID_RAID1;
7357         else if (flags & BTRFS_BLOCK_GROUP_DUP)
7358                 return BTRFS_RAID_DUP;
7359         else if (flags & BTRFS_BLOCK_GROUP_RAID0)
7360                 return BTRFS_RAID_RAID0;
7361         else if (flags & BTRFS_BLOCK_GROUP_RAID5)
7362                 return BTRFS_RAID_RAID5;
7363         else if (flags & BTRFS_BLOCK_GROUP_RAID6)
7364                 return BTRFS_RAID_RAID6;
7365
7366         return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
7367 }
7368
/*
 * Return the BTRFS_RAID_* index of a block group, derived from its flags
 * through __get_raid_index().
 */
int get_block_group_index(struct btrfs_block_group_cache *cache)
{
        return __get_raid_index(cache->flags);
}
7373
7374 static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
7375         [BTRFS_RAID_RAID10]     = "raid10",
7376         [BTRFS_RAID_RAID1]      = "raid1",
7377         [BTRFS_RAID_DUP]        = "dup",
7378         [BTRFS_RAID_RAID0]      = "raid0",
7379         [BTRFS_RAID_SINGLE]     = "single",
7380         [BTRFS_RAID_RAID5]      = "raid5",
7381         [BTRFS_RAID_RAID6]      = "raid6",
7382 };
7383
7384 static const char *get_raid_name(enum btrfs_raid_types type)
7385 {
7386         if (type >= BTRFS_NR_RAID_TYPES)
7387                 return NULL;
7388
7389         return btrfs_raid_type_names[type];
7390 }
7391
/*
 * Stages find_free_extent() escalates through while it has not found
 * anything.  The numeric order matters: the stages are compared with
 * <, > and >= and advanced with ++.
 */
enum btrfs_loop_type {
	/* search partially cached block groups, kicking caching as we go */
	LOOP_CACHING_NOWAIT,
	/* search everything, and wait if a block group is still caching */
	LOOP_CACHING_WAIT,
	/* force a chunk allocation and try again */
	LOOP_ALLOC_CHUNK,
	/* set empty_size and empty_cluster to 0 and try again */
	LOOP_NO_EMPTY_SIZE,
};
7398
7399 static inline void
7400 btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
7401                        int delalloc)
7402 {
7403         if (delalloc)
7404                 down_read(&cache->data_rwsem);
7405 }
7406
7407 static inline void
7408 btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
7409                        int delalloc)
7410 {
7411         btrfs_get_block_group(cache);
7412         if (delalloc)
7413                 down_read(&cache->data_rwsem);
7414 }
7415
7416 static struct btrfs_block_group_cache *
7417 btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
7418                    struct btrfs_free_cluster *cluster,
7419                    int delalloc)
7420 {
7421         struct btrfs_block_group_cache *used_bg = NULL;
7422
7423         spin_lock(&cluster->refill_lock);
7424         while (1) {
7425                 used_bg = cluster->block_group;
7426                 if (!used_bg)
7427                         return NULL;
7428
7429                 if (used_bg == block_group)
7430                         return used_bg;
7431
7432                 btrfs_get_block_group(used_bg);
7433
7434                 if (!delalloc)
7435                         return used_bg;
7436
7437                 if (down_read_trylock(&used_bg->data_rwsem))
7438                         return used_bg;
7439
7440                 spin_unlock(&cluster->refill_lock);
7441
7442                 down_read(&used_bg->data_rwsem);
7443
7444                 spin_lock(&cluster->refill_lock);
7445                 if (used_bg == cluster->block_group)
7446                         return used_bg;
7447
7448                 up_read(&used_bg->data_rwsem);
7449                 btrfs_put_block_group(used_bg);
7450         }
7451 }
7452
7453 static inline void
7454 btrfs_release_block_group(struct btrfs_block_group_cache *cache,
7455                          int delalloc)
7456 {
7457         if (delalloc)
7458                 up_read(&cache->data_rwsem);
7459         btrfs_put_block_group(cache);
7460 }
7461
7462 /*
7463  * walks the btree of allocated extents and find a hole of a given size.
7464  * The key ins is changed to record the hole:
7465  * ins->objectid == start position
7466  * ins->type == BTRFS_EXTENT_ITEM_KEY
7467  * ins->offset == the size of the hole.
7468  * Any available blocks before search_start are skipped.
7469  *
7470  * If there is no suitable free space, we will record the max size of
7471  * the free space extent currently.
      *
      * The search iterates the block group lists of the space_info that
      * matches @flags, one raid index at a time, and escalates through the
      * enum btrfs_loop_type stages for as long as nothing has been found.
      *
      * @orig_root:  root the allocation is made for; used for tracing and to
      *              pick the free cluster through fetch_cluster_info().  The
      *              search itself runs against fs_info->extent_root.
      * @ram_bytes:  passed through to btrfs_add_reserved_bytes()
      * @num_bytes:  size of the extent to find
      * @empty_size: extra room wanted on top of @num_bytes; reset to 0 when
      *              the LOOP_NO_EMPTY_SIZE stage is reached
      * @hint_byte:  preferred logical start; replaced by the cluster's
      *              window_start when the cluster has a block group or is
      *              marked fragmented
      * @ins:        output key, filled in as described above
      * @flags:      block group flags the allocation has to satisfy
      * @delalloc:   when non-zero, the data_rwsem of every block group that
      *              is examined is held for reading
      *
      * Returns 0 on success.  Returns -ENOSPC when no extent was found;
      * ins->offset then carries the max extent size that was recorded (0 if
      * none was).  Can also return the error of btrfs_join_transaction() or
      * a non-ENOSPC failure of do_chunk_alloc().
7472  */
7473 static noinline int find_free_extent(struct btrfs_root *orig_root,
7474                                 u64 ram_bytes, u64 num_bytes, u64 empty_size,
7475                                 u64 hint_byte, struct btrfs_key *ins,
7476                                 u64 flags, int delalloc)
7477 {
7478         int ret = 0;
7479         struct btrfs_root *root = orig_root->fs_info->extent_root;
7480         struct btrfs_free_cluster *last_ptr = NULL;
7481         struct btrfs_block_group_cache *block_group = NULL;
7482         u64 search_start = 0;
7483         u64 max_extent_size = 0;
7484         u64 empty_cluster = 0;
7485         struct btrfs_space_info *space_info;
7486         int loop = 0;
7487         int index = __get_raid_index(flags);
7488         bool failed_cluster_refill = false;
7489         bool failed_alloc = false;
7490         bool use_cluster = true;
7491         bool have_caching_bg = false;
7492         bool orig_have_caching_bg = false;
7493         bool full_search = false;
7494
7495         WARN_ON(num_bytes < root->sectorsize);
7496         ins->type = BTRFS_EXTENT_ITEM_KEY;
7497         ins->objectid = 0;
7498         ins->offset = 0;
7499
7500         trace_find_free_extent(orig_root, num_bytes, empty_size, flags);
7501
7502         space_info = __find_space_info(root->fs_info, flags);
7503         if (!space_info) {
7504                 btrfs_err(root->fs_info, "No space info for %llu", flags);
7505                 return -ENOSPC;
7506         }
7507
7508         /*
7509          * If our free space is heavily fragmented we may not be able to make
7510          * big contiguous allocations, so instead of doing the expensive search
7511          * for free space, simply return ENOSPC with our max_extent_size so we
7512          * can go ahead and search for a more manageable chunk.
7513          *
7514          * If our max_extent_size is large enough for our allocation simply
7515          * disable clustering since we will likely not be able to find enough
7516          * space to create a cluster and induce latency trying.
7517          */
7518         if (unlikely(space_info->max_extent_size)) {
7519                 spin_lock(&space_info->lock);
7520                 if (space_info->max_extent_size &&
7521                     num_bytes > space_info->max_extent_size) {
7522                         ins->offset = space_info->max_extent_size;
7523                         spin_unlock(&space_info->lock);
7524                         return -ENOSPC;
7525                 } else if (space_info->max_extent_size) {
7526                         use_cluster = false;
7527                 }
7528                 spin_unlock(&space_info->lock);
7529         }
7530
7531         last_ptr = fetch_cluster_info(orig_root, space_info, &empty_cluster);
7532         if (last_ptr) {
7533                 spin_lock(&last_ptr->lock);
7534                 if (last_ptr->block_group)
7535                         hint_byte = last_ptr->window_start;
7536                 if (last_ptr->fragmented) {
7537                         /*
7538                          * We still set window_start so we can keep track of the
7539                          * last place we found an allocation to try and save
7540                          * some time.
7541                          */
7542                         hint_byte = last_ptr->window_start;
7543                         use_cluster = false;
7544                 }
7545                 spin_unlock(&last_ptr->lock);
7546         }
7547
7548         search_start = max(search_start, first_logical_byte(root, 0));
7549         search_start = max(search_start, hint_byte);
7550         if (search_start == hint_byte) {
7551                 block_group = btrfs_lookup_block_group(root->fs_info,
7552                                                        search_start);
7553                 /*
7554                  * we don't want to use the block group if it doesn't match our
7555                  * allocation bits, or if its not cached.
7556                  *
7557                  * However if we are re-searching with an ideal block group
7558                  * picked out then we don't care that the block group is cached.
7559                  */
7560                 if (block_group && block_group_bits(block_group, flags) &&
7561                     block_group->cached != BTRFS_CACHE_NO) {
7562                         down_read(&space_info->groups_sem);
7563                         if (list_empty(&block_group->list) ||
7564                             block_group->ro) {
7565                                 /*
7566                                  * someone is removing this block group,
7567                                  * we can't jump into the have_block_group
7568                                  * target because our list pointers are not
7569                                  * valid
7570                                  */
7571                                 btrfs_put_block_group(block_group);
7572                                 up_read(&space_info->groups_sem);
7573                         } else {
7574                                 index = get_block_group_index(block_group);
7575                                 btrfs_lock_block_group(block_group, delalloc);
7576                                 goto have_block_group;
7577                         }
7578                 } else if (block_group) {
7579                         btrfs_put_block_group(block_group);
7580                 }
7581         }
     /*
      * One pass over every block group on the list for the current raid
      * index.  Re-entered for the next index and for every later loop stage.
      */
7582 search:
7583         have_caching_bg = false;
7584         if (index == 0 || index == __get_raid_index(flags))
7585                 full_search = true;
7586         down_read(&space_info->groups_sem);
7587         list_for_each_entry(block_group, &space_info->block_groups[index],
7588                             list) {
7589                 u64 offset;
7590                 int cached;
7591
7592                 btrfs_grab_block_group(block_group, delalloc);
7593                 search_start = block_group->key.objectid;
7594
7595                 /*
7596                  * this can happen if we end up cycling through all the
7597                  * raid types, but we want to make sure we only allocate
7598                  * for the proper type.
7599                  */
7600                 if (!block_group_bits(block_group, flags)) {
7601                     u64 extra = BTRFS_BLOCK_GROUP_DUP |
7602                                 BTRFS_BLOCK_GROUP_RAID1 |
7603                                 BTRFS_BLOCK_GROUP_RAID5 |
7604                                 BTRFS_BLOCK_GROUP_RAID6 |
7605                                 BTRFS_BLOCK_GROUP_RAID10;
7606
7607                         /*
7608                          * if they asked for extra copies and this block group
7609                          * doesn't provide them, bail.  This does allow us to
7610                          * fill raid0 from raid1.
7611                          */
7612                         if ((flags & extra) && !(block_group->flags & extra))
7613                                 goto loop;
7614                 }
7615
     /*
      * Besides falling through from the checks above, this label is reached
      * by goto from the hint lookup before the loop (groups_sem is already
      * held there) and again after waiting for caching progress further down.
      */
7616 have_block_group:
7617                 cached = block_group_cache_done(block_group);
7618                 if (unlikely(!cached)) {
7619                         have_caching_bg = true;
7620                         ret = cache_block_group(block_group, 0);
7621                         BUG_ON(ret < 0);
7622                         ret = 0;
7623                 }
7624
7625                 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
7626                         goto loop;
7627                 if (unlikely(block_group->ro))
7628                         goto loop;
7629
7630                 /*
7631                  * Ok we want to try and use the cluster allocator, so
7632                  * lets look there
7633                  */
7634                 if (last_ptr && use_cluster) {
7635                         struct btrfs_block_group_cache *used_block_group;
7636                         unsigned long aligned_cluster;
7637                         /*
7638                          * the refill lock keeps out other
7639                          * people trying to start a new cluster
7640                          */
7641                         used_block_group = btrfs_lock_cluster(block_group,
7642                                                               last_ptr,
7643                                                               delalloc);
7644                         if (!used_block_group)
7645                                 goto refill_cluster;
7646
7647                         if (used_block_group != block_group &&
7648                             (used_block_group->ro ||
7649                              !block_group_bits(used_block_group, flags)))
7650                                 goto release_cluster;
7651
7652                         offset = btrfs_alloc_from_cluster(used_block_group,
7653                                                 last_ptr,
7654                                                 num_bytes,
7655                                                 used_block_group->key.objectid,
7656                                                 &max_extent_size);
7657                         if (offset) {
7658                                 /* we have a block, we're done */
7659                                 spin_unlock(&last_ptr->refill_lock);
7660                                 trace_btrfs_reserve_extent_cluster(root,
7661                                                 used_block_group,
7662                                                 search_start, num_bytes);
7663                                 if (used_block_group != block_group) {
7664                                         btrfs_release_block_group(block_group,
7665                                                                   delalloc);
7666                                         block_group = used_block_group;
7667                                 }
7668                                 goto checks;
7669                         }
7670
7671                         WARN_ON(last_ptr->block_group != used_block_group);
7672 release_cluster:
7673                         /* If we are on LOOP_NO_EMPTY_SIZE, we can't
7674                          * set up a new clusters, so lets just skip it
7675                          * and let the allocator find whatever block
7676                          * it can find.  If we reach this point, we
7677                          * will have tried the cluster allocator
7678                          * plenty of times and not have found
7679                          * anything, so we are likely way too
7680                          * fragmented for the clustering stuff to find
7681                          * anything.
7682                          *
7683                          * However, if the cluster is taken from the
7684                          * current block group, release the cluster
7685                          * first, so that we stand a better chance of
7686                          * succeeding in the unclustered
7687                          * allocation.  */
7688                         if (loop >= LOOP_NO_EMPTY_SIZE &&
7689                             used_block_group != block_group) {
7690                                 spin_unlock(&last_ptr->refill_lock);
7691                                 btrfs_release_block_group(used_block_group,
7692                                                           delalloc);
7693                                 goto unclustered_alloc;
7694                         }
7695
7696                         /*
7697                          * this cluster didn't work out, free it and
7698                          * start over
7699                          */
7700                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
7701
7702                         if (used_block_group != block_group)
7703                                 btrfs_release_block_group(used_block_group,
7704                                                           delalloc);
7705 refill_cluster:
7706                         if (loop >= LOOP_NO_EMPTY_SIZE) {
7707                                 spin_unlock(&last_ptr->refill_lock);
7708                                 goto unclustered_alloc;
7709                         }
7710
7711                         aligned_cluster = max_t(unsigned long,
7712                                                 empty_cluster + empty_size,
7713                                               block_group->full_stripe_len);
7714
7715                         /* allocate a cluster in this block group */
7716                         ret = btrfs_find_space_cluster(root, block_group,
7717                                                        last_ptr, search_start,
7718                                                        num_bytes,
7719                                                        aligned_cluster);
7720                         if (ret == 0) {
7721                                 /*
7722                                  * now pull our allocation out of this
7723                                  * cluster
7724                                  */
7725                                 offset = btrfs_alloc_from_cluster(block_group,
7726                                                         last_ptr,
7727                                                         num_bytes,
7728                                                         search_start,
7729                                                         &max_extent_size);
7730                                 if (offset) {
7731                                         /* we found one, proceed */
7732                                         spin_unlock(&last_ptr->refill_lock);
7733                                         trace_btrfs_reserve_extent_cluster(root,
7734                                                 block_group, search_start,
7735                                                 num_bytes);
7736                                         goto checks;
7737                                 }
7738                         } else if (!cached && loop > LOOP_CACHING_NOWAIT
7739                                    && !failed_cluster_refill) {
7740                                 spin_unlock(&last_ptr->refill_lock);
7741
7742                                 failed_cluster_refill = true;
7743                                 wait_block_group_cache_progress(block_group,
7744                                        num_bytes + empty_cluster + empty_size);
7745                                 goto have_block_group;
7746                         }
7747
7748                         /*
7749                          * at this point we either didn't find a cluster
7750                          * or we weren't able to allocate a block from our
7751                          * cluster.  Free the cluster we've been trying
7752                          * to use, and go to the next block group
7753                          */
7754                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
7755                         spin_unlock(&last_ptr->refill_lock);
7756                         goto loop;
7757                 }
7758
7759 unclustered_alloc:
7760                 /*
7761                  * We are doing an unclustered alloc, set the fragmented flag so
7762                  * we don't bother trying to setup a cluster again until we get
7763                  * more space.
7764                  */
7765                 if (unlikely(last_ptr)) {
7766                         spin_lock(&last_ptr->lock);
7767                         last_ptr->fragmented = 1;
7768                         spin_unlock(&last_ptr->lock);
7769                 }
7770                 spin_lock(&block_group->free_space_ctl->tree_lock);
7771                 if (cached &&
7772                     block_group->free_space_ctl->free_space <
7773                     num_bytes + empty_cluster + empty_size) {
7774                         if (block_group->free_space_ctl->free_space >
7775                             max_extent_size)
7776                                 max_extent_size =
7777                                         block_group->free_space_ctl->free_space;
7778                         spin_unlock(&block_group->free_space_ctl->tree_lock);
7779                         goto loop;
7780                 }
7781                 spin_unlock(&block_group->free_space_ctl->tree_lock);
7782
7783                 offset = btrfs_find_space_for_alloc(block_group, search_start,
7784                                                     num_bytes, empty_size,
7785                                                     &max_extent_size);
7786                 /*
7787                  * If we didn't find a chunk, and we haven't failed on this
7788                  * block group before, and this block group is in the middle of
7789                  * caching and we are ok with waiting, then go ahead and wait
7790                  * for progress to be made, and set failed_alloc to true.
7791                  *
7792                  * If failed_alloc is true then we've already waited on this
7793                  * block group once and should move on to the next block group.
7794                  */
7795                 if (!offset && !failed_alloc && !cached &&
7796                     loop > LOOP_CACHING_NOWAIT) {
7797                         wait_block_group_cache_progress(block_group,
7798                                                 num_bytes + empty_size);
7799                         failed_alloc = true;
7800                         goto have_block_group;
7801                 } else if (!offset) {
7802                         goto loop;
7803                 }
7804 checks:
7805                 search_start = ALIGN(offset, root->stripesize);
7806
7807                 /* move on to the next group */
7808                 if (search_start + num_bytes >
7809                     block_group->key.objectid + block_group->key.offset) {
7810                         btrfs_add_free_space(block_group, offset, num_bytes);
7811                         goto loop;
7812                 }
7813
                  /* hand the bytes skipped by the alignment back as free space */
7814                 if (offset < search_start)
7815                         btrfs_add_free_space(block_group, offset,
7816                                              search_start - offset);
7817                 BUG_ON(offset > search_start);
7818
7819                 ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
7820                                 num_bytes, delalloc);
7821                 if (ret == -EAGAIN) {
7822                         btrfs_add_free_space(block_group, offset, num_bytes);
7823                         goto loop;
7824                 }
7825                 btrfs_inc_block_group_reservations(block_group);
7826
7827                 /* we are all good, lets return */
7828                 ins->objectid = search_start;
7829                 ins->offset = num_bytes;
7830
7831                 trace_btrfs_reserve_extent(orig_root, block_group,
7832                                            search_start, num_bytes);
7833                 btrfs_release_block_group(block_group, delalloc);
7834                 break;
     /*
      * Nothing usable in this block group: reset the per-group retry flags,
      * release the group and let the list iteration move on.
      */
7835 loop:
7836                 failed_cluster_refill = false;
7837                 failed_alloc = false;
7838                 BUG_ON(index != get_block_group_index(block_group));
7839                 btrfs_release_block_group(block_group, delalloc);
7840         }
7841         up_read(&space_info->groups_sem);
7842
7843         if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg
7844                 && !orig_have_caching_bg)
7845                 orig_have_caching_bg = true;
7846
7847         if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
7848                 goto search;
7849
7850         if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
7851                 goto search;
7852
7853         /*
7854          * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
7855          *                      caching kthreads as we move along
7856          * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
7857          * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
7858          * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
7859          *                      again
7860          */
7861         if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
7862                 index = 0;
7863                 if (loop == LOOP_CACHING_NOWAIT) {
7864                         /*
7865                          * We want to skip the LOOP_CACHING_WAIT step if we
7866                          * don't have any uncached bgs and we've already done a
7867                          * full search through.
7868                          */
7869                         if (orig_have_caching_bg || !full_search)
7870                                 loop = LOOP_CACHING_WAIT;
7871                         else
7872                                 loop = LOOP_ALLOC_CHUNK;
7873                 } else {
7874                         loop++;
7875                 }
7876
7877                 if (loop == LOOP_ALLOC_CHUNK) {
7878                         struct btrfs_trans_handle *trans;
7879                         int exist = 0;
7880
                          /*
                           * Reuse the transaction handle found in
                           * current->journal_info if there is one; only a
                           * handle joined here is ended again below.
                           */
7881                         trans = current->journal_info;
7882                         if (trans)
7883                                 exist = 1;
7884                         else
7885                                 trans = btrfs_join_transaction(root);
7886
7887                         if (IS_ERR(trans)) {
7888                                 ret = PTR_ERR(trans);
7889                                 goto out;
7890                         }
7891
7892                         ret = do_chunk_alloc(trans, root, flags,
7893                                              CHUNK_ALLOC_FORCE);
7894
7895                         /*
7896                          * If we can't allocate a new chunk we've already looped
7897                          * through at least once, move on to the NO_EMPTY_SIZE
7898                          * case.
7899                          */
7900                         if (ret == -ENOSPC)
7901                                 loop = LOOP_NO_EMPTY_SIZE;
7902
7903                         /*
7904                          * Do not bail out on ENOSPC since we
7905                          * can do more things.
7906                          */
7907                         if (ret < 0 && ret != -ENOSPC)
7908                                 btrfs_abort_transaction(trans, ret);
7909                         else
7910                                 ret = 0;
7911                         if (!exist)
7912                                 btrfs_end_transaction(trans, root);
7913                         if (ret)
7914                                 goto out;
7915                 }
7916
7917                 if (loop == LOOP_NO_EMPTY_SIZE) {
7918                         /*
7919                          * Don't loop again if we already have no empty_size and
7920                          * no empty_cluster.
7921                          */
7922                         if (empty_size == 0 &&
7923                             empty_cluster == 0) {
7924                                 ret = -ENOSPC;
7925                                 goto out;
7926                         }
7927                         empty_size = 0;
7928                         empty_cluster = 0;
7929                 }
7930
7931                 goto search;
7932         } else if (!ins->objectid) {
7933                 ret = -ENOSPC;
7934         } else if (ins->objectid) {
                  /*
                   * Allocation succeeded without the cluster allocator:
                   * remember where it landed in the cluster's window_start.
                   */
7935                 if (!use_cluster && last_ptr) {
7936                         spin_lock(&last_ptr->lock);
7937                         last_ptr->window_start = ins->objectid;
7938                         spin_unlock(&last_ptr->lock);
7939                 }
7940                 ret = 0;
7941         }
7942 out:
          /* on ENOSPC publish max_extent_size to the space_info and to the caller */
7943         if (ret == -ENOSPC) {
7944                 spin_lock(&space_info->lock);
7945                 space_info->max_extent_size = max_extent_size;
7946                 spin_unlock(&space_info->lock);
7947                 ins->offset = max_extent_size;
7948         }
7949         return ret;
7950 }
7951
7952 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
7953                             int dump_block_groups)
7954 {
7955         struct btrfs_block_group_cache *cache;
7956         int index = 0;
7957
7958         spin_lock(&info->lock);
7959         printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n",
7960                info->flags,
7961                info->total_bytes - info->bytes_used - info->bytes_pinned -
7962                info->bytes_reserved - info->bytes_readonly -
7963                info->bytes_may_use, (info->full) ? "" : "not ");
7964         printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, "
7965                "reserved=%llu, may_use=%llu, readonly=%llu\n",
7966                info->total_bytes, info->bytes_used, info->bytes_pinned,
7967                info->bytes_reserved, info->bytes_may_use,
7968                info->bytes_readonly);
7969         spin_unlock(&info->lock);
7970
7971         if (!dump_block_groups)
7972                 return;
7973
7974         down_read(&info->groups_sem);
7975 again:
7976         list_for_each_entry(cache, &info->block_groups[index], list) {
7977                 spin_lock(&cache->lock);
7978                 printk(KERN_INFO "BTRFS: "
7979                            "block group %llu has %llu bytes, "
7980                            "%llu used %llu pinned %llu reserved %s\n",
7981                        cache->key.objectid, cache->key.offset,
7982                        btrfs_block_group_used(&cache->item), cache->pinned,
7983                        cache->reserved, cache->ro ? "[readonly]" : "");
7984                 btrfs_dump_free_space(cache, bytes);
7985                 spin_unlock(&cache->lock);
7986         }
7987         if (++index < BTRFS_NR_RAID_TYPES)
7988                 goto again;
7989         up_read(&info->groups_sem);
7990 }
7991
7992 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
7993                          u64 num_bytes, u64 min_alloc_size,
7994                          u64 empty_size, u64 hint_byte,
7995                          struct btrfs_key *ins, int is_data, int delalloc)
7996 {
7997         bool final_tried = num_bytes == min_alloc_size;
7998         u64 flags;
7999         int ret;
8000
8001         flags = btrfs_get_alloc_profile(root, is_data);
8002 again:
8003         WARN_ON(num_bytes < root->sectorsize);
8004         ret = find_free_extent(root, ram_bytes, num_bytes, empty_size,
8005                                hint_byte, ins, flags, delalloc);
8006         if (!ret && !is_data) {
8007                 btrfs_dec_block_group_reservations(root->fs_info,
8008                                                    ins->objectid);
8009         } else if (ret == -ENOSPC) {
8010                 if (!final_tried && ins->offset) {
8011                         num_bytes = min(num_bytes >> 1, ins->offset);
8012                         num_bytes = round_down(num_bytes, root->sectorsize);
8013                         num_bytes = max(num_bytes, min_alloc_size);
8014                         ram_bytes = num_bytes;
8015                         if (num_bytes == min_alloc_size)
8016                                 final_tried = true;
8017                         goto again;
8018                 } else if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) {
8019                         struct btrfs_space_info *sinfo;
8020
8021                         sinfo = __find_space_info(root->fs_info, flags);
8022                         btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu",
8023                                 flags, num_bytes);
8024                         if (sinfo)
8025                                 dump_space_info(sinfo, num_bytes, 1);
8026                 }
8027         }
8028
8029         return ret;
8030 }
8031
8032 static int __btrfs_free_reserved_extent(struct btrfs_root *root,
8033                                         u64 start, u64 len,
8034                                         int pin, int delalloc)
8035 {
8036         struct btrfs_block_group_cache *cache;
8037         int ret = 0;
8038
8039         cache = btrfs_lookup_block_group(root->fs_info, start);
8040         if (!cache) {
8041                 btrfs_err(root->fs_info, "Unable to find block group for %llu",
8042                         start);
8043                 return -ENOSPC;
8044         }
8045
8046         if (pin)
8047                 pin_down_extent(root, cache, start, len, 1);
8048         else {
8049                 if (btrfs_test_opt(root->fs_info, DISCARD))
8050                         ret = btrfs_discard_extent(root, start, len, NULL);
8051                 btrfs_add_free_space(cache, start, len);
8052                 btrfs_free_reserved_bytes(cache, len, delalloc);
8053                 trace_btrfs_reserved_extent_free(root, start, len);
8054         }
8055
8056         btrfs_put_block_group(cache);
8057         return ret;
8058 }
8059
8060 int btrfs_free_reserved_extent(struct btrfs_root *root,
8061                                u64 start, u64 len, int delalloc)
8062 {
8063         return __btrfs_free_reserved_extent(root, start, len, 0, delalloc);
8064 }
8065
8066 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
8067                                        u64 start, u64 len)
8068 {
8069         return __btrfs_free_reserved_extent(root, start, len, 1, 0);
8070 }
8071
8072 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
8073                                       struct btrfs_root *root,
8074                                       u64 parent, u64 root_objectid,
8075                                       u64 flags, u64 owner, u64 offset,
8076                                       struct btrfs_key *ins, int ref_mod)
8077 {
8078         int ret;
8079         struct btrfs_fs_info *fs_info = root->fs_info;
8080         struct btrfs_extent_item *extent_item;
8081         struct btrfs_extent_inline_ref *iref;
8082         struct btrfs_path *path;
8083         struct extent_buffer *leaf;
8084         int type;
8085         u32 size;
8086
8087         if (parent > 0)
8088                 type = BTRFS_SHARED_DATA_REF_KEY;
8089         else
8090                 type = BTRFS_EXTENT_DATA_REF_KEY;
8091
8092         size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
8093
8094         path = btrfs_alloc_path();
8095         if (!path)
8096                 return -ENOMEM;
8097
8098         path->leave_spinning = 1;
8099         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
8100                                       ins, size);
8101         if (ret) {
8102                 btrfs_free_path(path);
8103                 return ret;
8104         }
8105
8106         leaf = path->nodes[0];
8107         extent_item = btrfs_item_ptr(leaf, path->slots[0],
8108                                      struct btrfs_extent_item);
8109         btrfs_set_extent_refs(leaf, extent_item, ref_mod);
8110         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
8111         btrfs_set_extent_flags(leaf, extent_item,
8112                                flags | BTRFS_EXTENT_FLAG_DATA);
8113
8114         iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
8115         btrfs_set_extent_inline_ref_type(leaf, iref, type);
8116         if (parent > 0) {
8117                 struct btrfs_shared_data_ref *ref;
8118                 ref = (struct btrfs_shared_data_ref *)(iref + 1);
8119                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
8120                 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
8121         } else {
8122                 struct btrfs_extent_data_ref *ref;
8123                 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
8124                 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
8125                 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
8126                 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
8127                 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
8128         }
8129
8130         btrfs_mark_buffer_dirty(path->nodes[0]);
8131         btrfs_free_path(path);
8132
8133         ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
8134                                           ins->offset);
8135         if (ret)
8136                 return ret;
8137
8138         ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
8139         if (ret) { /* -ENOENT, logic error */
8140                 btrfs_err(fs_info, "update block group failed for %llu %llu",
8141                         ins->objectid, ins->offset);
8142                 BUG();
8143         }
8144         trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
8145         return ret;
8146 }
8147
8148 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
8149                                      struct btrfs_root *root,
8150                                      u64 parent, u64 root_objectid,
8151                                      u64 flags, struct btrfs_disk_key *key,
8152                                      int level, struct btrfs_key *ins)
8153 {
8154         int ret;
8155         struct btrfs_fs_info *fs_info = root->fs_info;
8156         struct btrfs_extent_item *extent_item;
8157         struct btrfs_tree_block_info *block_info;
8158         struct btrfs_extent_inline_ref *iref;
8159         struct btrfs_path *path;
8160         struct extent_buffer *leaf;
8161         u32 size = sizeof(*extent_item) + sizeof(*iref);
8162         u64 num_bytes = ins->offset;
8163         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
8164                                                  SKINNY_METADATA);
8165
8166         if (!skinny_metadata)
8167                 size += sizeof(*block_info);
8168
8169         path = btrfs_alloc_path();
8170         if (!path) {
8171                 btrfs_free_and_pin_reserved_extent(root, ins->objectid,
8172                                                    root->nodesize);
8173                 return -ENOMEM;
8174         }
8175
8176         path->leave_spinning = 1;
8177         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
8178                                       ins, size);
8179         if (ret) {
8180                 btrfs_free_path(path);
8181                 btrfs_free_and_pin_reserved_extent(root, ins->objectid,
8182                                                    root->nodesize);
8183                 return ret;
8184         }
8185
8186         leaf = path->nodes[0];
8187         extent_item = btrfs_item_ptr(leaf, path->slots[0],
8188                                      struct btrfs_extent_item);
8189         btrfs_set_extent_refs(leaf, extent_item, 1);
8190         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
8191         btrfs_set_extent_flags(leaf, extent_item,
8192                                flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
8193
8194         if (skinny_metadata) {
8195                 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
8196                 num_bytes = root->nodesize;
8197         } else {
8198                 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
8199                 btrfs_set_tree_block_key(leaf, block_info, key);
8200                 btrfs_set_tree_block_level(leaf, block_info, level);
8201                 iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
8202         }
8203
8204         if (parent > 0) {
8205                 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
8206                 btrfs_set_extent_inline_ref_type(leaf, iref,
8207                                                  BTRFS_SHARED_BLOCK_REF_KEY);
8208                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
8209         } else {
8210                 btrfs_set_extent_inline_ref_type(leaf, iref,
8211                                                  BTRFS_TREE_BLOCK_REF_KEY);
8212                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
8213         }
8214
8215         btrfs_mark_buffer_dirty(leaf);
8216         btrfs_free_path(path);
8217
8218         ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
8219                                           num_bytes);
8220         if (ret)
8221                 return ret;
8222
8223         ret = update_block_group(trans, root, ins->objectid, root->nodesize,
8224                                  1);
8225         if (ret) { /* -ENOENT, logic error */
8226                 btrfs_err(fs_info, "update block group failed for %llu %llu",
8227                         ins->objectid, ins->offset);
8228                 BUG();
8229         }
8230
8231         trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->nodesize);
8232         return ret;
8233 }
8234
8235 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
8236                                      struct btrfs_root *root,
8237                                      u64 root_objectid, u64 owner,
8238                                      u64 offset, u64 ram_bytes,
8239                                      struct btrfs_key *ins)
8240 {
8241         int ret;
8242
8243         BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
8244
8245         ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
8246                                          ins->offset, 0,
8247                                          root_objectid, owner, offset,
8248                                          ram_bytes, BTRFS_ADD_DELAYED_EXTENT,
8249                                          NULL);
8250         return ret;
8251 }
8252
8253 /*
8254  * this is used by the tree logging recovery code.  It records that
8255  * an extent has been allocated and makes sure to clear the free
8256  * space cache bits as well
8257  */
8258 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
8259                                    struct btrfs_root *root,
8260                                    u64 root_objectid, u64 owner, u64 offset,
8261                                    struct btrfs_key *ins)
8262 {
8263         int ret;
8264         struct btrfs_block_group_cache *block_group;
8265         struct btrfs_space_info *space_info;
8266
8267         /*
8268          * Mixed block groups will exclude before processing the log so we only
8269          * need to do the exclude dance if this fs isn't mixed.
8270          */
8271         if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
8272                 ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
8273                 if (ret)
8274                         return ret;
8275         }
8276
8277         block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
8278         if (!block_group)
8279                 return -EINVAL;
8280
8281         space_info = block_group->space_info;
8282         spin_lock(&space_info->lock);
8283         spin_lock(&block_group->lock);
8284         space_info->bytes_reserved += ins->offset;
8285         block_group->reserved += ins->offset;
8286         spin_unlock(&block_group->lock);
8287         spin_unlock(&space_info->lock);
8288
8289         ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
8290                                          0, owner, offset, ins, 1);
8291         btrfs_put_block_group(block_group);
8292         return ret;
8293 }
8294
8295 static struct extent_buffer *
8296 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
8297                       u64 bytenr, int level)
8298 {
8299         struct extent_buffer *buf;
8300
8301         buf = btrfs_find_create_tree_block(root, bytenr);
8302         if (IS_ERR(buf))
8303                 return buf;
8304
8305         btrfs_set_header_generation(buf, trans->transid);
8306         btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
8307         btrfs_tree_lock(buf);
8308         clean_tree_block(trans, root->fs_info, buf);
8309         clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
8310
8311         btrfs_set_lock_blocking(buf);
8312         set_extent_buffer_uptodate(buf);
8313
8314         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
8315                 buf->log_index = root->log_transid % 2;
8316                 /*
8317                  * we allow two log transactions at a time, use different
8318                  * EXENT bit to differentiate dirty pages.
8319                  */
8320                 if (buf->log_index == 0)
8321                         set_extent_dirty(&root->dirty_log_pages, buf->start,
8322                                         buf->start + buf->len - 1, GFP_NOFS);
8323                 else
8324                         set_extent_new(&root->dirty_log_pages, buf->start,
8325                                         buf->start + buf->len - 1);
8326         } else {
8327                 buf->log_index = -1;
8328                 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
8329                          buf->start + buf->len - 1, GFP_NOFS);
8330         }
8331         trans->dirty = true;
8332         /* this returns a buffer locked for blocking */
8333         return buf;
8334 }
8335
8336 static struct btrfs_block_rsv *
8337 use_block_rsv(struct btrfs_trans_handle *trans,
8338               struct btrfs_root *root, u32 blocksize)
8339 {
8340         struct btrfs_block_rsv *block_rsv;
8341         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
8342         int ret;
8343         bool global_updated = false;
8344
8345         block_rsv = get_block_rsv(trans, root);
8346
8347         if (unlikely(block_rsv->size == 0))
8348                 goto try_reserve;
8349 again:
8350         ret = block_rsv_use_bytes(block_rsv, blocksize);
8351         if (!ret)
8352                 return block_rsv;
8353
8354         if (block_rsv->failfast)
8355                 return ERR_PTR(ret);
8356
8357         if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
8358                 global_updated = true;
8359                 update_global_block_rsv(root->fs_info);
8360                 goto again;
8361         }
8362
8363         if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) {
8364                 static DEFINE_RATELIMIT_STATE(_rs,
8365                                 DEFAULT_RATELIMIT_INTERVAL * 10,
8366                                 /*DEFAULT_RATELIMIT_BURST*/ 1);
8367                 if (__ratelimit(&_rs))
8368                         WARN(1, KERN_DEBUG
8369                                 "BTRFS: block rsv returned %d\n", ret);
8370         }
8371 try_reserve:
8372         ret = reserve_metadata_bytes(root, block_rsv, blocksize,
8373                                      BTRFS_RESERVE_NO_FLUSH);
8374         if (!ret)
8375                 return block_rsv;
8376         /*
8377          * If we couldn't reserve metadata bytes try and use some from
8378          * the global reserve if its space type is the same as the global
8379          * reservation.
8380          */
8381         if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
8382             block_rsv->space_info == global_rsv->space_info) {
8383                 ret = block_rsv_use_bytes(global_rsv, blocksize);
8384                 if (!ret)
8385                         return global_rsv;
8386         }
8387         return ERR_PTR(ret);
8388 }
8389
8390 static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
8391                             struct btrfs_block_rsv *block_rsv, u32 blocksize)
8392 {
8393         block_rsv_add_bytes(block_rsv, blocksize, 0);
8394         block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
8395 }
8396
8397 /*
8398  * finds a free extent and does all the dirty work required for allocation
8399  * returns the tree buffer or an ERR_PTR on error.
8400  */
8401 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
8402                                         struct btrfs_root *root,
8403                                         u64 parent, u64 root_objectid,
8404                                         struct btrfs_disk_key *key, int level,
8405                                         u64 hint, u64 empty_size)
8406 {
8407         struct btrfs_key ins;
8408         struct btrfs_block_rsv *block_rsv;
8409         struct extent_buffer *buf;
8410         struct btrfs_delayed_extent_op *extent_op;
8411         u64 flags = 0;
8412         int ret;
8413         u32 blocksize = root->nodesize;
8414         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
8415                                                  SKINNY_METADATA);
8416
8417 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
8418         if (btrfs_is_testing(root->fs_info)) {
8419                 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
8420                                             level);
8421                 if (!IS_ERR(buf))
8422                         root->alloc_bytenr += blocksize;
8423                 return buf;
8424         }
8425 #endif
8426
8427         block_rsv = use_block_rsv(trans, root, blocksize);
8428         if (IS_ERR(block_rsv))
8429                 return ERR_CAST(block_rsv);
8430
8431         ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
8432                                    empty_size, hint, &ins, 0, 0);
8433         if (ret)
8434                 goto out_unuse;
8435
8436         buf = btrfs_init_new_buffer(trans, root, ins.objectid, level);
8437         if (IS_ERR(buf)) {
8438                 ret = PTR_ERR(buf);
8439                 goto out_free_reserved;
8440         }
8441
8442         if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
8443                 if (parent == 0)
8444                         parent = ins.objectid;
8445                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
8446         } else
8447                 BUG_ON(parent > 0);
8448
8449         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
8450                 extent_op = btrfs_alloc_delayed_extent_op();
8451                 if (!extent_op) {
8452                         ret = -ENOMEM;
8453                         goto out_free_buf;
8454                 }
8455                 if (key)
8456                         memcpy(&extent_op->key, key, sizeof(extent_op->key));
8457                 else
8458                         memset(&extent_op->key, 0, sizeof(extent_op->key));
8459                 extent_op->flags_to_set = flags;
8460                 extent_op->update_key = skinny_metadata ? false : true;
8461                 extent_op->update_flags = true;
8462                 extent_op->is_data = false;
8463                 extent_op->level = level;
8464
8465                 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
8466                                                  ins.objectid, ins.offset,
8467                                                  parent, root_objectid, level,
8468                                                  BTRFS_ADD_DELAYED_EXTENT,
8469                                                  extent_op);
8470                 if (ret)
8471                         goto out_free_delayed;
8472         }
8473         return buf;
8474
8475 out_free_delayed:
8476         btrfs_free_delayed_extent_op(extent_op);
8477 out_free_buf:
8478         free_extent_buffer(buf);
8479 out_free_reserved:
8480         btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 0);
8481 out_unuse:
8482         unuse_block_rsv(root->fs_info, block_rsv, blocksize);
8483         return ERR_PTR(ret);
8484 }
8485
8486 struct walk_control {
8487         u64 refs[BTRFS_MAX_LEVEL];
8488         u64 flags[BTRFS_MAX_LEVEL];
8489         struct btrfs_key update_progress;
8490         int stage;
8491         int level;
8492         int shared_level;
8493         int update_ref;
8494         int keep_locks;
8495         int reada_slot;
8496         int reada_count;
8497         int for_reloc;
8498 };
8499
8500 #define DROP_REFERENCE  1
8501 #define UPDATE_BACKREF  2
8502
8503 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
8504                                      struct btrfs_root *root,
8505                                      struct walk_control *wc,
8506                                      struct btrfs_path *path)
8507 {
8508         u64 bytenr;
8509         u64 generation;
8510         u64 refs;
8511         u64 flags;
8512         u32 nritems;
8513         struct btrfs_key key;
8514         struct extent_buffer *eb;
8515         int ret;
8516         int slot;
8517         int nread = 0;
8518
8519         if (path->slots[wc->level] < wc->reada_slot) {
8520                 wc->reada_count = wc->reada_count * 2 / 3;
8521                 wc->reada_count = max(wc->reada_count, 2);
8522         } else {
8523                 wc->reada_count = wc->reada_count * 3 / 2;
8524                 wc->reada_count = min_t(int, wc->reada_count,
8525                                         BTRFS_NODEPTRS_PER_BLOCK(root));
8526         }
8527
8528         eb = path->nodes[wc->level];
8529         nritems = btrfs_header_nritems(eb);
8530
8531         for (slot = path->slots[wc->level]; slot < nritems; slot++) {
8532                 if (nread >= wc->reada_count)
8533                         break;
8534
8535                 cond_resched();
8536                 bytenr = btrfs_node_blockptr(eb, slot);
8537                 generation = btrfs_node_ptr_generation(eb, slot);
8538
8539                 if (slot == path->slots[wc->level])
8540                         goto reada;
8541
8542                 if (wc->stage == UPDATE_BACKREF &&
8543                     generation <= root->root_key.offset)
8544                         continue;
8545
8546                 /* We don't lock the tree block, it's OK to be racy here */
8547                 ret = btrfs_lookup_extent_info(trans, root, bytenr,
8548                                                wc->level - 1, 1, &refs,
8549                                                &flags);
8550                 /* We don't care about errors in readahead. */
8551                 if (ret < 0)
8552                         continue;
8553                 BUG_ON(refs == 0);
8554
8555                 if (wc->stage == DROP_REFERENCE) {
8556                         if (refs == 1)
8557                                 goto reada;
8558
8559                         if (wc->level == 1 &&
8560                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8561                                 continue;
8562                         if (!wc->update_ref ||
8563                             generation <= root->root_key.offset)
8564                                 continue;
8565                         btrfs_node_key_to_cpu(eb, &key, slot);
8566                         ret = btrfs_comp_cpu_keys(&key,
8567                                                   &wc->update_progress);
8568                         if (ret < 0)
8569                                 continue;
8570                 } else {
8571                         if (wc->level == 1 &&
8572                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8573                                 continue;
8574                 }
8575 reada:
8576                 readahead_tree_block(root, bytenr);
8577                 nread++;
8578         }
8579         wc->reada_slot = slot;
8580 }
8581
8582 static int account_leaf_items(struct btrfs_trans_handle *trans,
8583                               struct btrfs_root *root,
8584                               struct extent_buffer *eb)
8585 {
8586         int nr = btrfs_header_nritems(eb);
8587         int i, extent_type, ret;
8588         struct btrfs_key key;
8589         struct btrfs_file_extent_item *fi;
8590         u64 bytenr, num_bytes;
8591
8592         /* We can be called directly from walk_up_proc() */
8593         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags))
8594                 return 0;
8595
8596         for (i = 0; i < nr; i++) {
8597                 btrfs_item_key_to_cpu(eb, &key, i);
8598
8599                 if (key.type != BTRFS_EXTENT_DATA_KEY)
8600                         continue;
8601
8602                 fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
8603                 /* filter out non qgroup-accountable extents  */
8604                 extent_type = btrfs_file_extent_type(eb, fi);
8605
8606                 if (extent_type == BTRFS_FILE_EXTENT_INLINE)
8607                         continue;
8608
8609                 bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
8610                 if (!bytenr)
8611                         continue;
8612
8613                 num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
8614
8615                 ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info,
8616                                 bytenr, num_bytes, GFP_NOFS);
8617                 if (ret)
8618                         return ret;
8619         }
8620         return 0;
8621 }
8622
8623 /*
8624  * Walk up the tree from the bottom, freeing leaves and any interior
8625  * nodes which have had all slots visited. If a node (leaf or
8626  * interior) is freed, the node above it will have it's slot
8627  * incremented. The root node will never be freed.
8628  *
8629  * At the end of this function, we should have a path which has all
8630  * slots incremented to the next position for a search. If we need to
8631  * read a new node it will be NULL and the node above it will have the
8632  * correct slot selected for a later read.
8633  *
8634  * If we increment the root nodes slot counter past the number of
8635  * elements, 1 is returned to signal completion of the search.
8636  */
8637 static int adjust_slots_upwards(struct btrfs_root *root,
8638                                 struct btrfs_path *path, int root_level)
8639 {
8640         int level = 0;
8641         int nr, slot;
8642         struct extent_buffer *eb;
8643
8644         if (root_level == 0)
8645                 return 1;
8646
8647         while (level <= root_level) {
8648                 eb = path->nodes[level];
8649                 nr = btrfs_header_nritems(eb);
8650                 path->slots[level]++;
8651                 slot = path->slots[level];
8652                 if (slot >= nr || level == 0) {
8653                         /*
8654                          * Don't free the root -  we will detect this
8655                          * condition after our loop and return a
8656                          * positive value for caller to stop walking the tree.
8657                          */
8658                         if (level != root_level) {
8659                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
8660                                 path->locks[level] = 0;
8661
8662                                 free_extent_buffer(eb);
8663                                 path->nodes[level] = NULL;
8664                                 path->slots[level] = 0;
8665                         }
8666                 } else {
8667                         /*
8668                          * We have a valid slot to walk back down
8669                          * from. Stop here so caller can process these
8670                          * new nodes.
8671                          */
8672                         break;
8673                 }
8674
8675                 level++;
8676         }
8677
8678         eb = path->nodes[root_level];
8679         if (path->slots[root_level] >= btrfs_header_nritems(eb))
8680                 return 1;
8681
8682         return 0;
8683 }
8684
8685 /*
8686  * root_eb is the subtree root and is locked before this function is called.
8687  */
8688 static int account_shared_subtree(struct btrfs_trans_handle *trans,
8689                                   struct btrfs_root *root,
8690                                   struct extent_buffer *root_eb,
8691                                   u64 root_gen,
8692                                   int root_level)
8693 {
8694         int ret = 0;
8695         int level;
8696         struct extent_buffer *eb = root_eb;
8697         struct btrfs_path *path = NULL;
8698
8699         BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL);
8700         BUG_ON(root_eb == NULL);
8701
8702         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags))
8703                 return 0;
8704
8705         if (!extent_buffer_uptodate(root_eb)) {
8706                 ret = btrfs_read_buffer(root_eb, root_gen);
8707                 if (ret)
8708                         goto out;
8709         }
8710
8711         if (root_level == 0) {
8712                 ret = account_leaf_items(trans, root, root_eb);
8713                 goto out;
8714         }
8715
8716         path = btrfs_alloc_path();
8717         if (!path)
8718                 return -ENOMEM;
8719
8720         /*
8721          * Walk down the tree.  Missing extent blocks are filled in as
8722          * we go. Metadata is accounted every time we read a new
8723          * extent block.
8724          *
8725          * When we reach a leaf, we account for file extent items in it,
8726          * walk back up the tree (adjusting slot pointers as we go)
8727          * and restart the search process.
8728          */
8729         extent_buffer_get(root_eb); /* For path */
8730         path->nodes[root_level] = root_eb;
8731         path->slots[root_level] = 0;
8732         path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
8733 walk_down:
8734         level = root_level;
8735         while (level >= 0) {
8736                 if (path->nodes[level] == NULL) {
8737                         int parent_slot;
8738                         u64 child_gen;
8739                         u64 child_bytenr;
8740
8741                         /* We need to get child blockptr/gen from
8742                          * parent before we can read it. */
8743                         eb = path->nodes[level + 1];
8744                         parent_slot = path->slots[level + 1];
8745                         child_bytenr = btrfs_node_blockptr(eb, parent_slot);
8746                         child_gen = btrfs_node_ptr_generation(eb, parent_slot);
8747
8748                         eb = read_tree_block(root, child_bytenr, child_gen);
8749                         if (IS_ERR(eb)) {
8750                                 ret = PTR_ERR(eb);
8751                                 goto out;
8752                         } else if (!extent_buffer_uptodate(eb)) {
8753                                 free_extent_buffer(eb);
8754                                 ret = -EIO;
8755                                 goto out;
8756                         }
8757
8758                         path->nodes[level] = eb;
8759                         path->slots[level] = 0;
8760
8761                         btrfs_tree_read_lock(eb);
8762                         btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
8763                         path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
8764
8765                         ret = btrfs_qgroup_insert_dirty_extent(trans,
8766                                         root->fs_info, child_bytenr,
8767                                         root->nodesize, GFP_NOFS);
8768                         if (ret)
8769                                 goto out;
8770                 }
8771
8772                 if (level == 0) {
8773                         ret = account_leaf_items(trans, root, path->nodes[level]);
8774                         if (ret)
8775                                 goto out;
8776
8777                         /* Nonzero return here means we completed our search */
8778                         ret = adjust_slots_upwards(root, path, root_level);
8779                         if (ret)
8780                                 break;
8781
8782                         /* Restart search with new slots */
8783                         goto walk_down;
8784                 }
8785
8786                 level--;
8787         }
8788
8789         ret = 0;
8790 out:
8791         btrfs_free_path(path);
8792
8793         return ret;
8794 }
8795
8796 /*
8797  * helper to process tree block while walking down the tree.
8798  *
8799  * when wc->stage == UPDATE_BACKREF, this function updates
8800  * back refs for pointers in the block.
8801  *
8802  * NOTE: return value 1 means we should stop walking down.
8803  */
8804 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
8805                                    struct btrfs_root *root,
8806                                    struct btrfs_path *path,
8807                                    struct walk_control *wc, int lookup_info)
8808 {
8809         int level = wc->level;
8810         struct extent_buffer *eb = path->nodes[level];
8811         u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
8812         int ret;
8813
8814         if (wc->stage == UPDATE_BACKREF &&
8815             btrfs_header_owner(eb) != root->root_key.objectid)
8816                 return 1;
8817
8818         /*
8819          * when reference count of tree block is 1, it won't increase
8820          * again. once full backref flag is set, we never clear it.
8821          */
8822         if (lookup_info &&
8823             ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
8824              (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
8825                 BUG_ON(!path->locks[level]);
8826                 ret = btrfs_lookup_extent_info(trans, root,
8827                                                eb->start, level, 1,
8828                                                &wc->refs[level],
8829                                                &wc->flags[level]);
8830                 BUG_ON(ret == -ENOMEM);
8831                 if (ret)
8832                         return ret;
8833                 BUG_ON(wc->refs[level] == 0);
8834         }
8835
8836         if (wc->stage == DROP_REFERENCE) {
8837                 if (wc->refs[level] > 1)
8838                         return 1;
8839
8840                 if (path->locks[level] && !wc->keep_locks) {
8841                         btrfs_tree_unlock_rw(eb, path->locks[level]);
8842                         path->locks[level] = 0;
8843                 }
8844                 return 0;
8845         }
8846
8847         /* wc->stage == UPDATE_BACKREF */
8848         if (!(wc->flags[level] & flag)) {
8849                 BUG_ON(!path->locks[level]);
8850                 ret = btrfs_inc_ref(trans, root, eb, 1);
8851                 BUG_ON(ret); /* -ENOMEM */
8852                 ret = btrfs_dec_ref(trans, root, eb, 0);
8853                 BUG_ON(ret); /* -ENOMEM */
8854                 ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
8855                                                   eb->len, flag,
8856                                                   btrfs_header_level(eb), 0);
8857                 BUG_ON(ret); /* -ENOMEM */
8858                 wc->flags[level] |= flag;
8859         }
8860
8861         /*
8862          * the block is shared by multiple trees, so it's not good to
8863          * keep the tree lock
8864          */
8865         if (path->locks[level] && level > 0) {
8866                 btrfs_tree_unlock_rw(eb, path->locks[level]);
8867                 path->locks[level] = 0;
8868         }
8869         return 0;
8870 }
8871
8872 /*
8873  * helper to process tree block pointer.
8874  *
8875  * when wc->stage == DROP_REFERENCE, this function checks
8876  * reference count of the block pointed to. if the block
8877  * is shared and we need update back refs for the subtree
8878  * rooted at the block, this function changes wc->stage to
8879  * UPDATE_BACKREF. if the block is shared and there is no
8880  * need to update back, this function drops the reference
8881  * to the block.
8882  *
8883  * NOTE: return value 1 means we should stop walking down.
8884  */
8885 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
8886                                  struct btrfs_root *root,
8887                                  struct btrfs_path *path,
8888                                  struct walk_control *wc, int *lookup_info)
8889 {
8890         u64 bytenr;
8891         u64 generation;
8892         u64 parent;
8893         u32 blocksize;
8894         struct btrfs_key key;
8895         struct extent_buffer *next;
8896         int level = wc->level;
8897         int reada = 0;
8898         int ret = 0;
8899         bool need_account = false;
8900
8901         generation = btrfs_node_ptr_generation(path->nodes[level],
8902                                                path->slots[level]);
8903         /*
8904          * if the lower level block was created before the snapshot
8905          * was created, we know there is no need to update back refs
8906          * for the subtree
8907          */
8908         if (wc->stage == UPDATE_BACKREF &&
8909             generation <= root->root_key.offset) {
8910                 *lookup_info = 1;
8911                 return 1;
8912         }
8913
8914         bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
8915         blocksize = root->nodesize;
8916
8917         next = btrfs_find_tree_block(root->fs_info, bytenr);
8918         if (!next) {
8919                 next = btrfs_find_create_tree_block(root, bytenr);
8920                 if (IS_ERR(next))
8921                         return PTR_ERR(next);
8922
8923                 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
8924                                                level - 1);
8925                 reada = 1;
8926         }
8927         btrfs_tree_lock(next);
8928         btrfs_set_lock_blocking(next);
8929
8930         ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1,
8931                                        &wc->refs[level - 1],
8932                                        &wc->flags[level - 1]);
8933         if (ret < 0) {
8934                 btrfs_tree_unlock(next);
8935                 return ret;
8936         }
8937
8938         if (unlikely(wc->refs[level - 1] == 0)) {
8939                 btrfs_err(root->fs_info, "Missing references.");
8940                 BUG();
8941         }
8942         *lookup_info = 0;
8943
8944         if (wc->stage == DROP_REFERENCE) {
8945                 if (wc->refs[level - 1] > 1) {
8946                         need_account = true;
8947                         if (level == 1 &&
8948                             (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8949                                 goto skip;
8950
8951                         if (!wc->update_ref ||
8952                             generation <= root->root_key.offset)
8953                                 goto skip;
8954
8955                         btrfs_node_key_to_cpu(path->nodes[level], &key,
8956                                               path->slots[level]);
8957                         ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
8958                         if (ret < 0)
8959                                 goto skip;
8960
8961                         wc->stage = UPDATE_BACKREF;
8962                         wc->shared_level = level - 1;
8963                 }
8964         } else {
8965                 if (level == 1 &&
8966                     (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8967                         goto skip;
8968         }
8969
8970         if (!btrfs_buffer_uptodate(next, generation, 0)) {
8971                 btrfs_tree_unlock(next);
8972                 free_extent_buffer(next);
8973                 next = NULL;
8974                 *lookup_info = 1;
8975         }
8976
8977         if (!next) {
8978                 if (reada && level == 1)
8979                         reada_walk_down(trans, root, wc, path);
8980                 next = read_tree_block(root, bytenr, generation);
8981                 if (IS_ERR(next)) {
8982                         return PTR_ERR(next);
8983                 } else if (!extent_buffer_uptodate(next)) {
8984                         free_extent_buffer(next);
8985                         return -EIO;
8986                 }
8987                 btrfs_tree_lock(next);
8988                 btrfs_set_lock_blocking(next);
8989         }
8990
8991         level--;
8992         BUG_ON(level != btrfs_header_level(next));
8993         path->nodes[level] = next;
8994         path->slots[level] = 0;
8995         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8996         wc->level = level;
8997         if (wc->level == 1)
8998                 wc->reada_slot = 0;
8999         return 0;
9000 skip:
9001         wc->refs[level - 1] = 0;
9002         wc->flags[level - 1] = 0;
9003         if (wc->stage == DROP_REFERENCE) {
9004                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
9005                         parent = path->nodes[level]->start;
9006                 } else {
9007                         BUG_ON(root->root_key.objectid !=
9008                                btrfs_header_owner(path->nodes[level]));
9009                         parent = 0;
9010                 }
9011
9012                 if (need_account) {
9013                         ret = account_shared_subtree(trans, root, next,
9014                                                      generation, level - 1);
9015                         if (ret) {
9016                                 btrfs_err_rl(root->fs_info,
9017                                         "Error "
9018                                         "%d accounting shared subtree. Quota "
9019                                         "is out of sync, rescan required.",
9020                                         ret);
9021                         }
9022                 }
9023                 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
9024                                 root->root_key.objectid, level - 1, 0);
9025                 BUG_ON(ret); /* -ENOMEM */
9026         }
9027         btrfs_tree_unlock(next);
9028         free_extent_buffer(next);
9029         *lookup_info = 1;
9030         return 1;
9031 }
9032
9033 /*
9034  * helper to process tree block while walking up the tree.
9035  *
9036  * when wc->stage == DROP_REFERENCE, this function drops
9037  * reference count on the block.
9038  *
9039  * when wc->stage == UPDATE_BACKREF, this function changes
9040  * wc->stage back to DROP_REFERENCE if we changed wc->stage
9041  * to UPDATE_BACKREF previously while processing the block.
9042  *
9043  * NOTE: return value 1 means we should stop walking up.
9044  */
9045 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
9046                                  struct btrfs_root *root,
9047                                  struct btrfs_path *path,
9048                                  struct walk_control *wc)
9049 {
9050         int ret;
9051         int level = wc->level;
9052         struct extent_buffer *eb = path->nodes[level];
9053         u64 parent = 0;
9054
9055         if (wc->stage == UPDATE_BACKREF) {
9056                 BUG_ON(wc->shared_level < level);
9057                 if (level < wc->shared_level)
9058                         goto out;
9059
9060                 ret = find_next_key(path, level + 1, &wc->update_progress);
9061                 if (ret > 0)
9062                         wc->update_ref = 0;
9063
9064                 wc->stage = DROP_REFERENCE;
9065                 wc->shared_level = -1;
9066                 path->slots[level] = 0;
9067
9068                 /*
9069                  * check reference count again if the block isn't locked.
9070                  * we should start walking down the tree again if reference
9071                  * count is one.
9072                  */
9073                 if (!path->locks[level]) {
9074                         BUG_ON(level == 0);
9075                         btrfs_tree_lock(eb);
9076                         btrfs_set_lock_blocking(eb);
9077                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9078
9079                         ret = btrfs_lookup_extent_info(trans, root,
9080                                                        eb->start, level, 1,
9081                                                        &wc->refs[level],
9082                                                        &wc->flags[level]);
9083                         if (ret < 0) {
9084                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
9085                                 path->locks[level] = 0;
9086                                 return ret;
9087                         }
9088                         BUG_ON(wc->refs[level] == 0);
9089                         if (wc->refs[level] == 1) {
9090                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
9091                                 path->locks[level] = 0;
9092                                 return 1;
9093                         }
9094                 }
9095         }
9096
9097         /* wc->stage == DROP_REFERENCE */
9098         BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
9099
9100         if (wc->refs[level] == 1) {
9101                 if (level == 0) {
9102                         if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
9103                                 ret = btrfs_dec_ref(trans, root, eb, 1);
9104                         else
9105                                 ret = btrfs_dec_ref(trans, root, eb, 0);
9106                         BUG_ON(ret); /* -ENOMEM */
9107                         ret = account_leaf_items(trans, root, eb);
9108                         if (ret) {
9109                                 btrfs_err_rl(root->fs_info,
9110                                         "error "
9111                                         "%d accounting leaf items. Quota "
9112                                         "is out of sync, rescan required.",
9113                                         ret);
9114                         }
9115                 }
9116                 /* make block locked assertion in clean_tree_block happy */
9117                 if (!path->locks[level] &&
9118                     btrfs_header_generation(eb) == trans->transid) {
9119                         btrfs_tree_lock(eb);
9120                         btrfs_set_lock_blocking(eb);
9121                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9122                 }
9123                 clean_tree_block(trans, root->fs_info, eb);
9124         }
9125
9126         if (eb == root->node) {
9127                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
9128                         parent = eb->start;
9129                 else
9130                         BUG_ON(root->root_key.objectid !=
9131                                btrfs_header_owner(eb));
9132         } else {
9133                 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
9134                         parent = path->nodes[level + 1]->start;
9135                 else
9136                         BUG_ON(root->root_key.objectid !=
9137                                btrfs_header_owner(path->nodes[level + 1]));
9138         }
9139
9140         btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
9141 out:
9142         wc->refs[level] = 0;
9143         wc->flags[level] = 0;
9144         return 0;
9145 }
9146
9147 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
9148                                    struct btrfs_root *root,
9149                                    struct btrfs_path *path,
9150                                    struct walk_control *wc)
9151 {
9152         int level = wc->level;
9153         int lookup_info = 1;
9154         int ret;
9155
9156         while (level >= 0) {
9157                 ret = walk_down_proc(trans, root, path, wc, lookup_info);
9158                 if (ret > 0)
9159                         break;
9160
9161                 if (level == 0)
9162                         break;
9163
9164                 if (path->slots[level] >=
9165                     btrfs_header_nritems(path->nodes[level]))
9166                         break;
9167
9168                 ret = do_walk_down(trans, root, path, wc, &lookup_info);
9169                 if (ret > 0) {
9170                         path->slots[level]++;
9171                         continue;
9172                 } else if (ret < 0)
9173                         return ret;
9174                 level = wc->level;
9175         }
9176         return 0;
9177 }
9178
9179 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
9180                                  struct btrfs_root *root,
9181                                  struct btrfs_path *path,
9182                                  struct walk_control *wc, int max_level)
9183 {
9184         int level = wc->level;
9185         int ret;
9186
9187         path->slots[level] = btrfs_header_nritems(path->nodes[level]);
9188         while (level < max_level && path->nodes[level]) {
9189                 wc->level = level;
9190                 if (path->slots[level] + 1 <
9191                     btrfs_header_nritems(path->nodes[level])) {
9192                         path->slots[level]++;
9193                         return 0;
9194                 } else {
9195                         ret = walk_up_proc(trans, root, path, wc);
9196                         if (ret > 0)
9197                                 return 0;
9198
9199                         if (path->locks[level]) {
9200                                 btrfs_tree_unlock_rw(path->nodes[level],
9201                                                      path->locks[level]);
9202                                 path->locks[level] = 0;
9203                         }
9204                         free_extent_buffer(path->nodes[level]);
9205                         path->nodes[level] = NULL;
9206                         level++;
9207                 }
9208         }
9209         return 1;
9210 }
9211
9212 /*
9213  * drop a subvolume tree.
9214  *
9215  * this function traverses the tree freeing any blocks that only
9216  * referenced by the tree.
9217  *
9218  * when a shared tree block is found. this function decreases its
9219  * reference count by one. if update_ref is true, this function
9220  * also make sure backrefs for the shared block and all lower level
9221  * blocks are properly updated.
9222  *
9223  * If called with for_reloc == 0, may exit early with -EAGAIN
9224  */
9225 int btrfs_drop_snapshot(struct btrfs_root *root,
9226                          struct btrfs_block_rsv *block_rsv, int update_ref,
9227                          int for_reloc)
9228 {
9229         struct btrfs_path *path;
9230         struct btrfs_trans_handle *trans;
9231         struct btrfs_root *tree_root = root->fs_info->tree_root;
9232         struct btrfs_root_item *root_item = &root->root_item;
9233         struct walk_control *wc;
9234         struct btrfs_key key;
9235         int err = 0;
9236         int ret;
9237         int level;
9238         bool root_dropped = false;
9239
9240         btrfs_debug(root->fs_info, "Drop subvolume %llu", root->objectid);
9241
9242         path = btrfs_alloc_path();
9243         if (!path) {
9244                 err = -ENOMEM;
9245                 goto out;
9246         }
9247
9248         wc = kzalloc(sizeof(*wc), GFP_NOFS);
9249         if (!wc) {
9250                 btrfs_free_path(path);
9251                 err = -ENOMEM;
9252                 goto out;
9253         }
9254
9255         trans = btrfs_start_transaction(tree_root, 0);
9256         if (IS_ERR(trans)) {
9257                 err = PTR_ERR(trans);
9258                 goto out_free;
9259         }
9260
9261         if (block_rsv)
9262                 trans->block_rsv = block_rsv;
9263
9264         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
9265                 level = btrfs_header_level(root->node);
9266                 path->nodes[level] = btrfs_lock_root_node(root);
9267                 btrfs_set_lock_blocking(path->nodes[level]);
9268                 path->slots[level] = 0;
9269                 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9270                 memset(&wc->update_progress, 0,
9271                        sizeof(wc->update_progress));
9272         } else {
9273                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
9274                 memcpy(&wc->update_progress, &key,
9275                        sizeof(wc->update_progress));
9276
9277                 level = root_item->drop_level;
9278                 BUG_ON(level == 0);
9279                 path->lowest_level = level;
9280                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
9281                 path->lowest_level = 0;
9282                 if (ret < 0) {
9283                         err = ret;
9284                         goto out_end_trans;
9285                 }
9286                 WARN_ON(ret > 0);
9287
9288                 /*
9289                  * unlock our path, this is safe because only this
9290                  * function is allowed to delete this snapshot
9291                  */
9292                 btrfs_unlock_up_safe(path, 0);
9293
9294                 level = btrfs_header_level(root->node);
9295                 while (1) {
9296                         btrfs_tree_lock(path->nodes[level]);
9297                         btrfs_set_lock_blocking(path->nodes[level]);
9298                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9299
9300                         ret = btrfs_lookup_extent_info(trans, root,
9301                                                 path->nodes[level]->start,
9302                                                 level, 1, &wc->refs[level],
9303                                                 &wc->flags[level]);
9304                         if (ret < 0) {
9305                                 err = ret;
9306                                 goto out_end_trans;
9307                         }
9308                         BUG_ON(wc->refs[level] == 0);
9309
9310                         if (level == root_item->drop_level)
9311                                 break;
9312
9313                         btrfs_tree_unlock(path->nodes[level]);
9314                         path->locks[level] = 0;
9315                         WARN_ON(wc->refs[level] != 1);
9316                         level--;
9317                 }
9318         }
9319
9320         wc->level = level;
9321         wc->shared_level = -1;
9322         wc->stage = DROP_REFERENCE;
9323         wc->update_ref = update_ref;
9324         wc->keep_locks = 0;
9325         wc->for_reloc = for_reloc;
9326         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
9327
9328         while (1) {
9329
9330                 ret = walk_down_tree(trans, root, path, wc);
9331                 if (ret < 0) {
9332                         err = ret;
9333                         break;
9334                 }
9335
9336                 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
9337                 if (ret < 0) {
9338                         err = ret;
9339                         break;
9340                 }
9341
9342                 if (ret > 0) {
9343                         BUG_ON(wc->stage != DROP_REFERENCE);
9344                         break;
9345                 }
9346
9347                 if (wc->stage == DROP_REFERENCE) {
9348                         level = wc->level;
9349                         btrfs_node_key(path->nodes[level],
9350                                        &root_item->drop_progress,
9351                                        path->slots[level]);
9352                         root_item->drop_level = level;
9353                 }
9354
9355                 BUG_ON(wc->level == 0);
9356                 if (btrfs_should_end_transaction(trans, tree_root) ||
9357                     (!for_reloc && btrfs_need_cleaner_sleep(root))) {
9358                         ret = btrfs_update_root(trans, tree_root,
9359                                                 &root->root_key,
9360                                                 root_item);
9361                         if (ret) {
9362                                 btrfs_abort_transaction(trans, ret);
9363                                 err = ret;
9364                                 goto out_end_trans;
9365                         }
9366
9367                         btrfs_end_transaction_throttle(trans, tree_root);
9368                         if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
9369                                 pr_debug("BTRFS: drop snapshot early exit\n");
9370                                 err = -EAGAIN;
9371                                 goto out_free;
9372                         }
9373
9374                         trans = btrfs_start_transaction(tree_root, 0);
9375                         if (IS_ERR(trans)) {
9376                                 err = PTR_ERR(trans);
9377                                 goto out_free;
9378                         }
9379                         if (block_rsv)
9380                                 trans->block_rsv = block_rsv;
9381                 }
9382         }
9383         btrfs_release_path(path);
9384         if (err)
9385                 goto out_end_trans;
9386
9387         ret = btrfs_del_root(trans, tree_root, &root->root_key);
9388         if (ret) {
9389                 btrfs_abort_transaction(trans, ret);
9390                 goto out_end_trans;
9391         }
9392
9393         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
9394                 ret = btrfs_find_root(tree_root, &root->root_key, path,
9395                                       NULL, NULL);
9396                 if (ret < 0) {
9397                         btrfs_abort_transaction(trans, ret);
9398                         err = ret;
9399                         goto out_end_trans;
9400                 } else if (ret > 0) {
9401                         /* if we fail to delete the orphan item this time
9402                          * around, it'll get picked up the next time.
9403                          *
9404                          * The most common failure here is just -ENOENT.
9405                          */
9406                         btrfs_del_orphan_item(trans, tree_root,
9407                                               root->root_key.objectid);
9408                 }
9409         }
9410
9411         if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
9412                 btrfs_add_dropped_root(trans, root);
9413         } else {
9414                 free_extent_buffer(root->node);
9415                 free_extent_buffer(root->commit_root);
9416                 btrfs_put_fs_root(root);
9417         }
9418         root_dropped = true;
9419 out_end_trans:
9420         btrfs_end_transaction_throttle(trans, tree_root);
9421 out_free:
9422         kfree(wc);
9423         btrfs_free_path(path);
9424 out:
9425         /*
9426          * So if we need to stop dropping the snapshot for whatever reason we
9427          * need to make sure to add it back to the dead root list so that we
9428          * keep trying to do the work later.  This also cleans up roots if we
9429          * don't have it in the radix (like when we recover after a power fail
9430          * or unmount) so we don't leak memory.
9431          */
9432         if (!for_reloc && root_dropped == false)
9433                 btrfs_add_dead_root(root);
9434         if (err && err != -EAGAIN)
9435                 btrfs_handle_fs_error(root->fs_info, err, NULL);
9436         return err;
9437 }
9438
9439 /*
9440  * drop subtree rooted at tree block 'node'.
9441  *
9442  * NOTE: this function will unlock and release tree block 'node'
9443  * only used by relocation code
9444  */
9445 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
9446                         struct btrfs_root *root,
9447                         struct extent_buffer *node,
9448                         struct extent_buffer *parent)
9449 {
9450         struct btrfs_path *path;
9451         struct walk_control *wc;
9452         int level;
9453         int parent_level;
9454         int ret = 0;
9455         int wret;
9456
9457         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
9458
9459         path = btrfs_alloc_path();
9460         if (!path)
9461                 return -ENOMEM;
9462
9463         wc = kzalloc(sizeof(*wc), GFP_NOFS);
9464         if (!wc) {
9465                 btrfs_free_path(path);
9466                 return -ENOMEM;
9467         }
9468
9469         btrfs_assert_tree_locked(parent);
9470         parent_level = btrfs_header_level(parent);
9471         extent_buffer_get(parent);
9472         path->nodes[parent_level] = parent;
9473         path->slots[parent_level] = btrfs_header_nritems(parent);
9474
9475         btrfs_assert_tree_locked(node);
9476         level = btrfs_header_level(node);
9477         path->nodes[level] = node;
9478         path->slots[level] = 0;
9479         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9480
9481         wc->refs[parent_level] = 1;
9482         wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
9483         wc->level = level;
9484         wc->shared_level = -1;
9485         wc->stage = DROP_REFERENCE;
9486         wc->update_ref = 0;
9487         wc->keep_locks = 1;
9488         wc->for_reloc = 1;
9489         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
9490
9491         while (1) {
9492                 wret = walk_down_tree(trans, root, path, wc);
9493                 if (wret < 0) {
9494                         ret = wret;
9495                         break;
9496                 }
9497
9498                 wret = walk_up_tree(trans, root, path, wc, parent_level);
9499                 if (wret < 0)
9500                         ret = wret;
9501                 if (wret != 0)
9502                         break;
9503         }
9504
9505         kfree(wc);
9506         btrfs_free_path(path);
9507         return ret;
9508 }
9509
9510 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
9511 {
9512         u64 num_devices;
9513         u64 stripped;
9514
9515         /*
9516          * if restripe for this chunk_type is on pick target profile and
9517          * return, otherwise do the usual balance
9518          */
9519         stripped = get_restripe_target(root->fs_info, flags);
9520         if (stripped)
9521                 return extended_to_chunk(stripped);
9522
9523         num_devices = root->fs_info->fs_devices->rw_devices;
9524
9525         stripped = BTRFS_BLOCK_GROUP_RAID0 |
9526                 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
9527                 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
9528
9529         if (num_devices == 1) {
9530                 stripped |= BTRFS_BLOCK_GROUP_DUP;
9531                 stripped = flags & ~stripped;
9532
9533                 /* turn raid0 into single device chunks */
9534                 if (flags & BTRFS_BLOCK_GROUP_RAID0)
9535                         return stripped;
9536
9537                 /* turn mirroring into duplication */
9538                 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
9539                              BTRFS_BLOCK_GROUP_RAID10))
9540                         return stripped | BTRFS_BLOCK_GROUP_DUP;
9541         } else {
9542                 /* they already had raid on here, just return */
9543                 if (flags & stripped)
9544                         return flags;
9545
9546                 stripped |= BTRFS_BLOCK_GROUP_DUP;
9547                 stripped = flags & ~stripped;
9548
9549                 /* switch duplicated blocks with raid1 */
9550                 if (flags & BTRFS_BLOCK_GROUP_DUP)
9551                         return stripped | BTRFS_BLOCK_GROUP_RAID1;
9552
9553                 /* this is drive concat, leave it alone */
9554         }
9555
9556         return flags;
9557 }
9558
9559 static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
9560 {
9561         struct btrfs_space_info *sinfo = cache->space_info;
9562         u64 num_bytes;
9563         u64 min_allocable_bytes;
9564         int ret = -ENOSPC;
9565
9566         /*
9567          * We need some metadata space and system metadata space for
9568          * allocating chunks in some corner cases until we force to set
9569          * it to be readonly.
9570          */
9571         if ((sinfo->flags &
9572              (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
9573             !force)
9574                 min_allocable_bytes = SZ_1M;
9575         else
9576                 min_allocable_bytes = 0;
9577
9578         spin_lock(&sinfo->lock);
9579         spin_lock(&cache->lock);
9580
9581         if (cache->ro) {
9582                 cache->ro++;
9583                 ret = 0;
9584                 goto out;
9585         }
9586
9587         num_bytes = cache->key.offset - cache->reserved - cache->pinned -
9588                     cache->bytes_super - btrfs_block_group_used(&cache->item);
9589
9590         if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
9591             sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
9592             min_allocable_bytes <= sinfo->total_bytes) {
9593                 sinfo->bytes_readonly += num_bytes;
9594                 cache->ro++;
9595                 list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
9596                 ret = 0;
9597         }
9598 out:
9599         spin_unlock(&cache->lock);
9600         spin_unlock(&sinfo->lock);
9601         return ret;
9602 }
9603
9604 int btrfs_inc_block_group_ro(struct btrfs_root *root,
9605                              struct btrfs_block_group_cache *cache)
9606
9607 {
9608         struct btrfs_trans_handle *trans;
9609         u64 alloc_flags;
9610         int ret;
9611
9612 again:
9613         trans = btrfs_join_transaction(root);
9614         if (IS_ERR(trans))
9615                 return PTR_ERR(trans);
9616
9617         /*
9618          * we're not allowed to set block groups readonly after the dirty
9619          * block groups cache has started writing.  If it already started,
9620          * back off and let this transaction commit
9621          */
9622         mutex_lock(&root->fs_info->ro_block_group_mutex);
9623         if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
9624                 u64 transid = trans->transid;
9625
9626                 mutex_unlock(&root->fs_info->ro_block_group_mutex);
9627                 btrfs_end_transaction(trans, root);
9628
9629                 ret = btrfs_wait_for_commit(root, transid);
9630                 if (ret)
9631                         return ret;
9632                 goto again;
9633         }
9634
9635         /*
9636          * if we are changing raid levels, try to allocate a corresponding
9637          * block group with the new raid level.
9638          */
9639         alloc_flags = update_block_group_flags(root, cache->flags);
9640         if (alloc_flags != cache->flags) {
9641                 ret = do_chunk_alloc(trans, root, alloc_flags,
9642                                      CHUNK_ALLOC_FORCE);
9643                 /*
9644                  * ENOSPC is allowed here, we may have enough space
9645                  * already allocated at the new raid level to
9646                  * carry on
9647                  */
9648                 if (ret == -ENOSPC)
9649                         ret = 0;
9650                 if (ret < 0)
9651                         goto out;
9652         }
9653
9654         ret = inc_block_group_ro(cache, 0);
9655         if (!ret)
9656                 goto out;
9657         alloc_flags = get_alloc_profile(root, cache->space_info->flags);
9658         ret = do_chunk_alloc(trans, root, alloc_flags,
9659                              CHUNK_ALLOC_FORCE);
9660         if (ret < 0)
9661                 goto out;
9662         ret = inc_block_group_ro(cache, 0);
9663 out:
9664         if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
9665                 alloc_flags = update_block_group_flags(root, cache->flags);
9666                 lock_chunks(root->fs_info->chunk_root);
9667                 check_system_chunk(trans, root, alloc_flags);
9668                 unlock_chunks(root->fs_info->chunk_root);
9669         }
9670         mutex_unlock(&root->fs_info->ro_block_group_mutex);
9671
9672         btrfs_end_transaction(trans, root);
9673         return ret;
9674 }
9675
9676 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
9677                             struct btrfs_root *root, u64 type)
9678 {
9679         u64 alloc_flags = get_alloc_profile(root, type);
9680         return do_chunk_alloc(trans, root, alloc_flags,
9681                               CHUNK_ALLOC_FORCE);
9682 }
9683
9684 /*
9685  * helper to account the unused space of all the readonly block group in the
9686  * space_info. takes mirrors into account.
9687  */
9688 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
9689 {
9690         struct btrfs_block_group_cache *block_group;
9691         u64 free_bytes = 0;
9692         int factor;
9693
9694         /* It's df, we don't care if it's racy */
9695         if (list_empty(&sinfo->ro_bgs))
9696                 return 0;
9697
9698         spin_lock(&sinfo->lock);
9699         list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
9700                 spin_lock(&block_group->lock);
9701
9702                 if (!block_group->ro) {
9703                         spin_unlock(&block_group->lock);
9704                         continue;
9705                 }
9706
9707                 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
9708                                           BTRFS_BLOCK_GROUP_RAID10 |
9709                                           BTRFS_BLOCK_GROUP_DUP))
9710                         factor = 2;
9711                 else
9712                         factor = 1;
9713
9714                 free_bytes += (block_group->key.offset -
9715                                btrfs_block_group_used(&block_group->item)) *
9716                                factor;
9717
9718                 spin_unlock(&block_group->lock);
9719         }
9720         spin_unlock(&sinfo->lock);
9721
9722         return free_bytes;
9723 }
9724
9725 void btrfs_dec_block_group_ro(struct btrfs_root *root,
9726                               struct btrfs_block_group_cache *cache)
9727 {
9728         struct btrfs_space_info *sinfo = cache->space_info;
9729         u64 num_bytes;
9730
9731         BUG_ON(!cache->ro);
9732
9733         spin_lock(&sinfo->lock);
9734         spin_lock(&cache->lock);
9735         if (!--cache->ro) {
9736                 num_bytes = cache->key.offset - cache->reserved -
9737                             cache->pinned - cache->bytes_super -
9738                             btrfs_block_group_used(&cache->item);
9739                 sinfo->bytes_readonly -= num_bytes;
9740                 list_del_init(&cache->ro_list);
9741         }
9742         spin_unlock(&cache->lock);
9743         spin_unlock(&sinfo->lock);
9744 }
9745
9746 /*
9747  * checks to see if its even possible to relocate this block group.
9748  *
9749  * @return - -1 if it's not a good idea to relocate this block group, 0 if its
9750  * ok to go ahead and try.
9751  */
9752 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
9753 {
9754         struct btrfs_block_group_cache *block_group;
9755         struct btrfs_space_info *space_info;
9756         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
9757         struct btrfs_device *device;
9758         struct btrfs_trans_handle *trans;
9759         u64 min_free;
9760         u64 dev_min = 1;
9761         u64 dev_nr = 0;
9762         u64 target;
9763         int debug;
9764         int index;
9765         int full = 0;
9766         int ret = 0;
9767
9768         debug = btrfs_test_opt(root->fs_info, ENOSPC_DEBUG);
9769
9770         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
9771
9772         /* odd, couldn't find the block group, leave it alone */
9773         if (!block_group) {
9774                 if (debug)
9775                         btrfs_warn(root->fs_info,
9776                                    "can't find block group for bytenr %llu",
9777                                    bytenr);
9778                 return -1;
9779         }
9780
9781         min_free = btrfs_block_group_used(&block_group->item);
9782
9783         /* no bytes used, we're good */
9784         if (!min_free)
9785                 goto out;
9786
9787         space_info = block_group->space_info;
9788         spin_lock(&space_info->lock);
9789
9790         full = space_info->full;
9791
9792         /*
9793          * if this is the last block group we have in this space, we can't
9794          * relocate it unless we're able to allocate a new chunk below.
9795          *
9796          * Otherwise, we need to make sure we have room in the space to handle
9797          * all of the extents from this block group.  If we can, we're good
9798          */
9799         if ((space_info->total_bytes != block_group->key.offset) &&
9800             (space_info->bytes_used + space_info->bytes_reserved +
9801              space_info->bytes_pinned + space_info->bytes_readonly +
9802              min_free < space_info->total_bytes)) {
9803                 spin_unlock(&space_info->lock);
9804                 goto out;
9805         }
9806         spin_unlock(&space_info->lock);
9807
9808         /*
9809          * ok we don't have enough space, but maybe we have free space on our
9810          * devices to allocate new chunks for relocation, so loop through our
9811          * alloc devices and guess if we have enough space.  if this block
9812          * group is going to be restriped, run checks against the target
9813          * profile instead of the current one.
9814          */
9815         ret = -1;
9816
9817         /*
9818          * index:
9819          *      0: raid10
9820          *      1: raid1
9821          *      2: dup
9822          *      3: raid0
9823          *      4: single
9824          */
9825         target = get_restripe_target(root->fs_info, block_group->flags);
9826         if (target) {
9827                 index = __get_raid_index(extended_to_chunk(target));
9828         } else {
9829                 /*
9830                  * this is just a balance, so if we were marked as full
9831                  * we know there is no space for a new chunk
9832                  */
9833                 if (full) {
9834                         if (debug)
9835                                 btrfs_warn(root->fs_info,
9836                                         "no space to alloc new chunk for block group %llu",
9837                                         block_group->key.objectid);
9838                         goto out;
9839                 }
9840
9841                 index = get_block_group_index(block_group);
9842         }
9843
9844         if (index == BTRFS_RAID_RAID10) {
9845                 dev_min = 4;
9846                 /* Divide by 2 */
9847                 min_free >>= 1;
9848         } else if (index == BTRFS_RAID_RAID1) {
9849                 dev_min = 2;
9850         } else if (index == BTRFS_RAID_DUP) {
9851                 /* Multiply by 2 */
9852                 min_free <<= 1;
9853         } else if (index == BTRFS_RAID_RAID0) {
9854                 dev_min = fs_devices->rw_devices;
9855                 min_free = div64_u64(min_free, dev_min);
9856         }
9857
9858         /* We need to do this so that we can look at pending chunks */
9859         trans = btrfs_join_transaction(root);
9860         if (IS_ERR(trans)) {
9861                 ret = PTR_ERR(trans);
9862                 goto out;
9863         }
9864
9865         mutex_lock(&root->fs_info->chunk_mutex);
9866         list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
9867                 u64 dev_offset;
9868
9869                 /*
9870                  * check to make sure we can actually find a chunk with enough
9871                  * space to fit our block group in.
9872                  */
9873                 if (device->total_bytes > device->bytes_used + min_free &&
9874                     !device->is_tgtdev_for_dev_replace) {
9875                         ret = find_free_dev_extent(trans, device, min_free,
9876                                                    &dev_offset, NULL);
9877                         if (!ret)
9878                                 dev_nr++;
9879
9880                         if (dev_nr >= dev_min)
9881                                 break;
9882
9883                         ret = -1;
9884                 }
9885         }
9886         if (debug && ret == -1)
9887                 btrfs_warn(root->fs_info,
9888                         "no space to allocate a new chunk for block group %llu",
9889                         block_group->key.objectid);
9890         mutex_unlock(&root->fs_info->chunk_mutex);
9891         btrfs_end_transaction(trans, root);
9892 out:
9893         btrfs_put_block_group(block_group);
9894         return ret;
9895 }
9896
9897 static int find_first_block_group(struct btrfs_root *root,
9898                 struct btrfs_path *path, struct btrfs_key *key)
9899 {
9900         int ret = 0;
9901         struct btrfs_key found_key;
9902         struct extent_buffer *leaf;
9903         int slot;
9904
9905         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
9906         if (ret < 0)
9907                 goto out;
9908
9909         while (1) {
9910                 slot = path->slots[0];
9911                 leaf = path->nodes[0];
9912                 if (slot >= btrfs_header_nritems(leaf)) {
9913                         ret = btrfs_next_leaf(root, path);
9914                         if (ret == 0)
9915                                 continue;
9916                         if (ret < 0)
9917                                 goto out;
9918                         break;
9919                 }
9920                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
9921
9922                 if (found_key.objectid >= key->objectid &&
9923                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
9924                         struct extent_map_tree *em_tree;
9925                         struct extent_map *em;
9926
9927                         em_tree = &root->fs_info->mapping_tree.map_tree;
9928                         read_lock(&em_tree->lock);
9929                         em = lookup_extent_mapping(em_tree, found_key.objectid,
9930                                                    found_key.offset);
9931                         read_unlock(&em_tree->lock);
9932                         if (!em) {
9933                                 btrfs_err(root->fs_info,
9934                         "logical %llu len %llu found bg but no related chunk",
9935                                           found_key.objectid, found_key.offset);
9936                                 ret = -ENOENT;
9937                         } else {
9938                                 ret = 0;
9939                         }
9940                         free_extent_map(em);
9941                         goto out;
9942                 }
9943                 path->slots[0]++;
9944         }
9945 out:
9946         return ret;
9947 }
9948
9949 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
9950 {
9951         struct btrfs_block_group_cache *block_group;
9952         u64 last = 0;
9953
9954         while (1) {
9955                 struct inode *inode;
9956
9957                 block_group = btrfs_lookup_first_block_group(info, last);
9958                 while (block_group) {
9959                         spin_lock(&block_group->lock);
9960                         if (block_group->iref)
9961                                 break;
9962                         spin_unlock(&block_group->lock);
9963                         block_group = next_block_group(info->tree_root,
9964                                                        block_group);
9965                 }
9966                 if (!block_group) {
9967                         if (last == 0)
9968                                 break;
9969                         last = 0;
9970                         continue;
9971                 }
9972
9973                 inode = block_group->inode;
9974                 block_group->iref = 0;
9975                 block_group->inode = NULL;
9976                 spin_unlock(&block_group->lock);
9977                 ASSERT(block_group->io_ctl.inode == NULL);
9978                 iput(inode);
9979                 last = block_group->key.objectid + block_group->key.offset;
9980                 btrfs_put_block_group(block_group);
9981         }
9982 }
9983
9984 int btrfs_free_block_groups(struct btrfs_fs_info *info)
9985 {
9986         struct btrfs_block_group_cache *block_group;
9987         struct btrfs_space_info *space_info;
9988         struct btrfs_caching_control *caching_ctl;
9989         struct rb_node *n;
9990
9991         down_write(&info->commit_root_sem);
9992         while (!list_empty(&info->caching_block_groups)) {
9993                 caching_ctl = list_entry(info->caching_block_groups.next,
9994                                          struct btrfs_caching_control, list);
9995                 list_del(&caching_ctl->list);
9996                 put_caching_control(caching_ctl);
9997         }
9998         up_write(&info->commit_root_sem);
9999
10000         spin_lock(&info->unused_bgs_lock);
10001         while (!list_empty(&info->unused_bgs)) {
10002                 block_group = list_first_entry(&info->unused_bgs,
10003                                                struct btrfs_block_group_cache,
10004                                                bg_list);
10005                 list_del_init(&block_group->bg_list);
10006                 btrfs_put_block_group(block_group);
10007         }
10008         spin_unlock(&info->unused_bgs_lock);
10009
10010         spin_lock(&info->block_group_cache_lock);
10011         while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
10012                 block_group = rb_entry(n, struct btrfs_block_group_cache,
10013                                        cache_node);
10014                 rb_erase(&block_group->cache_node,
10015                          &info->block_group_cache_tree);
10016                 RB_CLEAR_NODE(&block_group->cache_node);
10017                 spin_unlock(&info->block_group_cache_lock);
10018
10019                 down_write(&block_group->space_info->groups_sem);
10020                 list_del(&block_group->list);
10021                 up_write(&block_group->space_info->groups_sem);
10022
10023                 if (block_group->cached == BTRFS_CACHE_STARTED)
10024                         wait_block_group_cache_done(block_group);
10025
10026                 /*
10027                  * We haven't cached this block group, which means we could
10028                  * possibly have excluded extents on this block group.
10029                  */
10030                 if (block_group->cached == BTRFS_CACHE_NO ||
10031                     block_group->cached == BTRFS_CACHE_ERROR)
10032                         free_excluded_extents(info->extent_root, block_group);
10033
10034                 btrfs_remove_free_space_cache(block_group);
10035                 ASSERT(list_empty(&block_group->dirty_list));
10036                 ASSERT(list_empty(&block_group->io_list));
10037                 ASSERT(list_empty(&block_group->bg_list));
10038                 ASSERT(atomic_read(&block_group->count) == 1);
10039                 btrfs_put_block_group(block_group);
10040
10041                 spin_lock(&info->block_group_cache_lock);
10042         }
10043         spin_unlock(&info->block_group_cache_lock);
10044
10045         /* now that all the block groups are freed, go through and
10046          * free all the space_info structs.  This is only called during
10047          * the final stages of unmount, and so we know nobody is
10048          * using them.  We call synchronize_rcu() once before we start,
10049          * just to be on the safe side.
10050          */
10051         synchronize_rcu();
10052
10053         release_global_block_rsv(info);
10054
10055         while (!list_empty(&info->space_info)) {
10056                 int i;
10057
10058                 space_info = list_entry(info->space_info.next,
10059                                         struct btrfs_space_info,
10060                                         list);
10061
10062                 /*
10063                  * Do not hide this behind enospc_debug, this is actually
10064                  * important and indicates a real bug if this happens.
10065                  */
10066                 if (WARN_ON(space_info->bytes_pinned > 0 ||
10067                             space_info->bytes_reserved > 0 ||
10068                             space_info->bytes_may_use > 0))
10069                         dump_space_info(space_info, 0, 0);
10070                 list_del(&space_info->list);
10071                 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
10072                         struct kobject *kobj;
10073                         kobj = space_info->block_group_kobjs[i];
10074                         space_info->block_group_kobjs[i] = NULL;
10075                         if (kobj) {
10076                                 kobject_del(kobj);
10077                                 kobject_put(kobj);
10078                         }
10079                 }
10080                 kobject_del(&space_info->kobj);
10081                 kobject_put(&space_info->kobj);
10082         }
10083         return 0;
10084 }
10085
10086 static void __link_block_group(struct btrfs_space_info *space_info,
10087                                struct btrfs_block_group_cache *cache)
10088 {
10089         int index = get_block_group_index(cache);
10090         bool first = false;
10091
10092         down_write(&space_info->groups_sem);
10093         if (list_empty(&space_info->block_groups[index]))
10094                 first = true;
10095         list_add_tail(&cache->list, &space_info->block_groups[index]);
10096         up_write(&space_info->groups_sem);
10097
10098         if (first) {
10099                 struct raid_kobject *rkobj;
10100                 int ret;
10101
10102                 rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
10103                 if (!rkobj)
10104                         goto out_err;
10105                 rkobj->raid_type = index;
10106                 kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
10107                 ret = kobject_add(&rkobj->kobj, &space_info->kobj,
10108                                   "%s", get_raid_name(index));
10109                 if (ret) {
10110                         kobject_put(&rkobj->kobj);
10111                         goto out_err;
10112                 }
10113                 space_info->block_group_kobjs[index] = &rkobj->kobj;
10114         }
10115
10116         return;
10117 out_err:
10118         pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n");
10119 }
10120
10121 static struct btrfs_block_group_cache *
10122 btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
10123 {
10124         struct btrfs_block_group_cache *cache;
10125
10126         cache = kzalloc(sizeof(*cache), GFP_NOFS);
10127         if (!cache)
10128                 return NULL;
10129
10130         cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
10131                                         GFP_NOFS);
10132         if (!cache->free_space_ctl) {
10133                 kfree(cache);
10134                 return NULL;
10135         }
10136
10137         cache->key.objectid = start;
10138         cache->key.offset = size;
10139         cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
10140
10141         cache->sectorsize = root->sectorsize;
10142         cache->fs_info = root->fs_info;
10143         cache->full_stripe_len = btrfs_full_stripe_len(root,
10144                                                &root->fs_info->mapping_tree,
10145                                                start);
10146         set_free_space_tree_thresholds(cache);
10147
10148         atomic_set(&cache->count, 1);
10149         spin_lock_init(&cache->lock);
10150         init_rwsem(&cache->data_rwsem);
10151         INIT_LIST_HEAD(&cache->list);
10152         INIT_LIST_HEAD(&cache->cluster_list);
10153         INIT_LIST_HEAD(&cache->bg_list);
10154         INIT_LIST_HEAD(&cache->ro_list);
10155         INIT_LIST_HEAD(&cache->dirty_list);
10156         INIT_LIST_HEAD(&cache->io_list);
10157         btrfs_init_free_space_ctl(cache);
10158         atomic_set(&cache->trimming, 0);
10159         mutex_init(&cache->free_space_lock);
10160
10161         return cache;
10162 }
10163
10164 int btrfs_read_block_groups(struct btrfs_root *root)
10165 {
10166         struct btrfs_path *path;
10167         int ret;
10168         struct btrfs_block_group_cache *cache;
10169         struct btrfs_fs_info *info = root->fs_info;
10170         struct btrfs_space_info *space_info;
10171         struct btrfs_key key;
10172         struct btrfs_key found_key;
10173         struct extent_buffer *leaf;
10174         int need_clear = 0;
10175         u64 cache_gen;
10176         u64 feature;
10177         int mixed;
10178
10179         feature = btrfs_super_incompat_flags(info->super_copy);
10180         mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
10181
10182         root = info->extent_root;
10183         key.objectid = 0;
10184         key.offset = 0;
10185         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
10186         path = btrfs_alloc_path();
10187         if (!path)
10188                 return -ENOMEM;
10189         path->reada = READA_FORWARD;
10190
10191         cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
10192         if (btrfs_test_opt(root->fs_info, SPACE_CACHE) &&
10193             btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
10194                 need_clear = 1;
10195         if (btrfs_test_opt(root->fs_info, CLEAR_CACHE))
10196                 need_clear = 1;
10197
10198         while (1) {
10199                 ret = find_first_block_group(root, path, &key);
10200                 if (ret > 0)
10201                         break;
10202                 if (ret != 0)
10203                         goto error;
10204
10205                 leaf = path->nodes[0];
10206                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
10207
10208                 cache = btrfs_create_block_group_cache(root, found_key.objectid,
10209                                                        found_key.offset);
10210                 if (!cache) {
10211                         ret = -ENOMEM;
10212                         goto error;
10213                 }
10214
10215                 if (need_clear) {
10216                         /*
10217                          * When we mount with old space cache, we need to
10218                          * set BTRFS_DC_CLEAR and set dirty flag.
10219                          *
10220                          * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
10221                          *    truncate the old free space cache inode and
10222                          *    setup a new one.
10223                          * b) Setting 'dirty flag' makes sure that we flush
10224                          *    the new space cache info onto disk.
10225                          */
10226                         if (btrfs_test_opt(root->fs_info, SPACE_CACHE))
10227                                 cache->disk_cache_state = BTRFS_DC_CLEAR;
10228                 }
10229
10230                 read_extent_buffer(leaf, &cache->item,
10231                                    btrfs_item_ptr_offset(leaf, path->slots[0]),
10232                                    sizeof(cache->item));
10233                 cache->flags = btrfs_block_group_flags(&cache->item);
10234                 if (!mixed &&
10235                     ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
10236                     (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
10237                         btrfs_err(info,
10238 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
10239                                   cache->key.objectid);
10240                         ret = -EINVAL;
10241                         goto error;
10242                 }
10243
10244                 key.objectid = found_key.objectid + found_key.offset;
10245                 btrfs_release_path(path);
10246
10247                 /*
10248                  * We need to exclude the super stripes now so that the space
10249                  * info has super bytes accounted for, otherwise we'll think
10250                  * we have more space than we actually do.
10251                  */
10252                 ret = exclude_super_stripes(root, cache);
10253                 if (ret) {
10254                         /*
10255                          * We may have excluded something, so call this just in
10256                          * case.
10257                          */
10258                         free_excluded_extents(root, cache);
10259                         btrfs_put_block_group(cache);
10260                         goto error;
10261                 }
10262
10263                 /*
10264                  * check for two cases, either we are full, and therefore
10265                  * don't need to bother with the caching work since we won't
10266                  * find any space, or we are empty, and we can just add all
10267                  * the space in and be done with it.  This saves us _alot_ of
10268                  * time, particularly in the full case.
10269                  */
10270                 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
10271                         cache->last_byte_to_unpin = (u64)-1;
10272                         cache->cached = BTRFS_CACHE_FINISHED;
10273                         free_excluded_extents(root, cache);
10274                 } else if (btrfs_block_group_used(&cache->item) == 0) {
10275                         cache->last_byte_to_unpin = (u64)-1;
10276                         cache->cached = BTRFS_CACHE_FINISHED;
10277                         add_new_free_space(cache, root->fs_info,
10278                                            found_key.objectid,
10279                                            found_key.objectid +
10280                                            found_key.offset);
10281                         free_excluded_extents(root, cache);
10282                 }
10283
10284                 ret = btrfs_add_block_group_cache(root->fs_info, cache);
10285                 if (ret) {
10286                         btrfs_remove_free_space_cache(cache);
10287                         btrfs_put_block_group(cache);
10288                         goto error;
10289                 }
10290
10291                 trace_btrfs_add_block_group(root->fs_info, cache, 0);
10292                 ret = update_space_info(info, cache->flags, found_key.offset,
10293                                         btrfs_block_group_used(&cache->item),
10294                                         cache->bytes_super, &space_info);
10295                 if (ret) {
10296                         btrfs_remove_free_space_cache(cache);
10297                         spin_lock(&info->block_group_cache_lock);
10298                         rb_erase(&cache->cache_node,
10299                                  &info->block_group_cache_tree);
10300                         RB_CLEAR_NODE(&cache->cache_node);
10301                         spin_unlock(&info->block_group_cache_lock);
10302                         btrfs_put_block_group(cache);
10303                         goto error;
10304                 }
10305
10306                 cache->space_info = space_info;
10307
10308                 __link_block_group(space_info, cache);
10309
10310                 set_avail_alloc_bits(root->fs_info, cache->flags);
10311                 if (btrfs_chunk_readonly(root, cache->key.objectid)) {
10312                         inc_block_group_ro(cache, 1);
10313                 } else if (btrfs_block_group_used(&cache->item) == 0) {
10314                         spin_lock(&info->unused_bgs_lock);
10315                         /* Should always be true but just in case. */
10316                         if (list_empty(&cache->bg_list)) {
10317                                 btrfs_get_block_group(cache);
10318                                 list_add_tail(&cache->bg_list,
10319                                               &info->unused_bgs);
10320                         }
10321                         spin_unlock(&info->unused_bgs_lock);
10322                 }
10323         }
10324
10325         list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
10326                 if (!(get_alloc_profile(root, space_info->flags) &
10327                       (BTRFS_BLOCK_GROUP_RAID10 |
10328                        BTRFS_BLOCK_GROUP_RAID1 |
10329                        BTRFS_BLOCK_GROUP_RAID5 |
10330                        BTRFS_BLOCK_GROUP_RAID6 |
10331                        BTRFS_BLOCK_GROUP_DUP)))
10332                         continue;
10333                 /*
10334                  * avoid allocating from un-mirrored block group if there are
10335                  * mirrored block groups.
10336                  */
10337                 list_for_each_entry(cache,
10338                                 &space_info->block_groups[BTRFS_RAID_RAID0],
10339                                 list)
10340                         inc_block_group_ro(cache, 1);
10341                 list_for_each_entry(cache,
10342                                 &space_info->block_groups[BTRFS_RAID_SINGLE],
10343                                 list)
10344                         inc_block_group_ro(cache, 1);
10345         }
10346
10347         init_global_block_rsv(info);
10348         ret = 0;
10349 error:
10350         btrfs_free_path(path);
10351         return ret;
10352 }
10353
10354 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
10355                                        struct btrfs_root *root)
10356 {
10357         struct btrfs_block_group_cache *block_group, *tmp;
10358         struct btrfs_root *extent_root = root->fs_info->extent_root;
10359         struct btrfs_block_group_item item;
10360         struct btrfs_key key;
10361         int ret = 0;
10362         bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
10363
10364         trans->can_flush_pending_bgs = false;
10365         list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
10366                 if (ret)
10367                         goto next;
10368
10369                 spin_lock(&block_group->lock);
10370                 memcpy(&item, &block_group->item, sizeof(item));
10371                 memcpy(&key, &block_group->key, sizeof(key));
10372                 spin_unlock(&block_group->lock);
10373
10374                 ret = btrfs_insert_item(trans, extent_root, &key, &item,
10375                                         sizeof(item));
10376                 if (ret)
10377                         btrfs_abort_transaction(trans, ret);
10378                 ret = btrfs_finish_chunk_alloc(trans, extent_root,
10379                                                key.objectid, key.offset);
10380                 if (ret)
10381                         btrfs_abort_transaction(trans, ret);
10382                 add_block_group_free_space(trans, root->fs_info, block_group);
10383                 /* already aborted the transaction if it failed. */
10384 next:
10385                 list_del_init(&block_group->bg_list);
10386         }
10387         trans->can_flush_pending_bgs = can_flush_pending_bgs;
10388 }
10389
10390 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
10391                            struct btrfs_root *root, u64 bytes_used,
10392                            u64 type, u64 chunk_objectid, u64 chunk_offset,
10393                            u64 size)
10394 {
10395         int ret;
10396         struct btrfs_root *extent_root;
10397         struct btrfs_block_group_cache *cache;
10398         extent_root = root->fs_info->extent_root;
10399
10400         btrfs_set_log_full_commit(root->fs_info, trans);
10401
10402         cache = btrfs_create_block_group_cache(root, chunk_offset, size);
10403         if (!cache)
10404                 return -ENOMEM;
10405
10406         btrfs_set_block_group_used(&cache->item, bytes_used);
10407         btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
10408         btrfs_set_block_group_flags(&cache->item, type);
10409
10410         cache->flags = type;
10411         cache->last_byte_to_unpin = (u64)-1;
10412         cache->cached = BTRFS_CACHE_FINISHED;
10413         cache->needs_free_space = 1;
10414         ret = exclude_super_stripes(root, cache);
10415         if (ret) {
10416                 /*
10417                  * We may have excluded something, so call this just in
10418                  * case.
10419                  */
10420                 free_excluded_extents(root, cache);
10421                 btrfs_put_block_group(cache);
10422                 return ret;
10423         }
10424
10425         add_new_free_space(cache, root->fs_info, chunk_offset,
10426                            chunk_offset + size);
10427
10428         free_excluded_extents(root, cache);
10429
10430 #ifdef CONFIG_BTRFS_DEBUG
10431         if (btrfs_should_fragment_free_space(root, cache)) {
10432                 u64 new_bytes_used = size - bytes_used;
10433
10434                 bytes_used += new_bytes_used >> 1;
10435                 fragment_free_space(root, cache);
10436         }
10437 #endif
10438         /*
10439          * Call to ensure the corresponding space_info object is created and
10440          * assigned to our block group, but don't update its counters just yet.
10441          * We want our bg to be added to the rbtree with its ->space_info set.
10442          */
10443         ret = update_space_info(root->fs_info, cache->flags, 0, 0, 0,
10444                                 &cache->space_info);
10445         if (ret) {
10446                 btrfs_remove_free_space_cache(cache);
10447                 btrfs_put_block_group(cache);
10448                 return ret;
10449         }
10450
10451         ret = btrfs_add_block_group_cache(root->fs_info, cache);
10452         if (ret) {
10453                 btrfs_remove_free_space_cache(cache);
10454                 btrfs_put_block_group(cache);
10455                 return ret;
10456         }
10457
10458         /*
10459          * Now that our block group has its ->space_info set and is inserted in
10460          * the rbtree, update the space info's counters.
10461          */
10462         trace_btrfs_add_block_group(root->fs_info, cache, 1);
10463         ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
10464                                 cache->bytes_super, &cache->space_info);
10465         if (ret) {
10466                 btrfs_remove_free_space_cache(cache);
10467                 spin_lock(&root->fs_info->block_group_cache_lock);
10468                 rb_erase(&cache->cache_node,
10469                          &root->fs_info->block_group_cache_tree);
10470                 RB_CLEAR_NODE(&cache->cache_node);
10471                 spin_unlock(&root->fs_info->block_group_cache_lock);
10472                 btrfs_put_block_group(cache);
10473                 return ret;
10474         }
10475         update_global_block_rsv(root->fs_info);
10476
10477         __link_block_group(cache->space_info, cache);
10478
10479         list_add_tail(&cache->bg_list, &trans->new_bgs);
10480
10481         set_avail_alloc_bits(extent_root->fs_info, type);
10482         return 0;
10483 }
10484
10485 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
10486 {
10487         u64 extra_flags = chunk_to_extended(flags) &
10488                                 BTRFS_EXTENDED_PROFILE_MASK;
10489
10490         write_seqlock(&fs_info->profiles_lock);
10491         if (flags & BTRFS_BLOCK_GROUP_DATA)
10492                 fs_info->avail_data_alloc_bits &= ~extra_flags;
10493         if (flags & BTRFS_BLOCK_GROUP_METADATA)
10494                 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
10495         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
10496                 fs_info->avail_system_alloc_bits &= ~extra_flags;
10497         write_sequnlock(&fs_info->profiles_lock);
10498 }
10499
10500 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
10501                              struct btrfs_root *root, u64 group_start,
10502                              struct extent_map *em)
10503 {
10504         struct btrfs_path *path;
10505         struct btrfs_block_group_cache *block_group;
10506         struct btrfs_free_cluster *cluster;
10507         struct btrfs_root *tree_root = root->fs_info->tree_root;
10508         struct btrfs_key key;
10509         struct inode *inode;
10510         struct kobject *kobj = NULL;
10511         int ret;
10512         int index;
10513         int factor;
10514         struct btrfs_caching_control *caching_ctl = NULL;
10515         bool remove_em;
10516
10517         root = root->fs_info->extent_root;
10518
10519         block_group = btrfs_lookup_block_group(root->fs_info, group_start);
10520         BUG_ON(!block_group);
10521         BUG_ON(!block_group->ro);
10522
10523         /*
10524          * Free the reserved super bytes from this block group before
10525          * remove it.
10526          */
10527         free_excluded_extents(root, block_group);
10528
10529         memcpy(&key, &block_group->key, sizeof(key));
10530         index = get_block_group_index(block_group);
10531         if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
10532                                   BTRFS_BLOCK_GROUP_RAID1 |
10533                                   BTRFS_BLOCK_GROUP_RAID10))
10534                 factor = 2;
10535         else
10536                 factor = 1;
10537
10538         /* make sure this block group isn't part of an allocation cluster */
10539         cluster = &root->fs_info->data_alloc_cluster;
10540         spin_lock(&cluster->refill_lock);
10541         btrfs_return_cluster_to_free_space(block_group, cluster);
10542         spin_unlock(&cluster->refill_lock);
10543
10544         /*
10545          * make sure this block group isn't part of a metadata
10546          * allocation cluster
10547          */
10548         cluster = &root->fs_info->meta_alloc_cluster;
10549         spin_lock(&cluster->refill_lock);
10550         btrfs_return_cluster_to_free_space(block_group, cluster);
10551         spin_unlock(&cluster->refill_lock);
10552
10553         path = btrfs_alloc_path();
10554         if (!path) {
10555                 ret = -ENOMEM;
10556                 goto out;
10557         }
10558
10559         /*
10560          * get the inode first so any iput calls done for the io_list
10561          * aren't the final iput (no unlinks allowed now)
10562          */
10563         inode = lookup_free_space_inode(tree_root, block_group, path);
10564
10565         mutex_lock(&trans->transaction->cache_write_mutex);
10566         /*
10567          * make sure our free spache cache IO is done before remove the
10568          * free space inode
10569          */
10570         spin_lock(&trans->transaction->dirty_bgs_lock);
10571         if (!list_empty(&block_group->io_list)) {
10572                 list_del_init(&block_group->io_list);
10573
10574                 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
10575
10576                 spin_unlock(&trans->transaction->dirty_bgs_lock);
10577                 btrfs_wait_cache_io(root, trans, block_group,
10578                                     &block_group->io_ctl, path,
10579                                     block_group->key.objectid);
10580                 btrfs_put_block_group(block_group);
10581                 spin_lock(&trans->transaction->dirty_bgs_lock);
10582         }
10583
10584         if (!list_empty(&block_group->dirty_list)) {
10585                 list_del_init(&block_group->dirty_list);
10586                 btrfs_put_block_group(block_group);
10587         }
10588         spin_unlock(&trans->transaction->dirty_bgs_lock);
10589         mutex_unlock(&trans->transaction->cache_write_mutex);
10590
10591         if (!IS_ERR(inode)) {
10592                 ret = btrfs_orphan_add(trans, inode);
10593                 if (ret) {
10594                         btrfs_add_delayed_iput(inode);
10595                         goto out;
10596                 }
10597                 clear_nlink(inode);
10598                 /* One for the block groups ref */
10599                 spin_lock(&block_group->lock);
10600                 if (block_group->iref) {
10601                         block_group->iref = 0;
10602                         block_group->inode = NULL;
10603                         spin_unlock(&block_group->lock);
10604                         iput(inode);
10605                 } else {
10606                         spin_unlock(&block_group->lock);
10607                 }
10608                 /* One for our lookup ref */
10609                 btrfs_add_delayed_iput(inode);
10610         }
10611
10612         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
10613         key.offset = block_group->key.objectid;
10614         key.type = 0;
10615
10616         ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
10617         if (ret < 0)
10618                 goto out;
10619         if (ret > 0)
10620                 btrfs_release_path(path);
10621         if (ret == 0) {
10622                 ret = btrfs_del_item(trans, tree_root, path);
10623                 if (ret)
10624                         goto out;
10625                 btrfs_release_path(path);
10626         }
10627
10628         spin_lock(&root->fs_info->block_group_cache_lock);
10629         rb_erase(&block_group->cache_node,
10630                  &root->fs_info->block_group_cache_tree);
10631         RB_CLEAR_NODE(&block_group->cache_node);
10632
10633         if (root->fs_info->first_logical_byte == block_group->key.objectid)
10634                 root->fs_info->first_logical_byte = (u64)-1;
10635         spin_unlock(&root->fs_info->block_group_cache_lock);
10636
10637         down_write(&block_group->space_info->groups_sem);
10638         /*
10639          * we must use list_del_init so people can check to see if they
10640          * are still on the list after taking the semaphore
10641          */
10642         list_del_init(&block_group->list);
10643         if (list_empty(&block_group->space_info->block_groups[index])) {
10644                 kobj = block_group->space_info->block_group_kobjs[index];
10645                 block_group->space_info->block_group_kobjs[index] = NULL;
10646                 clear_avail_alloc_bits(root->fs_info, block_group->flags);
10647         }
10648         up_write(&block_group->space_info->groups_sem);
10649         if (kobj) {
10650                 kobject_del(kobj);
10651                 kobject_put(kobj);
10652         }
10653
10654         if (block_group->has_caching_ctl)
10655                 caching_ctl = get_caching_control(block_group);
10656         if (block_group->cached == BTRFS_CACHE_STARTED)
10657                 wait_block_group_cache_done(block_group);
10658         if (block_group->has_caching_ctl) {
10659                 down_write(&root->fs_info->commit_root_sem);
10660                 if (!caching_ctl) {
10661                         struct btrfs_caching_control *ctl;
10662
10663                         list_for_each_entry(ctl,
10664                                     &root->fs_info->caching_block_groups, list)
10665                                 if (ctl->block_group == block_group) {
10666                                         caching_ctl = ctl;
10667                                         atomic_inc(&caching_ctl->count);
10668                                         break;
10669                                 }
10670                 }
10671                 if (caching_ctl)
10672                         list_del_init(&caching_ctl->list);
10673                 up_write(&root->fs_info->commit_root_sem);
10674                 if (caching_ctl) {
10675                         /* Once for the caching bgs list and once for us. */
10676                         put_caching_control(caching_ctl);
10677                         put_caching_control(caching_ctl);
10678                 }
10679         }
10680
10681         spin_lock(&trans->transaction->dirty_bgs_lock);
10682         if (!list_empty(&block_group->dirty_list)) {
10683                 WARN_ON(1);
10684         }
10685         if (!list_empty(&block_group->io_list)) {
10686                 WARN_ON(1);
10687         }
10688         spin_unlock(&trans->transaction->dirty_bgs_lock);
10689         btrfs_remove_free_space_cache(block_group);
10690
10691         spin_lock(&block_group->space_info->lock);
10692         list_del_init(&block_group->ro_list);
10693
10694         if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) {
10695                 WARN_ON(block_group->space_info->total_bytes
10696                         < block_group->key.offset);
10697                 WARN_ON(block_group->space_info->bytes_readonly
10698                         < block_group->key.offset);
10699                 WARN_ON(block_group->space_info->disk_total
10700                         < block_group->key.offset * factor);
10701         }
10702         block_group->space_info->total_bytes -= block_group->key.offset;
10703         block_group->space_info->bytes_readonly -= block_group->key.offset;
10704         block_group->space_info->disk_total -= block_group->key.offset * factor;
10705
10706         spin_unlock(&block_group->space_info->lock);
10707
10708         memcpy(&key, &block_group->key, sizeof(key));
10709
10710         lock_chunks(root);
10711         if (!list_empty(&em->list)) {
10712                 /* We're in the transaction->pending_chunks list. */
10713                 free_extent_map(em);
10714         }
10715         spin_lock(&block_group->lock);
10716         block_group->removed = 1;
10717         /*
10718          * At this point trimming can't start on this block group, because we
10719          * removed the block group from the tree fs_info->block_group_cache_tree
10720          * so no one can't find it anymore and even if someone already got this
10721          * block group before we removed it from the rbtree, they have already
10722          * incremented block_group->trimming - if they didn't, they won't find
10723          * any free space entries because we already removed them all when we
10724          * called btrfs_remove_free_space_cache().
10725          *
10726          * And we must not remove the extent map from the fs_info->mapping_tree
10727          * to prevent the same logical address range and physical device space
10728          * ranges from being reused for a new block group. This is because our
10729          * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
10730          * completely transactionless, so while it is trimming a range the
10731          * currently running transaction might finish and a new one start,
10732          * allowing for new block groups to be created that can reuse the same
10733          * physical device locations unless we take this special care.
10734          *
10735          * There may also be an implicit trim operation if the file system
10736          * is mounted with -odiscard. The same protections must remain
10737          * in place until the extents have been discarded completely when
10738          * the transaction commit has completed.
10739          */
10740         remove_em = (atomic_read(&block_group->trimming) == 0);
10741         /*
10742          * Make sure a trimmer task always sees the em in the pinned_chunks list
10743          * if it sees block_group->removed == 1 (needs to lock block_group->lock
10744          * before checking block_group->removed).
10745          */
10746         if (!remove_em) {
10747                 /*
10748                  * Our em might be in trans->transaction->pending_chunks which
10749                  * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
10750                  * and so is the fs_info->pinned_chunks list.
10751                  *
10752                  * So at this point we must be holding the chunk_mutex to avoid
10753                  * any races with chunk allocation (more specifically at
10754                  * volumes.c:contains_pending_extent()), to ensure it always
10755                  * sees the em, either in the pending_chunks list or in the
10756                  * pinned_chunks list.
10757                  */
10758                 list_move_tail(&em->list, &root->fs_info->pinned_chunks);
10759         }
10760         spin_unlock(&block_group->lock);
10761
10762         if (remove_em) {
10763                 struct extent_map_tree *em_tree;
10764
10765                 em_tree = &root->fs_info->mapping_tree.map_tree;
10766                 write_lock(&em_tree->lock);
10767                 /*
10768                  * The em might be in the pending_chunks list, so make sure the
10769                  * chunk mutex is locked, since remove_extent_mapping() will
10770                  * delete us from that list.
10771                  */
10772                 remove_extent_mapping(em_tree, em);
10773                 write_unlock(&em_tree->lock);
10774                 /* once for the tree */
10775                 free_extent_map(em);
10776         }
10777
10778         unlock_chunks(root);
10779
10780         ret = remove_block_group_free_space(trans, root->fs_info, block_group);
10781         if (ret)
10782                 goto out;
10783
10784         btrfs_put_block_group(block_group);
10785         btrfs_put_block_group(block_group);
10786
10787         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
10788         if (ret > 0)
10789                 ret = -EIO;
10790         if (ret < 0)
10791                 goto out;
10792
10793         ret = btrfs_del_item(trans, root, path);
10794 out:
10795         btrfs_free_path(path);
10796         return ret;
10797 }
10798
10799 struct btrfs_trans_handle *
10800 btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
10801                                      const u64 chunk_offset)
10802 {
10803         struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
10804         struct extent_map *em;
10805         struct map_lookup *map;
10806         unsigned int num_items;
10807
10808         read_lock(&em_tree->lock);
10809         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
10810         read_unlock(&em_tree->lock);
10811         ASSERT(em && em->start == chunk_offset);
10812
10813         /*
10814          * We need to reserve 3 + N units from the metadata space info in order
10815          * to remove a block group (done at btrfs_remove_chunk() and at
10816          * btrfs_remove_block_group()), which are used for:
10817          *
10818          * 1 unit for adding the free space inode's orphan (located in the tree
10819          * of tree roots).
10820          * 1 unit for deleting the block group item (located in the extent
10821          * tree).
10822          * 1 unit for deleting the free space item (located in tree of tree
10823          * roots).
10824          * N units for deleting N device extent items corresponding to each
10825          * stripe (located in the device tree).
10826          *
10827          * In order to remove a block group we also need to reserve units in the
10828          * system space info in order to update the chunk tree (update one or
10829          * more device items and remove one chunk item), but this is done at
10830          * btrfs_remove_chunk() through a call to check_system_chunk().
10831          */
10832         map = em->map_lookup;
10833         num_items = 3 + map->num_stripes;
10834         free_extent_map(em);
10835
10836         return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
10837                                                            num_items, 1);
10838 }
10839
10840 /*
10841  * Process the unused_bgs list and remove any that don't have any allocated
10842  * space inside of them.
10843  */
10844 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
10845 {
10846         struct btrfs_block_group_cache *block_group;
10847         struct btrfs_space_info *space_info;
10848         struct btrfs_root *root = fs_info->extent_root;
10849         struct btrfs_trans_handle *trans;
10850         int ret = 0;
10851
10852         if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
10853                 return;
10854
10855         spin_lock(&fs_info->unused_bgs_lock);
10856         while (!list_empty(&fs_info->unused_bgs)) {
10857                 u64 start, end;
10858                 int trimming;
10859
10860                 block_group = list_first_entry(&fs_info->unused_bgs,
10861                                                struct btrfs_block_group_cache,
10862                                                bg_list);
10863                 list_del_init(&block_group->bg_list);
10864
10865                 space_info = block_group->space_info;
10866
10867                 if (ret || btrfs_mixed_space_info(space_info)) {
10868                         btrfs_put_block_group(block_group);
10869                         continue;
10870                 }
10871                 spin_unlock(&fs_info->unused_bgs_lock);
10872
10873                 down_write(&root->fs_info->bg_delete_sem);
10874
10875                 /* Don't want to race with allocators so take the groups_sem */
10876                 down_write(&space_info->groups_sem);
10877                 spin_lock(&block_group->lock);
10878                 if (block_group->reserved ||
10879                     btrfs_block_group_used(&block_group->item) ||
10880                     (block_group->ro && !block_group->removed) ||
10881                     list_is_singular(&block_group->list)) {
10882                         /*
10883                          * We want to bail if we made new allocations or have
10884                          * outstanding allocations in this block group.  We do
10885                          * the ro check in case balance is currently acting on
10886                          * this block group.
10887                          */
10888                         spin_unlock(&block_group->lock);
10889                         up_write(&space_info->groups_sem);
10890                         goto next;
10891                 }
10892                 spin_unlock(&block_group->lock);
10893
10894                 /* We don't want to force the issue, only flip if it's ok. */
10895                 ret = inc_block_group_ro(block_group, 0);
10896                 up_write(&space_info->groups_sem);
10897                 if (ret < 0) {
10898                         ret = 0;
10899                         goto next;
10900                 }
10901
10902                 /*
10903                  * Want to do this before we do anything else so we can recover
10904                  * properly if we fail to join the transaction.
10905                  */
10906                 trans = btrfs_start_trans_remove_block_group(fs_info,
10907                                                      block_group->key.objectid);
10908                 if (IS_ERR(trans)) {
10909                         btrfs_dec_block_group_ro(root, block_group);
10910                         ret = PTR_ERR(trans);
10911                         goto next;
10912                 }
10913
10914                 /*
10915                  * We could have pending pinned extents for this block group,
10916                  * just delete them, we don't care about them anymore.
10917                  */
10918                 start = block_group->key.objectid;
10919                 end = start + block_group->key.offset - 1;
10920                 /*
10921                  * Hold the unused_bg_unpin_mutex lock to avoid racing with
10922                  * btrfs_finish_extent_commit(). If we are at transaction N,
10923                  * another task might be running finish_extent_commit() for the
10924                  * previous transaction N - 1, and have seen a range belonging
10925                  * to the block group in freed_extents[] before we were able to
10926                  * clear the whole block group range from freed_extents[]. This
10927                  * means that task can lookup for the block group after we
10928                  * unpinned it from freed_extents[] and removed it, leading to
10929                  * a BUG_ON() at btrfs_unpin_extent_range().
10930                  */
10931                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
10932                 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
10933                                   EXTENT_DIRTY);
10934                 if (ret) {
10935                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10936                         btrfs_dec_block_group_ro(root, block_group);
10937                         goto end_trans;
10938                 }
10939                 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
10940                                   EXTENT_DIRTY);
10941                 if (ret) {
10942                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10943                         btrfs_dec_block_group_ro(root, block_group);
10944                         goto end_trans;
10945                 }
10946                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10947
10948                 /* Reset pinned so btrfs_put_block_group doesn't complain */
10949                 spin_lock(&space_info->lock);
10950                 spin_lock(&block_group->lock);
10951
10952                 space_info->bytes_pinned -= block_group->pinned;
10953                 space_info->bytes_readonly += block_group->pinned;
10954                 percpu_counter_add(&space_info->total_bytes_pinned,
10955                                    -block_group->pinned);
10956                 block_group->pinned = 0;
10957
10958                 spin_unlock(&block_group->lock);
10959                 spin_unlock(&space_info->lock);
10960
10961                 /* DISCARD can flip during remount */
10962                 trimming = btrfs_test_opt(root->fs_info, DISCARD);
10963
10964                 /* Implicit trim during transaction commit. */
10965                 if (trimming)
10966                         btrfs_get_block_group_trimming(block_group);
10967
10968                 /*
10969                  * Btrfs_remove_chunk will abort the transaction if things go
10970                  * horribly wrong.
10971                  */
10972                 ret = btrfs_remove_chunk(trans, root,
10973                                          block_group->key.objectid);
10974
10975                 if (ret) {
10976                         if (trimming)
10977                                 btrfs_put_block_group_trimming(block_group);
10978                         goto end_trans;
10979                 }
10980
10981                 /*
10982                  * If we're not mounted with -odiscard, we can just forget
10983                  * about this block group. Otherwise we'll need to wait
10984                  * until transaction commit to do the actual discard.
10985                  */
10986                 if (trimming) {
10987                         spin_lock(&fs_info->unused_bgs_lock);
10988                         /*
10989                          * A concurrent scrub might have added us to the list
10990                          * fs_info->unused_bgs, so use a list_move operation
10991                          * to add the block group to the deleted_bgs list.
10992                          */
10993                         list_move(&block_group->bg_list,
10994                                   &trans->transaction->deleted_bgs);
10995                         spin_unlock(&fs_info->unused_bgs_lock);
10996                         btrfs_get_block_group(block_group);
10997                 }
10998 end_trans:
10999                 btrfs_end_transaction(trans, root);
11000 next:
11001                 up_write(&root->fs_info->bg_delete_sem);
11002                 btrfs_put_block_group(block_group);
11003                 spin_lock(&fs_info->unused_bgs_lock);
11004         }
11005         spin_unlock(&fs_info->unused_bgs_lock);
11006 }
11007
11008 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
11009 {
11010         struct btrfs_space_info *space_info;
11011         struct btrfs_super_block *disk_super;
11012         u64 features;
11013         u64 flags;
11014         int mixed = 0;
11015         int ret;
11016
11017         disk_super = fs_info->super_copy;
11018         if (!btrfs_super_root(disk_super))
11019                 return -EINVAL;
11020
11021         features = btrfs_super_incompat_flags(disk_super);
11022         if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
11023                 mixed = 1;
11024
11025         flags = BTRFS_BLOCK_GROUP_SYSTEM;
11026         ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
11027         if (ret)
11028                 goto out;
11029
11030         if (mixed) {
11031                 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
11032                 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
11033         } else {
11034                 flags = BTRFS_BLOCK_GROUP_METADATA;
11035                 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
11036                 if (ret)
11037                         goto out;
11038
11039                 flags = BTRFS_BLOCK_GROUP_DATA;
11040                 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
11041         }
11042 out:
11043         return ret;
11044 }
11045
11046 int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
11047 {
11048         return unpin_extent_range(root, start, end, false);
11049 }
11050
11051 /*
11052  * It used to be that old block groups would be left around forever.
11053  * Iterating over them would be enough to trim unused space.  Since we
11054  * now automatically remove them, we also need to iterate over unallocated
11055  * space.
11056  *
11057  * We don't want a transaction for this since the discard may take a
11058  * substantial amount of time.  We don't require that a transaction be
11059  * running, but we do need to take a running transaction into account
11060  * to ensure that we're not discarding chunks that were released in
11061  * the current transaction.
11062  *
11063  * Holding the chunks lock will prevent other threads from allocating
11064  * or releasing chunks, but it won't prevent a running transaction
11065  * from committing and releasing the memory that the pending chunks
11066  * list head uses.  For that, we need to take a reference to the
11067  * transaction.
11068  */
11069 static int btrfs_trim_free_extents(struct btrfs_device *device,
11070                                    u64 minlen, u64 *trimmed)
11071 {
11072         u64 start = 0, len = 0;
11073         int ret;
11074
11075         *trimmed = 0;
11076
11077         /* Not writeable = nothing to do. */
11078         if (!device->writeable)
11079                 return 0;
11080
11081         /* No free space = nothing to do. */
11082         if (device->total_bytes <= device->bytes_used)
11083                 return 0;
11084
11085         ret = 0;
11086
11087         while (1) {
11088                 struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
11089                 struct btrfs_transaction *trans;
11090                 u64 bytes;
11091
11092                 ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
11093                 if (ret)
11094                         return ret;
11095
11096                 down_read(&fs_info->commit_root_sem);
11097
11098                 spin_lock(&fs_info->trans_lock);
11099                 trans = fs_info->running_transaction;
11100                 if (trans)
11101                         atomic_inc(&trans->use_count);
11102                 spin_unlock(&fs_info->trans_lock);
11103
11104                 ret = find_free_dev_extent_start(trans, device, minlen, start,
11105                                                  &start, &len);
11106                 if (trans)
11107                         btrfs_put_transaction(trans);
11108
11109                 if (ret) {
11110                         up_read(&fs_info->commit_root_sem);
11111                         mutex_unlock(&fs_info->chunk_mutex);
11112                         if (ret == -ENOSPC)
11113                                 ret = 0;
11114                         break;
11115                 }
11116
11117                 ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
11118                 up_read(&fs_info->commit_root_sem);
11119                 mutex_unlock(&fs_info->chunk_mutex);
11120
11121                 if (ret)
11122                         break;
11123
11124                 start += len;
11125                 *trimmed += bytes;
11126
11127                 if (fatal_signal_pending(current)) {
11128                         ret = -ERESTARTSYS;
11129                         break;
11130                 }
11131
11132                 cond_resched();
11133         }
11134
11135         return ret;
11136 }
11137
11138 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
11139 {
11140         struct btrfs_fs_info *fs_info = root->fs_info;
11141         struct btrfs_block_group_cache *cache = NULL;
11142         struct btrfs_device *device;
11143         struct list_head *devices;
11144         u64 group_trimmed;
11145         u64 start;
11146         u64 end;
11147         u64 trimmed = 0;
11148         u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
11149         int ret = 0;
11150
11151         /*
11152          * try to trim all FS space, our block group may start from non-zero.
11153          */
11154         if (range->len == total_bytes)
11155                 cache = btrfs_lookup_first_block_group(fs_info, range->start);
11156         else
11157                 cache = btrfs_lookup_block_group(fs_info, range->start);
11158
11159         while (cache) {
11160                 if (cache->key.objectid >= (range->start + range->len)) {
11161                         btrfs_put_block_group(cache);
11162                         break;
11163                 }
11164
11165                 start = max(range->start, cache->key.objectid);
11166                 end = min(range->start + range->len,
11167                                 cache->key.objectid + cache->key.offset);
11168
11169                 if (end - start >= range->minlen) {
11170                         if (!block_group_cache_done(cache)) {
11171                                 ret = cache_block_group(cache, 0);
11172                                 if (ret) {
11173                                         btrfs_put_block_group(cache);
11174                                         break;
11175                                 }
11176                                 ret = wait_block_group_cache_done(cache);
11177                                 if (ret) {
11178                                         btrfs_put_block_group(cache);
11179                                         break;
11180                                 }
11181                         }
11182                         ret = btrfs_trim_block_group(cache,
11183                                                      &group_trimmed,
11184                                                      start,
11185                                                      end,
11186                                                      range->minlen);
11187
11188                         trimmed += group_trimmed;
11189                         if (ret) {
11190                                 btrfs_put_block_group(cache);
11191                                 break;
11192                         }
11193                 }
11194
11195                 cache = next_block_group(fs_info->tree_root, cache);
11196         }
11197
11198         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
11199         devices = &root->fs_info->fs_devices->alloc_list;
11200         list_for_each_entry(device, devices, dev_alloc_list) {
11201                 ret = btrfs_trim_free_extents(device, range->minlen,
11202                                               &group_trimmed);
11203                 if (ret)
11204                         break;
11205
11206                 trimmed += group_trimmed;
11207         }
11208         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
11209
11210         range->len = trimmed;
11211         return ret;
11212 }
11213
11214 /*
11215  * btrfs_{start,end}_write_no_snapshoting() are similar to
11216  * mnt_{want,drop}_write(), they are used to prevent some tasks from writing
11217  * data into the page cache through nocow before the subvolume is snapshoted,
11218  * but flush the data into disk after the snapshot creation, or to prevent
11219  * operations while snapshoting is ongoing and that cause the snapshot to be
11220  * inconsistent (writes followed by expanding truncates for example).
11221  */
11222 void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
11223 {
11224         percpu_counter_dec(&root->subv_writers->counter);
11225         /*
11226          * Make sure counter is updated before we wake up waiters.
11227          */
11228         smp_mb();
11229         if (waitqueue_active(&root->subv_writers->wait))
11230                 wake_up(&root->subv_writers->wait);
11231 }
11232
11233 int btrfs_start_write_no_snapshoting(struct btrfs_root *root)
11234 {
11235         if (atomic_read(&root->will_be_snapshoted))
11236                 return 0;
11237
11238         percpu_counter_inc(&root->subv_writers->counter);
11239         /*
11240          * Make sure counter is updated before we check for snapshot creation.
11241          */
11242         smp_mb();
11243         if (atomic_read(&root->will_be_snapshoted)) {
11244                 btrfs_end_write_no_snapshoting(root);
11245                 return 0;
11246         }
11247         return 1;
11248 }
11249
11250 static int wait_snapshoting_atomic_t(atomic_t *a)
11251 {
11252         schedule();
11253         return 0;
11254 }
11255
11256 void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
11257 {
11258         while (true) {
11259                 int ret;
11260
11261                 ret = btrfs_start_write_no_snapshoting(root);
11262                 if (ret)
11263                         break;
11264                 wait_on_atomic_t(&root->will_be_snapshoted,
11265                                  wait_snapshoting_atomic_t,
11266                                  TASK_UNINTERRUPTIBLE);
11267         }
11268 }