cleaner_kthread() may run at any time, in which it'll call btrfs_delete_unused_bgs()
to delete unused block groups. Because this work is asynchronous, it may also result
in false ENOSPC error. Please see below race window:
CPU1 | CPU2
|
|-> btrfs_alloc_data_chunk_ondemand() |-> cleaner_kthread()
|-> do_chunk_alloc() | |
| assume it returns ENOSPC, which means | |
| btrfs_space_info is full and have free| |
| space to satisfy data request. | |
| | |- > btrfs_delete_unused_bgs()
| | | it will decrease btrfs_space_info
| | | total_bytes and make
| | | btrfs_space_info is not full.
| | |
In this case, we may get ENOSPC error, but btrfs_space_info is not full.
To fix this issue, in btrfs_alloc_data_chunk_ondemand(), if we need to call
do_chunk_alloc() to allocating new chunk, we should block btrfs_delete_unused_bgs().
Here we introduce a new struct rw_semaphore bg_delete_sem to do this job.
Indeed there is already a "struct mutex delete_unused_bgs_mutex", but it's mutex,
we can not use it for this purpose. Of course, we can re-define it to be struct
rw_semaphore, then use it in btrfs_alloc_data_chunk_ondemand(). Either method will
work.
But given that delete_unused_bgs_mutex's name length is longer than bg_delete_sem,
I choose the first method, to create a new struct rw_semaphore bg_delete_sem and
delete delete_unused_bgs_mutex :)
Reported-by: Stefan Priebe <s.priebe@profihost.ag>
Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
Signed-off-by: David Sterba <dsterba@suse.com>
struct mutex cleaner_mutex;
struct mutex chunk_mutex;
struct mutex volume_mutex;
+ struct rw_semaphore bg_delete_sem;
/*
* this is taken to make sure we don't set block groups ro after
spinlock_t unused_bgs_lock;
struct list_head unused_bgs;
struct mutex unused_bg_unpin_mutex;
- struct mutex delete_unused_bgs_mutex;
/* For btrfs to record security options */
struct security_mnt_opts security_opts;
btrfs_run_defrag_inodes(root->fs_info);
/*
- * Acquires fs_info->delete_unused_bgs_mutex to avoid racing
- * with relocation (btrfs_relocate_chunk) and relocation
- * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
- * after acquiring fs_info->delete_unused_bgs_mutex. So we
- * can't hold, nor need to, fs_info->cleaner_mutex when deleting
- * unused block groups.
+ * Acquires fs_info->bg_delete_sem to avoid racing with
+ * relocation (btrfs_relocate_chunk) and relocation acquires
+ * fs_info->cleaner_mutex (btrfs_relocate_block_group) after
+ * acquiring fs_info->bg_delete_sem. So we can't hold, nor need
+ * to, fs_info->cleaner_mutex when deleting unused block groups.
*/
btrfs_delete_unused_bgs(root->fs_info);
sleep:
spin_lock_init(&fs_info->unused_bgs_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->unused_bg_unpin_mutex);
- mutex_init(&fs_info->delete_unused_bgs_mutex);
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
mutex_init(&fs_info->cleaner_delayed_iput_mutex);
init_rwsem(&fs_info->commit_root_sem);
init_rwsem(&fs_info->cleanup_work_sem);
init_rwsem(&fs_info->subvol_sem);
+ init_rwsem(&fs_info->bg_delete_sem);
sema_init(&fs_info->uuid_tree_rescan_sem, 1);
btrfs_init_dev_replace_locks(fs_info);
int ret = 0;
int need_commit = 2;
int have_pinned_space;
+ int have_bg_delete_sem = 0;
+ bool free_space_inode = btrfs_is_free_space_inode(inode);
/* make sure bytes are sectorsize aligned */
bytes = ALIGN(bytes, root->sectorsize);
- if (btrfs_is_free_space_inode(inode)) {
+ if (free_space_inode) {
need_commit = 0;
ASSERT(current->journal_info);
}
+ /*
+ * Here we shouldn't call down_read(bg_delete_sem) for free space inode,
+ * there is lock order between bg_delete_sem and "wait current trans
+ * finished". Meanwhile because we only do the data space reservation
+ * for free space cache in the transaction context,
+ * btrfs_delete_unused_bgs() will either have finished its job, or start
+ * a new transaction waiting current transaction to complete, there will
+ * be no unused block groups to be deleted, so it's safe to not call
+ * down_read(bg_delete_sem).
+ */
data_sinfo = fs_info->data_sinfo;
- if (!data_sinfo)
+ if (!data_sinfo) {
+ if (!free_space_inode) {
+ down_read(&root->fs_info->bg_delete_sem);
+ have_bg_delete_sem = 1;
+ }
goto alloc;
+ }
again:
/* make sure we have enough space to handle the data first */
if (used + bytes > data_sinfo->total_bytes) {
struct btrfs_trans_handle *trans;
+ /*
+ * We may need to allocate new chunk, so we should block
+ * btrfs_delete_unused_bgs()
+ */
+ if (!have_bg_delete_sem && !free_space_inode) {
+ spin_unlock(&data_sinfo->lock);
+ down_read(&root->fs_info->bg_delete_sem);
+ have_bg_delete_sem = 1;
+ goto again;
+ }
+
/*
* if we don't have enough free bytes in this space then we need
* to alloc a new chunk.
* the fs.
*/
trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
ret = do_chunk_alloc(trans, root->fs_info->extent_root,
alloc_target,
btrfs_end_transaction(trans, root);
if (ret < 0) {
if (ret != -ENOSPC)
- return ret;
+ goto out;
else {
have_pinned_space = 1;
goto commit_trans;
}
trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
if (have_pinned_space >= 0 ||
test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
&trans->transaction->flags) ||
need_commit > 0) {
ret = btrfs_commit_transaction(trans, root);
if (ret)
- return ret;
+ goto out;
/*
* The cleaner kthread might still be doing iput
* operations. Wait for it to finish so that
trace_btrfs_space_reservation(root->fs_info,
"space_info:enospc",
data_sinfo->flags, bytes, 1);
- return -ENOSPC;
+ ret = -ENOSPC;
+ goto out;
}
data_sinfo->bytes_may_use += bytes;
trace_btrfs_space_reservation(root->fs_info, "space_info",
data_sinfo->flags, bytes, 1);
spin_unlock(&data_sinfo->lock);
+out:
+ if (have_bg_delete_sem && !free_space_inode)
+ up_read(&root->fs_info->bg_delete_sem);
+
return ret;
}
}
spin_unlock(&fs_info->unused_bgs_lock);
- mutex_lock(&fs_info->delete_unused_bgs_mutex);
+ down_write(&root->fs_info->bg_delete_sem);
/* Don't want to race with allocators so take the groups_sem */
down_write(&space_info->groups_sem);
end_trans:
btrfs_end_transaction(trans, root);
next:
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ up_write(&root->fs_info->bg_delete_sem);
btrfs_put_block_group(block_group);
spin_lock(&fs_info->unused_bgs_lock);
}
* we release the path used to search the chunk/dev tree and before
* the current task acquires this mutex and calls us.
*/
- ASSERT(mutex_is_locked(&root->fs_info->delete_unused_bgs_mutex));
+ ASSERT(rwsem_is_locked(&root->fs_info->bg_delete_sem));
ret = btrfs_can_relocate(extent_root, chunk_offset);
if (ret)
key.type = BTRFS_CHUNK_ITEM_KEY;
while (1) {
- mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
+ down_read(&root->fs_info->bg_delete_sem);
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
if (ret < 0) {
- mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+ up_read(&root->fs_info->bg_delete_sem);
goto error;
}
BUG_ON(ret == 0); /* Corruption */
ret = btrfs_previous_item(chunk_root, path, key.objectid,
key.type);
if (ret)
- mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+ up_read(&root->fs_info->bg_delete_sem);
if (ret < 0)
goto error;
if (ret > 0)
else
BUG_ON(ret);
}
- mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+ up_read(&root->fs_info->bg_delete_sem);
if (found_key.offset == 0)
break;
goto error;
}
- mutex_lock(&fs_info->delete_unused_bgs_mutex);
+ down_read(&fs_info->bg_delete_sem);
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
if (ret < 0) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ up_read(&fs_info->bg_delete_sem);
goto error;
}
ret = btrfs_previous_item(chunk_root, path, 0,
BTRFS_CHUNK_ITEM_KEY);
if (ret) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ up_read(&fs_info->bg_delete_sem);
ret = 0;
break;
}
btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.objectid != key.objectid) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ up_read(&fs_info->bg_delete_sem);
break;
}
btrfs_release_path(path);
if (!ret) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ up_read(&fs_info->bg_delete_sem);
goto loop;
}
if (counting) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ up_read(&fs_info->bg_delete_sem);
spin_lock(&fs_info->balance_lock);
bctl->stat.expected++;
spin_unlock(&fs_info->balance_lock);
count_meta < bctl->meta.limit_min)
|| ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
count_sys < bctl->sys.limit_min)) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ up_read(&fs_info->bg_delete_sem);
goto loop;
}
!chunk_reserved && !bytes_used) {
trans = btrfs_start_transaction(chunk_root, 0);
if (IS_ERR(trans)) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ up_read(&fs_info->bg_delete_sem);
ret = PTR_ERR(trans);
goto error;
}
BTRFS_BLOCK_GROUP_DATA);
btrfs_end_transaction(trans, chunk_root);
if (ret < 0) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ up_read(&fs_info->bg_delete_sem);
goto error;
}
chunk_reserved = 1;
ret = btrfs_relocate_chunk(chunk_root,
found_key.offset);
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ up_read(&fs_info->bg_delete_sem);
if (ret && ret != -ENOSPC)
goto error;
if (ret == -ENOSPC) {
key.type = BTRFS_DEV_EXTENT_KEY;
do {
- mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
+ down_read(&root->fs_info->bg_delete_sem);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
- mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+ up_read(&root->fs_info->bg_delete_sem);
goto done;
}
ret = btrfs_previous_item(root, path, 0, key.type);
if (ret)
- mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+ up_read(&root->fs_info->bg_delete_sem);
if (ret < 0)
goto done;
if (ret) {
btrfs_item_key_to_cpu(l, &key, path->slots[0]);
if (key.objectid != device->devid) {
- mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+ up_read(&root->fs_info->bg_delete_sem);
btrfs_release_path(path);
break;
}
length = btrfs_dev_extent_length(l, dev_extent);
if (key.offset + length <= new_size) {
- mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+ up_read(&root->fs_info->bg_delete_sem);
btrfs_release_path(path);
break;
}
btrfs_release_path(path);
ret = btrfs_relocate_chunk(root, chunk_offset);
- mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+ up_read(&root->fs_info->bg_delete_sem);
if (ret && ret != -ENOSPC)
goto done;
if (ret == -ENOSPC)