#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
+#include <linux/vmalloc.h>
#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
struct bio_list bio_list;
spinlock_t bio_list_lock;
- /*
- * also protected by the bio_list_lock, the
- * stripe locking code uses plug_list to hand off
+ /* also protected by the bio_list_lock, the
+ * plug list is used by the plugging code
+ * to collect partial bios while plugged. The
+ * stripe locking code also uses it to hand off
* the stripe lock to the next pending IO
*/
struct list_head plug_list;
struct btrfs_stripe_hash *h;
int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
int i;
+ int table_size;
if (info->stripe_hash_table)
return 0;
- table = kzalloc(sizeof(*table) + sizeof(*h) * num_entries, GFP_NOFS);
- if (!table)
- return -ENOMEM;
+ /*
+ * The table is large, starting with order 4 and can go as high as
+ * order 7 in case lock debugging is turned on.
+ *
+ * Try harder to allocate and fallback to vmalloc to lower the chance
+ * of a failing mount.
+ */
+ table_size = sizeof(*table) + sizeof(*h) * num_entries;
+ table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+ if (!table) {
+ table = vzalloc(table_size);
+ if (!table)
+ return -ENOMEM;
+ }
spin_lock_init(&table->cache_lock);
INIT_LIST_HEAD(&table->stripe_cache);
}
x = cmpxchg(&info->stripe_hash_table, NULL, table);
- if (x)
- kfree(x);
+ if (x) {
+ if (is_vmalloc_addr(x))
+ vfree(x);
+ else
+ kfree(x);
+ }
return 0;
}
/*
* remove everything in the cache
*/
-void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
+static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
{
struct btrfs_stripe_hash_table *table;
unsigned long flags;
if (!info->stripe_hash_table)
return;
btrfs_clear_rbio_cache(info);
- kfree(info->stripe_hash_table);
+ if (is_vmalloc_addr(info->stripe_hash_table))
+ vfree(info->stripe_hash_table);
+ else
+ kfree(info->stripe_hash_table);
info->stripe_hash_table = NULL;
}
* this will try to merge into existing bios if possible, and returns
* zero if all went well.
*/
-int rbio_add_io_page(struct btrfs_raid_bio *rbio,
- struct bio_list *bio_list,
- struct page *page,
- int stripe_nr,
- unsigned long page_index,
- unsigned long bio_max_len)
+static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
+ struct bio_list *bio_list,
+ struct page *page,
+ int stripe_nr,
+ unsigned long page_index,
+ unsigned long bio_max_len)
{
struct bio *last = bio_list->tail;
u64 last_end = 0;
}
/* put a new bio on the list */
- bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
+ bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
if (!bio)
return -ENOMEM;
return full_stripe_write(rbio);
}
+/*
+ * We use plugging call backs to collect full stripes.
+ * Any time we get a partial stripe write while plugged
+ * we collect it into a list. When the unplug comes down,
+ * we sort the list by logical block number and merge
+ * everything we can into the same rbios
+ */
+struct btrfs_plug_cb {
+ struct blk_plug_cb cb;
+ struct btrfs_fs_info *info;
+ struct list_head rbio_list;
+ struct btrfs_work work;
+};
+
+/*
+ * rbios on the plug list are sorted for easier merging.
+ */
+static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+ struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
+ plug_list);
+ struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
+ plug_list);
+ u64 a_sector = ra->bio_list.head->bi_sector;
+ u64 b_sector = rb->bio_list.head->bi_sector;
+
+ if (a_sector < b_sector)
+ return -1;
+ if (a_sector > b_sector)
+ return 1;
+ return 0;
+}
+
+static void run_plug(struct btrfs_plug_cb *plug)
+{
+ struct btrfs_raid_bio *cur;
+ struct btrfs_raid_bio *last = NULL;
+
+ /*
+ * sort our plug list then try to merge
+ * everything we can in hopes of creating full
+ * stripes.
+ */
+ list_sort(NULL, &plug->rbio_list, plug_cmp);
+ while (!list_empty(&plug->rbio_list)) {
+ cur = list_entry(plug->rbio_list.next,
+ struct btrfs_raid_bio, plug_list);
+ list_del_init(&cur->plug_list);
+
+ if (rbio_is_full(cur)) {
+ /* we have a full stripe, send it down */
+ full_stripe_write(cur);
+ continue;
+ }
+ if (last) {
+ if (rbio_can_merge(last, cur)) {
+ merge_rbio(last, cur);
+ __free_raid_bio(cur);
+ continue;
+
+ }
+ __raid56_parity_write(last);
+ }
+ last = cur;
+ }
+ if (last) {
+ __raid56_parity_write(last);
+ }
+ kfree(plug);
+}
+
+/*
+ * if the unplug comes from schedule, we have to push the
+ * work off to a helper thread
+ */
+static void unplug_work(struct btrfs_work *work)
+{
+ struct btrfs_plug_cb *plug;
+ plug = container_of(work, struct btrfs_plug_cb, work);
+ run_plug(plug);
+}
+
+static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
+{
+ struct btrfs_plug_cb *plug;
+ plug = container_of(cb, struct btrfs_plug_cb, cb);
+
+ if (from_schedule) {
+ plug->work.flags = 0;
+ plug->work.func = unplug_work;
+ btrfs_queue_worker(&plug->info->rmw_workers,
+ &plug->work);
+ return;
+ }
+ run_plug(plug);
+}
+
/*
* our main entry point for writes from the rest of the FS.
*/
u64 stripe_len)
{
struct btrfs_raid_bio *rbio;
+ struct btrfs_plug_cb *plug = NULL;
+ struct blk_plug_cb *cb;
rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
if (IS_ERR(rbio)) {
}
bio_list_add(&rbio->bio_list, bio);
rbio->bio_list_bytes = bio->bi_size;
- return __raid56_parity_write(rbio);
+
+ /*
+ * don't plug on full rbios, just get them out the door
+ * as quickly as we can
+ */
+ if (rbio_is_full(rbio))
+ return full_stripe_write(rbio);
+
+ cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
+ sizeof(*plug));
+ if (cb) {
+ plug = container_of(cb, struct btrfs_plug_cb, cb);
+ if (!plug->info) {
+ plug->info = root->fs_info;
+ INIT_LIST_HEAD(&plug->rbio_list);
+ }
+ list_add_tail(&rbio->plug_list, &plug->rbio_list);
+ } else {
+ return __raid56_parity_write(rbio);
+ }
+ return 0;
}
/*