rbd: lay out header probe infrastructure
[deliverable/linux.git] / drivers / block / rbd.c
CommitLineData
602adf40
YS
1/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
dfc5606d 24 For usage instructions, please refer to:
602adf40 25
dfc5606d 26 Documentation/ABI/testing/sysfs-bus-rbd
602adf40
YS
27
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
59c2be1e 34#include <linux/parser.h>
602adf40
YS
35
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
aafb230e
AE
44#define RBD_DEBUG /* Activate rbd_assert() calls */
45
593a9e7b
AE
46/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
df111be6
AE
55/* It might be useful to have this defined elsewhere too */
56
57#define U64_MAX ((u64) (~0ULL))
58
f0f8cef5
AE
59#define RBD_DRV_NAME "rbd"
60#define RBD_DRV_NAME_LONG "rbd (rados block device)"
602adf40
YS
61
62#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
63
602adf40
YS
64#define RBD_MAX_SNAP_NAME_LEN 32
65#define RBD_MAX_OPT_LEN 1024
66
67#define RBD_SNAP_HEAD_NAME "-"
68
589d30e0
AE
69#define RBD_IMAGE_ID_LEN_MAX 64
70
81a89793
AE
71/*
72 * An RBD device name will be "rbd#", where the "rbd" comes from
73 * RBD_DRV_NAME above, and # is a unique integer identifier.
74 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
75 * enough to hold all possible device names.
76 */
602adf40 77#define DEV_NAME_LEN 32
81a89793 78#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
602adf40 79
cc0538b6 80#define RBD_READ_ONLY_DEFAULT false
59c2be1e 81
602adf40
YS
82/*
83 * block device image metadata (in-memory version)
84 */
85struct rbd_image_header {
f84344f3 86 /* These four fields never change for a given rbd image */
849b4260 87 char *object_prefix;
34b13184 88 u64 features;
602adf40
YS
89 __u8 obj_order;
90 __u8 crypt_type;
91 __u8 comp_type;
602adf40 92
f84344f3
AE
93 /* The remaining fields need to be updated occasionally */
94 u64 image_size;
95 struct ceph_snap_context *snapc;
602adf40
YS
96 char *snap_names;
97 u64 *snap_sizes;
59c2be1e
YS
98
99 u64 obj_version;
100};
101
102struct rbd_options {
cc0538b6 103 bool read_only;
602adf40
YS
104};
105
106/*
f0f8cef5 107 * an instance of the client. multiple devices may share an rbd client.
602adf40
YS
108 */
109struct rbd_client {
110 struct ceph_client *client;
111 struct kref kref;
112 struct list_head node;
113};
114
115/*
f0f8cef5 116 * a request completion status
602adf40 117 */
1fec7093
YS
118struct rbd_req_status {
119 int done;
120 int rc;
121 u64 bytes;
122};
123
124/*
125 * a collection of requests
126 */
127struct rbd_req_coll {
128 int total;
129 int num_done;
130 struct kref kref;
131 struct rbd_req_status status[0];
602adf40
YS
132};
133
f0f8cef5
AE
134/*
135 * a single io request
136 */
137struct rbd_request {
138 struct request *rq; /* blk layer request */
139 struct bio *bio; /* cloned bio */
140 struct page **pages; /* list of used pages */
141 u64 len;
142 int coll_index;
143 struct rbd_req_coll *coll;
144};
145
dfc5606d
YS
146struct rbd_snap {
147 struct device dev;
148 const char *name;
3591538f 149 u64 size;
dfc5606d
YS
150 struct list_head node;
151 u64 id;
34b13184 152 u64 features;
dfc5606d
YS
153};
154
f84344f3
AE
155struct rbd_mapping {
156 char *snap_name;
157 u64 snap_id;
99c1f08f 158 u64 size;
34b13184 159 u64 features;
f84344f3
AE
160 bool snap_exists;
161 bool read_only;
162};
163
602adf40
YS
164/*
165 * a single device
166 */
167struct rbd_device {
de71a297 168 int dev_id; /* blkdev unique id */
602adf40
YS
169
170 int major; /* blkdev assigned major */
171 struct gendisk *disk; /* blkdev's gendisk and rq */
602adf40 172
a30b71b9 173 u32 image_format; /* Either 1 or 2 */
f8c38929 174 struct rbd_options rbd_opts;
602adf40
YS
175 struct rbd_client *rbd_client;
176
177 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
178
179 spinlock_t lock; /* queue lock */
180
181 struct rbd_image_header header;
589d30e0
AE
182 char *image_id;
183 size_t image_id_len;
0bed54dc
AE
184 char *image_name;
185 size_t image_name_len;
186 char *header_name;
d22f76e7 187 char *pool_name;
9bb2f334 188 int pool_id;
602adf40 189
59c2be1e
YS
190 struct ceph_osd_event *watch_event;
191 struct ceph_osd_request *watch_request;
192
c666601a
JD
193 /* protects updating the header */
194 struct rw_semaphore header_rwsem;
f84344f3
AE
195
196 struct rbd_mapping mapping;
602adf40
YS
197
198 struct list_head node;
dfc5606d
YS
199
200 /* list of snapshots */
201 struct list_head snaps;
202
203 /* sysfs related */
204 struct device dev;
205};
206
602adf40 207static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
e124a82f 208
602adf40 209static LIST_HEAD(rbd_dev_list); /* devices */
e124a82f
AE
210static DEFINE_SPINLOCK(rbd_dev_list_lock);
211
432b8587
AE
212static LIST_HEAD(rbd_client_list); /* clients */
213static DEFINE_SPINLOCK(rbd_client_list_lock);
602adf40 214
304f6808
AE
215static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
216static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
217
dfc5606d 218static void rbd_dev_release(struct device *dev);
14e7085d 219static void __rbd_remove_snap_dev(struct rbd_snap *snap);
dfc5606d 220
f0f8cef5
AE
221static ssize_t rbd_add(struct bus_type *bus, const char *buf,
222 size_t count);
223static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
224 size_t count);
225
226static struct bus_attribute rbd_bus_attrs[] = {
227 __ATTR(add, S_IWUSR, NULL, rbd_add),
228 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
229 __ATTR_NULL
230};
231
232static struct bus_type rbd_bus_type = {
233 .name = "rbd",
234 .bus_attrs = rbd_bus_attrs,
235};
236
/* Release callback for rbd_root_dev.  Nothing to free (the device is
 * statically allocated), but the driver core requires a non-NULL
 * release method. */
static void rbd_root_dev_release(struct device *dev)
{
}

/* Root device that all rbd devices are parented under in sysfs. */
static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};
245
aafb230e
AE
/* rbd_assert(expr): log and BUG() when expr is false.  Compiles to a
 * no-op unless RBD_DEBUG is defined (it is, near the top of this file). */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
dfc5606d 258
dfc5606d
YS
/* Take a reference on the rbd device's embedded struct device;
 * paired with rbd_put_dev(). */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}

/* Drop the reference taken by rbd_get_dev(). */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}
602adf40 268
1fe5e993 269static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
59c2be1e 270
602adf40
YS
/*
 * Open the rbd block device.  Refuses a write open of a read-only
 * mapping; otherwise pins the device and propagates the mapping's
 * read-only flag to the block layer.
 */
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	rbd_get_dev(rbd_dev);
	set_device_ro(bdev, rbd_dev->mapping.read_only);

	return 0;
}
283
dfc5606d
YS
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;

	/* Drop the reference taken in rbd_open() */
	rbd_put_dev(rbd_dev);

	return 0;
}
292
602adf40
YS
/* Block device operations; rbd only needs open/release, all I/O goes
 * through the request queue. */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
298
299/*
300 * Initialize an rbd client instance.
43ae4701 301 * We own *ceph_opts.
602adf40 302 */
f8c38929 303static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
602adf40
YS
304{
305 struct rbd_client *rbdc;
306 int ret = -ENOMEM;
307
308 dout("rbd_client_create\n");
309 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
310 if (!rbdc)
311 goto out_opt;
312
313 kref_init(&rbdc->kref);
314 INIT_LIST_HEAD(&rbdc->node);
315
bc534d86
AE
316 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
317
43ae4701 318 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
602adf40 319 if (IS_ERR(rbdc->client))
bc534d86 320 goto out_mutex;
43ae4701 321 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
602adf40
YS
322
323 ret = ceph_open_session(rbdc->client);
324 if (ret < 0)
325 goto out_err;
326
432b8587 327 spin_lock(&rbd_client_list_lock);
602adf40 328 list_add_tail(&rbdc->node, &rbd_client_list);
432b8587 329 spin_unlock(&rbd_client_list_lock);
602adf40 330
bc534d86
AE
331 mutex_unlock(&ctl_mutex);
332
602adf40
YS
333 dout("rbd_client_create created %p\n", rbdc);
334 return rbdc;
335
336out_err:
337 ceph_destroy_client(rbdc->client);
bc534d86
AE
338out_mutex:
339 mutex_unlock(&ctl_mutex);
602adf40
YS
340 kfree(rbdc);
341out_opt:
43ae4701
AE
342 if (ceph_opts)
343 ceph_destroy_options(ceph_opts);
28f259b7 344 return ERR_PTR(ret);
602adf40
YS
345}
346
/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.  Returns NULL when no match
 * exists or when sharing is disabled for these options.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	/* A NOSHARE client never matches an existing one */
	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			/* Grab the reference while the list lock is held */
			kref_get(&client_node->kref);
			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}
371
59c2be1e
YS
/*
 * mount options
 *
 * Token values are grouped by argument class; parse_rbd_opts_token()
 * classifies a token by comparing it against the Opt_last_* markers.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};
385
43ae4701 386static match_table_t rbd_opts_tokens = {
59c2be1e
YS
387 /* int args above */
388 /* string args above */
f84344f3 389 {Opt_read_only, "mapping.read_only"},
cc0538b6
AE
390 {Opt_read_only, "ro"}, /* Alternate spelling */
391 {Opt_read_write, "read_write"},
392 {Opt_read_write, "rw"}, /* Alternate spelling */
393 /* Boolean args above */
59c2be1e
YS
394 {-1, NULL}
395};
396
/*
 * Parse one rbd mount option.  @c is the option text; @private is
 * the struct rbd_options being filled in (passed through from
 * ceph_parse_options()).  Returns 0 on success, -EINVAL for an
 * unrecognized token, or the match_int() error for a bad int arg.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* Decode the argument according to the token's class */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* Every token in rbd_opts_tokens must be handled above */
		rbd_assert(false);
		break;
	}
	return 0;
}
437
602adf40
YS
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  On success rbd_dev->rbd_client holds a
 * reference and rbd_dev->rbd_opts has been filled in from @options.
 */
static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
			  size_t mon_addr_len, char *options)
{
	struct rbd_options *rbd_opts = &rbd_dev->rbd_opts;
	struct ceph_options *ceph_opts;
	struct rbd_client *rbdc;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	/* ceph_parse_options() calls back into parse_rbd_opts_token()
	 * for each rbd-specific option it doesn't recognize */
	ceph_opts = ceph_parse_options(options, mon_addr,
					mon_addr + mon_addr_len,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(ceph_opts))
		return PTR_ERR(ceph_opts);

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		/* using an existing client */
		ceph_destroy_options(ceph_opts);
	} else {
		/* rbd_client_create() takes ownership of ceph_opts */
		rbdc = rbd_client_create(ceph_opts);
		if (IS_ERR(rbdc))
			return PTR_ERR(rbdc);
	}
	rbd_dev->rbd_client = rbdc;

	return 0;
}
470
/*
 * Destroy ceph client.  kref release callback: unlinks the client
 * from rbd_client_list (taking rbd_client_list_lock itself -- the
 * old "caller must hold the lock" rule no longer applies) and frees
 * everything.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
488
/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_device *rbd_dev)
{
	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
	/* Clear the pointer so a stale reference can't be used */
	rbd_dev->rbd_client = NULL;
}
498
1fec7093
YS
/*
 * Destroy requests collection
 *
 * kref release callback, invoked when the last sub-request of the
 * collection drops its reference.
 */
static void rbd_coll_release(struct kref *kref)
{
	struct rbd_req_coll *coll =
		container_of(kref, struct rbd_req_coll, kref);

	dout("rbd_coll_release %p\n", coll);
	kfree(coll);
}
602adf40 510
a30b71b9
AE
511static bool rbd_image_format_valid(u32 image_format)
512{
513 return image_format == 1 || image_format == 2;
514}
515
8e94af8e
AE
/*
 * Sanity-check a v1 on-disk image header before translating it:
 * verifies the magic text and that the snapshot metadata sizes are
 * representable in a size_t.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire the snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
544
602adf40
YS
545/*
546 * Create a new header structure, translate header format from the on-disk
547 * header.
548 */
549static int rbd_header_from_disk(struct rbd_image_header *header,
4156d998 550 struct rbd_image_header_ondisk *ondisk)
602adf40 551{
ccece235 552 u32 snap_count;
58c17b0e 553 size_t len;
d2bb24e5 554 size_t size;
621901d6 555 u32 i;
602adf40 556
6a52325f
AE
557 memset(header, 0, sizeof (*header));
558
103a150f
AE
559 snap_count = le32_to_cpu(ondisk->snap_count);
560
58c17b0e
AE
561 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
562 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
6a52325f 563 if (!header->object_prefix)
602adf40 564 return -ENOMEM;
58c17b0e
AE
565 memcpy(header->object_prefix, ondisk->object_prefix, len);
566 header->object_prefix[len] = '\0';
00f1f36f 567
602adf40 568 if (snap_count) {
f785cc1d
AE
569 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
570
621901d6
AE
571 /* Save a copy of the snapshot names */
572
f785cc1d
AE
573 if (snap_names_len > (u64) SIZE_MAX)
574 return -EIO;
575 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
602adf40 576 if (!header->snap_names)
6a52325f 577 goto out_err;
f785cc1d
AE
578 /*
579 * Note that rbd_dev_v1_header_read() guarantees
580 * the ondisk buffer we're working with has
581 * snap_names_len bytes beyond the end of the
582 * snapshot id array, this memcpy() is safe.
583 */
584 memcpy(header->snap_names, &ondisk->snaps[snap_count],
585 snap_names_len);
6a52325f 586
621901d6
AE
587 /* Record each snapshot's size */
588
d2bb24e5
AE
589 size = snap_count * sizeof (*header->snap_sizes);
590 header->snap_sizes = kmalloc(size, GFP_KERNEL);
602adf40 591 if (!header->snap_sizes)
6a52325f 592 goto out_err;
621901d6
AE
593 for (i = 0; i < snap_count; i++)
594 header->snap_sizes[i] =
595 le64_to_cpu(ondisk->snaps[i].image_size);
602adf40 596 } else {
ccece235 597 WARN_ON(ondisk->snap_names_len);
602adf40
YS
598 header->snap_names = NULL;
599 header->snap_sizes = NULL;
600 }
849b4260 601
34b13184 602 header->features = 0; /* No features support in v1 images */
602adf40
YS
603 header->obj_order = ondisk->options.order;
604 header->crypt_type = ondisk->options.crypt_type;
605 header->comp_type = ondisk->options.comp_type;
6a52325f 606
621901d6
AE
607 /* Allocate and fill in the snapshot context */
608
f84344f3 609 header->image_size = le64_to_cpu(ondisk->image_size);
6a52325f
AE
610 size = sizeof (struct ceph_snap_context);
611 size += snap_count * sizeof (header->snapc->snaps[0]);
612 header->snapc = kzalloc(size, GFP_KERNEL);
613 if (!header->snapc)
614 goto out_err;
602adf40
YS
615
616 atomic_set(&header->snapc->nref, 1);
505cbb9b 617 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
602adf40 618 header->snapc->num_snaps = snap_count;
621901d6
AE
619 for (i = 0; i < snap_count; i++)
620 header->snapc->snaps[i] =
621 le64_to_cpu(ondisk->snaps[i].id);
602adf40
YS
622
623 return 0;
624
6a52325f 625out_err:
849b4260 626 kfree(header->snap_sizes);
ccece235 627 header->snap_sizes = NULL;
602adf40 628 kfree(header->snap_names);
ccece235 629 header->snap_names = NULL;
6a52325f
AE
630 kfree(header->object_prefix);
631 header->object_prefix = NULL;
ccece235 632
00f1f36f 633 return -ENOMEM;
602adf40
YS
634}
635
/*
 * Look up the named snapshot in the device's snapshot list and, if
 * found, record its id, size and features in the device mapping.
 * Returns 0 on success or -ENOENT if no snapshot has that name.
 */
static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
{

	struct rbd_snap *snap;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		if (!strcmp(snap_name, snap->name)) {
			rbd_dev->mapping.snap_id = snap->id;
			rbd_dev->mapping.size = snap->size;
			rbd_dev->mapping.features = snap->features;

			return 0;
		}
	}

	return -ENOENT;
}
653
/*
 * Fill in rbd_dev->mapping for the given snapshot name.  The special
 * name RBD_SNAP_HEAD_NAME ("-") maps the image head, writable unless
 * the read_only option was given; mapping a real snapshot is always
 * read-only.
 */
static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name)
{
	int ret;

	if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		rbd_dev->mapping.snap_id = CEPH_NOSNAP;
		rbd_dev->mapping.size = rbd_dev->header.image_size;
		rbd_dev->mapping.features = rbd_dev->header.features;
		rbd_dev->mapping.snap_exists = false;
		rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only;
		ret = 0;
	} else {
		ret = snap_by_name(rbd_dev, snap_name);
		if (ret < 0)
			goto done;
		rbd_dev->mapping.snap_exists = true;
		rbd_dev->mapping.read_only = true;
	}
	/* The mapping keeps a pointer to (not a copy of) snap_name */
	rbd_dev->mapping.snap_name = snap_name;
done:
	return ret;
}
677
678static void rbd_header_free(struct rbd_image_header *header)
679{
849b4260 680 kfree(header->object_prefix);
d78fd7ae 681 header->object_prefix = NULL;
602adf40 682 kfree(header->snap_sizes);
d78fd7ae 683 header->snap_sizes = NULL;
849b4260 684 kfree(header->snap_names);
d78fd7ae 685 header->snap_names = NULL;
d1d25646 686 ceph_put_snap_context(header->snapc);
d78fd7ae 687 header->snapc = NULL;
602adf40
YS
688}
689
/*
 * Build the name of the object backing the given byte offset, as
 * "<object_prefix>.<segment number, 12 hex digits>".  The caller
 * must kfree() the result.  Returns NULL on allocation failure or
 * if the formatted name would not fit.
 */
static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
{
	char *name;
	u64 segment;
	int ret;

	name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
	if (!name)
		return NULL;
	segment = offset >> rbd_dev->header.obj_order;
	ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
			rbd_dev->header.object_prefix, segment);
	if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
		pr_err("error formatting segment name for #%llu (%d)\n",
			segment, ret);
		kfree(name);
		name = NULL;
	}

	return name;
}
602adf40 711
65ccfe21
AE
712static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
713{
714 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
602adf40 715
65ccfe21
AE
716 return offset & (segment_size - 1);
717}
718
/*
 * Clamp a transfer of @length bytes starting at image offset @offset
 * so it does not cross the boundary of the object containing @offset.
 */
static u64 rbd_segment_length(struct rbd_device *rbd_dev,
				u64 offset, u64 length)
{
	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;

	offset &= segment_size - 1;

	rbd_assert(length <= U64_MAX - offset);
	if (offset + length > segment_size)
		length = segment_size - offset;

	return length;
}
732
1fec7093
YS
/*
 * Number of objects spanned by the byte range [ofs, ofs + len).
 * Returns 0 for an empty range, -ERANGE if ofs + len would overflow
 * a u64.
 */
static int rbd_get_num_segments(struct rbd_image_header *header,
				u64 ofs, u64 len)
{
	u64 start_seg;
	u64 end_seg;

	if (!len)
		return 0;
	if (len - 1 > U64_MAX - ofs)
		return -ERANGE;

	start_seg = ofs >> header->obj_order;
	end_seg = (ofs + len - 1) >> header->obj_order;

	/* NOTE(review): the u64 segment count is truncated to int here;
	 * presumably callers bound len well below INT_MAX segments --
	 * confirm. */
	return end_seg - start_seg + 1;
}
749
029bcbd8
JD
750/*
751 * returns the size of an object in the image
752 */
753static u64 rbd_obj_bytes(struct rbd_image_header *header)
754{
755 return 1 << header->obj_order;
756}
757
602adf40
YS
758/*
759 * bio helpers
760 */
761
762static void bio_chain_put(struct bio *chain)
763{
764 struct bio *tmp;
765
766 while (chain) {
767 tmp = chain;
768 chain = chain->bi_next;
769 bio_put(tmp);
770 }
771}
772
/*
 * zeros a bio chain, starting at specific offset
 *
 * Every byte at or beyond start_ofs (counted from the beginning of
 * the chain) is cleared; segments entirely before start_ofs are left
 * untouched.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* Zero the part of this segment that
				 * lies at or beyond start_ofs */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
799
800/*
801 * bio_chain_clone - clone a chain of bios up to a certain length.
802 * might return a bio_pair that will need to be released.
803 */
804static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
805 struct bio_pair **bp,
806 int len, gfp_t gfpmask)
807{
542582fc
AE
808 struct bio *old_chain = *old;
809 struct bio *new_chain = NULL;
810 struct bio *tail;
602adf40
YS
811 int total = 0;
812
813 if (*bp) {
814 bio_pair_release(*bp);
815 *bp = NULL;
816 }
817
818 while (old_chain && (total < len)) {
542582fc
AE
819 struct bio *tmp;
820
602adf40
YS
821 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
822 if (!tmp)
823 goto err_out;
542582fc 824 gfpmask &= ~__GFP_WAIT; /* can't wait after the first */
602adf40
YS
825
826 if (total + old_chain->bi_size > len) {
827 struct bio_pair *bp;
828
829 /*
830 * this split can only happen with a single paged bio,
831 * split_bio will BUG_ON if this is not the case
832 */
833 dout("bio_chain_clone split! total=%d remaining=%d"
bd919d45
AE
834 "bi_size=%u\n",
835 total, len - total, old_chain->bi_size);
602adf40
YS
836
837 /* split the bio. We'll release it either in the next
838 call, or it will have to be released outside */
593a9e7b 839 bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
602adf40
YS
840 if (!bp)
841 goto err_out;
842
843 __bio_clone(tmp, &bp->bio1);
844
845 *next = &bp->bio2;
846 } else {
847 __bio_clone(tmp, old_chain);
848 *next = old_chain->bi_next;
849 }
850
851 tmp->bi_bdev = NULL;
602adf40 852 tmp->bi_next = NULL;
542582fc 853 if (new_chain)
602adf40 854 tail->bi_next = tmp;
542582fc
AE
855 else
856 new_chain = tmp;
857 tail = tmp;
602adf40
YS
858 old_chain = old_chain->bi_next;
859
860 total += tmp->bi_size;
861 }
862
aafb230e 863 rbd_assert(total == len);
602adf40 864
602adf40
YS
865 *old = old_chain;
866
867 return new_chain;
868
869err_out:
870 dout("bio_chain_clone with err\n");
871 bio_chain_put(new_chain);
872 return NULL;
873}
874
875/*
876 * helpers for osd request op vectors.
877 */
57cfc106
AE
878static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
879 int opcode, u32 payload_len)
602adf40 880{
57cfc106
AE
881 struct ceph_osd_req_op *ops;
882
883 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
884 if (!ops)
885 return NULL;
886
887 ops[0].op = opcode;
888
602adf40
YS
889 /*
890 * op extent offset and length will be set later on
891 * in calc_raw_layout()
892 */
57cfc106
AE
893 ops[0].payload_len = payload_len;
894
895 return ops;
602adf40
YS
896}
897
/* Free an op vector allocated by rbd_create_rw_ops(). */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
902
1fec7093
YS
/*
 * Complete one sub-request of a collection.  With no collection the
 * block request is ended directly.  Otherwise status slot @index is
 * recorded and the contiguous run of finished sub-requests starting
 * at num_done is retired in order, dropping one collection reference
 * per retired entry.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, ret, (unsigned long long) len);

	if (!rq)
		return;

	if (!coll) {
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	/* queue_lock serializes status[] updates with completion */
	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	max = min = coll->num_done;
	while (max < coll->total && coll->status[max].done)
		max++;

	for (i = min; i < max; i++) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
940
/* Complete @req within its own collection slot. */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
946
602adf40
YS
/*
 * Send ceph osd request
 *
 * Builds and submits an OSD request against @object_name for the
 * byte range [ofs, ofs+len).  With @rbd_cb the request completes
 * asynchronously via that callback; without one we wait for
 * completion, report the reassert version through @ver, and drop our
 * request reference.  When @linger_req is non-NULL the request is
 * registered as lingering (used for watch) and returned through it.
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *ops,
			  struct rbd_req_coll *coll,
			  int coll_index,
			  void (*rbd_cb)(struct ceph_osd_request *req,
					 struct ceph_msg *msg),
			  struct ceph_osd_request **linger_req,
			  u64 *ver)
{
	struct ceph_osd_request *req;
	struct ceph_file_layout *layout;
	int ret;
	u64 bno;
	struct timespec mtime = CURRENT_TIME;
	struct rbd_request *req_data;
	struct ceph_osd_request_head *reqhead;
	struct ceph_osd_client *osdc;

	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
	if (!req_data) {
		/* Still report completion to the collection on failure */
		if (coll)
			rbd_coll_end_req_index(rq, coll, coll_index,
					       -ENOMEM, len);
		return -ENOMEM;
	}

	if (coll) {
		req_data->coll = coll;
		req_data->coll_index = coll_index;
	}

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
	     (unsigned long long) ofs, (unsigned long long) len);

	osdc = &rbd_dev->rbd_client->client->osdc;
	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
				      false, GFP_NOIO, pages, bio);
	if (!req) {
		ret = -ENOMEM;
		goto done_pages;
	}

	req->r_callback = rbd_cb;

	req_data->rq = rq;
	req_data->bio = bio;
	req_data->pages = pages;
	req_data->len = len;

	req->r_priv = req_data;

	reqhead = req->r_request->front.iov_base;
	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);

	/* NOTE(review): strncpy() does not NUL-terminate r_oid when
	 * object_name fills it exactly, and the strlen() below assumes
	 * termination -- confirm callers keep object names shorter
	 * than sizeof(req->r_oid). */
	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
	req->r_oid_len = strlen(req->r_oid);

	/* Each rbd object is a single striped unit of its own */
	layout = &req->r_file_layout;
	memset(layout, 0, sizeof(*layout));
	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_stripe_count = cpu_to_le32(1);
	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
	ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
			     req, ops);

	ceph_osdc_build_request(req, ofs, &len,
				ops,
				snapc,
				&mtime,
				req->r_oid, req->r_oid_len);

	if (linger_req) {
		ceph_osdc_set_request_linger(osdc, req);
		*linger_req = req;
	}

	ret = ceph_osdc_start_request(osdc, req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* Synchronous path: wait, then drop our reference */
		ret = ceph_osdc_wait_request(osdc, req);
		if (ver)
			*ver = le64_to_cpu(req->r_reassert_version.version);
		dout("reassert_ver=%llu\n",
		     (unsigned long long)
		     le64_to_cpu(req->r_reassert_version.version));
		ceph_osdc_put_request(req);
	}
	return ret;

done_err:
	bio_chain_put(req_data->bio);
	ceph_osdc_put_request(req);
done_pages:
	rbd_coll_end_req(req_data, ret, len);
	kfree(req_data);
	return ret;
}
1058
/*
 * Ceph osd op callback
 *
 * Parses the reply, zero-fills read data for nonexistent objects or
 * short reads, completes the request's collection slot, and releases
 * the request and its private data.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
	     (unsigned long long) bytes, read_op, (int) rc);

	/* A read of a hole (object not yet written) returns zeros;
	 * a short read has its tail zero-filled */
	if (rc == -ENOENT && read_op) {
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1098
59c2be1e
YS
/* Minimal completion callback: just drop the request reference. */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1103
602adf40
YS
/*
 * Do a synchronous ceph osd operation
 *
 * For read operations, up to @inbound_size bytes of reply data are
 * copied into @inbound.  Returns the byte count from the operation
 * (the copy result for reads) or a negative errno.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int flags,
			   struct ceph_osd_req_op *ops,
			   const char *object_name,
			   u64 ofs, u64 inbound_size,
			   char *inbound,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;

	rbd_assert(ops != NULL);

	num_pages = calc_pages_for(ofs, inbound_size);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	/* No callback, so rbd_do_request() waits for completion */
	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
			     object_name, ofs, inbound_size, NULL,
			     pages, num_pages,
			     flags,
			     ops,
			     NULL, 0,
			     NULL,
			     linger_req, ver);
	if (ret < 0)
		goto done;

	if ((flags & CEPH_OSD_FLAG_READ) && inbound)
		ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);

done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1147
1148/*
1149 * Do an asynchronous ceph osd operation
1150 */
1151static int rbd_do_op(struct request *rq,
0ce1a794 1152 struct rbd_device *rbd_dev,
602adf40
YS
1153 struct ceph_snap_context *snapc,
1154 u64 snapid,
d1f57ea6 1155 int opcode, int flags,
602adf40 1156 u64 ofs, u64 len,
1fec7093
YS
1157 struct bio *bio,
1158 struct rbd_req_coll *coll,
1159 int coll_index)
602adf40
YS
1160{
1161 char *seg_name;
1162 u64 seg_ofs;
1163 u64 seg_len;
1164 int ret;
1165 struct ceph_osd_req_op *ops;
1166 u32 payload_len;
1167
65ccfe21 1168 seg_name = rbd_segment_name(rbd_dev, ofs);
602adf40
YS
1169 if (!seg_name)
1170 return -ENOMEM;
65ccfe21
AE
1171 seg_len = rbd_segment_length(rbd_dev, ofs, len);
1172 seg_ofs = rbd_segment_offset(rbd_dev, ofs);
602adf40
YS
1173
1174 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
1175
57cfc106
AE
1176 ret = -ENOMEM;
1177 ops = rbd_create_rw_ops(1, opcode, payload_len);
1178 if (!ops)
602adf40
YS
1179 goto done;
1180
1181 /* we've taken care of segment sizes earlier when we
1182 cloned the bios. We should never have a segment
1183 truncated at this point */
aafb230e 1184 rbd_assert(seg_len == len);
602adf40
YS
1185
1186 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1187 seg_name, seg_ofs, seg_len,
1188 bio,
1189 NULL, 0,
1190 flags,
1191 ops,
1fec7093 1192 coll, coll_index,
59c2be1e 1193 rbd_req_cb, 0, NULL);
11f77002
SW
1194
1195 rbd_destroy_ops(ops);
602adf40
YS
1196done:
1197 kfree(seg_name);
1198 return ret;
1199}
1200
1201/*
1202 * Request async osd write
1203 */
1204static int rbd_req_write(struct request *rq,
1205 struct rbd_device *rbd_dev,
1206 struct ceph_snap_context *snapc,
1207 u64 ofs, u64 len,
1fec7093
YS
1208 struct bio *bio,
1209 struct rbd_req_coll *coll,
1210 int coll_index)
602adf40
YS
1211{
1212 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1213 CEPH_OSD_OP_WRITE,
1214 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1fec7093 1215 ofs, len, bio, coll, coll_index);
602adf40
YS
1216}
1217
1218/*
1219 * Request async osd read
1220 */
1221static int rbd_req_read(struct request *rq,
1222 struct rbd_device *rbd_dev,
1223 u64 snapid,
1224 u64 ofs, u64 len,
1fec7093
YS
1225 struct bio *bio,
1226 struct rbd_req_coll *coll,
1227 int coll_index)
602adf40
YS
1228{
1229 return rbd_do_op(rq, rbd_dev, NULL,
b06e6a6b 1230 snapid,
602adf40
YS
1231 CEPH_OSD_OP_READ,
1232 CEPH_OSD_FLAG_READ,
1fec7093 1233 ofs, len, bio, coll, coll_index);
602adf40
YS
1234}
1235
1236/*
1237 * Request sync osd read
1238 */
0ce1a794 1239static int rbd_req_sync_read(struct rbd_device *rbd_dev,
602adf40 1240 u64 snapid,
aded07ea 1241 const char *object_name,
602adf40 1242 u64 ofs, u64 len,
59c2be1e
YS
1243 char *buf,
1244 u64 *ver)
602adf40 1245{
913d2fdc
AE
1246 struct ceph_osd_req_op *ops;
1247 int ret;
1248
1249 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1250 if (!ops)
1251 return -ENOMEM;
1252
1253 ret = rbd_req_sync_op(rbd_dev, NULL,
b06e6a6b 1254 snapid,
602adf40 1255 CEPH_OSD_FLAG_READ,
913d2fdc
AE
1256 ops, object_name, ofs, len, buf, NULL, ver);
1257 rbd_destroy_ops(ops);
1258
1259 return ret;
602adf40
YS
1260}
1261
1262/*
59c2be1e
YS
1263 * Request sync osd watch
1264 */
0ce1a794 1265static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
59c2be1e 1266 u64 ver,
7f0a24d8 1267 u64 notify_id)
59c2be1e
YS
1268{
1269 struct ceph_osd_req_op *ops;
11f77002
SW
1270 int ret;
1271
57cfc106
AE
1272 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
1273 if (!ops)
1274 return -ENOMEM;
59c2be1e 1275
a71b891b 1276 ops[0].watch.ver = cpu_to_le64(ver);
59c2be1e
YS
1277 ops[0].watch.cookie = notify_id;
1278 ops[0].watch.flag = 0;
1279
0ce1a794 1280 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
7f0a24d8 1281 rbd_dev->header_name, 0, 0, NULL,
ad4f232f 1282 NULL, 0,
59c2be1e
YS
1283 CEPH_OSD_FLAG_READ,
1284 ops,
1fec7093 1285 NULL, 0,
59c2be1e
YS
1286 rbd_simple_req_cb, 0, NULL);
1287
1288 rbd_destroy_ops(ops);
1289 return ret;
1290}
1291
1292static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1293{
0ce1a794 1294 struct rbd_device *rbd_dev = (struct rbd_device *)data;
a71b891b 1295 u64 hver;
13143d2d
SW
1296 int rc;
1297
0ce1a794 1298 if (!rbd_dev)
59c2be1e
YS
1299 return;
1300
bd919d45
AE
1301 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1302 rbd_dev->header_name, (unsigned long long) notify_id,
1303 (unsigned int) opcode);
1fe5e993 1304 rc = rbd_refresh_header(rbd_dev, &hver);
13143d2d 1305 if (rc)
f0f8cef5 1306 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
0ce1a794 1307 " update snaps: %d\n", rbd_dev->major, rc);
59c2be1e 1308
7f0a24d8 1309 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
59c2be1e
YS
1310}
1311
1312/*
1313 * Request sync osd watch
1314 */
0e6f322d 1315static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
59c2be1e
YS
1316{
1317 struct ceph_osd_req_op *ops;
0ce1a794 1318 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
57cfc106 1319 int ret;
59c2be1e 1320
57cfc106
AE
1321 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1322 if (!ops)
1323 return -ENOMEM;
59c2be1e
YS
1324
1325 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
0ce1a794 1326 (void *)rbd_dev, &rbd_dev->watch_event);
59c2be1e
YS
1327 if (ret < 0)
1328 goto fail;
1329
0e6f322d 1330 ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
0ce1a794 1331 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
59c2be1e
YS
1332 ops[0].watch.flag = 1;
1333
0ce1a794 1334 ret = rbd_req_sync_op(rbd_dev, NULL,
59c2be1e 1335 CEPH_NOSNAP,
59c2be1e
YS
1336 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1337 ops,
0e6f322d
AE
1338 rbd_dev->header_name,
1339 0, 0, NULL,
0ce1a794 1340 &rbd_dev->watch_request, NULL);
59c2be1e
YS
1341
1342 if (ret < 0)
1343 goto fail_event;
1344
1345 rbd_destroy_ops(ops);
1346 return 0;
1347
1348fail_event:
0ce1a794
AE
1349 ceph_osdc_cancel_event(rbd_dev->watch_event);
1350 rbd_dev->watch_event = NULL;
59c2be1e
YS
1351fail:
1352 rbd_destroy_ops(ops);
1353 return ret;
1354}
1355
79e3057c
YS
1356/*
1357 * Request sync osd unwatch
1358 */
070c633f 1359static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
79e3057c
YS
1360{
1361 struct ceph_osd_req_op *ops;
57cfc106 1362 int ret;
79e3057c 1363
57cfc106
AE
1364 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1365 if (!ops)
1366 return -ENOMEM;
79e3057c
YS
1367
1368 ops[0].watch.ver = 0;
0ce1a794 1369 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
79e3057c
YS
1370 ops[0].watch.flag = 0;
1371
0ce1a794 1372 ret = rbd_req_sync_op(rbd_dev, NULL,
79e3057c 1373 CEPH_NOSNAP,
79e3057c
YS
1374 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1375 ops,
070c633f
AE
1376 rbd_dev->header_name,
1377 0, 0, NULL, NULL, NULL);
1378
79e3057c
YS
1379
1380 rbd_destroy_ops(ops);
0ce1a794
AE
1381 ceph_osdc_cancel_event(rbd_dev->watch_event);
1382 rbd_dev->watch_event = NULL;
79e3057c
YS
1383 return ret;
1384}
1385
602adf40 1386/*
3cb4a687 1387 * Synchronous osd object method call
602adf40 1388 */
0ce1a794 1389static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
aded07ea
AE
1390 const char *object_name,
1391 const char *class_name,
1392 const char *method_name,
3cb4a687
AE
1393 const char *outbound,
1394 size_t outbound_size,
f8d4de6e
AE
1395 char *inbound,
1396 size_t inbound_size,
3cb4a687 1397 int flags,
59c2be1e 1398 u64 *ver)
602adf40
YS
1399{
1400 struct ceph_osd_req_op *ops;
aded07ea
AE
1401 int class_name_len = strlen(class_name);
1402 int method_name_len = strlen(method_name);
3cb4a687 1403 int payload_size;
57cfc106
AE
1404 int ret;
1405
3cb4a687
AE
1406 /*
1407 * Any input parameters required by the method we're calling
1408 * will be sent along with the class and method names as
1409 * part of the message payload. That data and its size are
1410 * supplied via the indata and indata_len fields (named from
1411 * the perspective of the server side) in the OSD request
1412 * operation.
1413 */
1414 payload_size = class_name_len + method_name_len + outbound_size;
1415 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
57cfc106
AE
1416 if (!ops)
1417 return -ENOMEM;
602adf40 1418
aded07ea
AE
1419 ops[0].cls.class_name = class_name;
1420 ops[0].cls.class_len = (__u8) class_name_len;
1421 ops[0].cls.method_name = method_name;
1422 ops[0].cls.method_len = (__u8) method_name_len;
602adf40 1423 ops[0].cls.argc = 0;
3cb4a687
AE
1424 ops[0].cls.indata = outbound;
1425 ops[0].cls.indata_len = outbound_size;
602adf40 1426
0ce1a794 1427 ret = rbd_req_sync_op(rbd_dev, NULL,
602adf40 1428 CEPH_NOSNAP,
3cb4a687 1429 flags, ops,
f8d4de6e
AE
1430 object_name, 0, inbound_size, inbound,
1431 NULL, ver);
602adf40
YS
1432
1433 rbd_destroy_ops(ops);
1434
1435 dout("cls_exec returned %d\n", ret);
1436 return ret;
1437}
1438
1fec7093
YS
1439static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1440{
1441 struct rbd_req_coll *coll =
1442 kzalloc(sizeof(struct rbd_req_coll) +
1443 sizeof(struct rbd_req_status) * num_reqs,
1444 GFP_ATOMIC);
1445
1446 if (!coll)
1447 return NULL;
1448 coll->total = num_reqs;
1449 kref_init(&coll->kref);
1450 return coll;
1451}
1452
602adf40
YS
/*
 * block device queue callback
 *
 * Called with q->queue_lock held and interrupts disabled (see the
 * spin_unlock_irq/spin_lock_irq pairs below).  Each request is
 * split into per-object segments; one asynchronous osd request is
 * submitted per segment, with an rbd_req_coll tracking the group's
 * completion.  The queue lock is dropped while submitting and
 * re-taken before the next request is fetched.
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;
	struct bio_pair *bp = NULL;

	while ((rq = blk_fetch_request(q))) {
		struct bio *bio;
		struct bio *rq_bio, *next_bio = NULL;
		bool do_write;
		unsigned int size;
		u64 op_size = 0;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;
		struct ceph_snap_context *snapc;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);

		size = blk_rq_bytes(rq);
		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
		rq_bio = rq->bio;
		if (do_write && rbd_dev->mapping.read_only) {
			__blk_end_request_all(rq, -EROFS);
			continue;
		}

		/* Drop the queue lock while we submit; re-taken below */
		spin_unlock_irq(q->queue_lock);

		down_read(&rbd_dev->header_rwsem);

		/* A mapped snapshot may have been removed underneath us */
		if (rbd_dev->mapping.snap_id != CEPH_NOSNAP &&
				!rbd_dev->mapping.snap_exists) {
			up_read(&rbd_dev->header_rwsem);
			dout("request for non-existent snapshot");
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENXIO);
			continue;
		}

		/* Pin the snapshot context for the life of this request */
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);

		up_read(&rbd_dev->header_rwsem);

		dout("%s 0x%x bytes at 0x%llx\n",
		     do_write ? "write" : "read",
		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);

		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		if (num_segs <= 0) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, num_segs);
			ceph_put_snap_context(snapc);
			continue;
		}
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			ceph_put_snap_context(snapc);
			continue;
		}

		do {
			/* a bio clone to be passed down to OSD req */
			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
			op_size = rbd_segment_length(rbd_dev, ofs, size);
			/* One collection reference per segment submitted */
			kref_get(&coll->kref);
			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
					      op_size, GFP_ATOMIC);
			if (!bio) {
				/* Record the failure for this segment, keep going */
				rbd_coll_end_req_index(rq, coll, cur_seg,
						       -ENOMEM, op_size);
				goto next_seg;
			}


			/* init OSD command: write or read */
			if (do_write)
				rbd_req_write(rq, rbd_dev,
					      snapc,
					      ofs,
					      op_size, bio,
					      coll, cur_seg);
			else
				rbd_req_read(rq, rbd_dev,
					     rbd_dev->mapping.snap_id,
					     ofs,
					     op_size, bio,
					     coll, cur_seg);

next_seg:
			size -= op_size;
			ofs += op_size;

			cur_seg++;
			rq_bio = next_bio;
		} while (size > 0);
		/* Drop the initial reference from rbd_alloc_coll() */
		kref_put(&coll->kref, rbd_coll_release);

		if (bp)
			bio_pair_release(bp);
		spin_lock_irq(q->queue_lock);

		ceph_put_snap_context(snapc);
	}
}
1572
1573/*
1574 * a queue callback. Makes sure that we don't create a bio that spans across
1575 * multiple osd objects. One exception would be with a single page bios,
1576 * which we handle later at bio_chain_clone
1577 */
1578static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1579 struct bio_vec *bvec)
1580{
1581 struct rbd_device *rbd_dev = q->queuedata;
593a9e7b
AE
1582 unsigned int chunk_sectors;
1583 sector_t sector;
1584 unsigned int bio_sectors;
602adf40
YS
1585 int max;
1586
593a9e7b
AE
1587 chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1588 sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1589 bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
1590
602adf40 1591 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
593a9e7b 1592 + bio_sectors)) << SECTOR_SHIFT;
602adf40
YS
1593 if (max < 0)
1594 max = 0; /* bio_add cannot handle a negative return */
1595 if (max <= bvec->bv_len && bio_sectors == 0)
1596 return bvec->bv_len;
1597 return max;
1598}
1599
1600static void rbd_free_disk(struct rbd_device *rbd_dev)
1601{
1602 struct gendisk *disk = rbd_dev->disk;
1603
1604 if (!disk)
1605 return;
1606
602adf40
YS
1607 if (disk->flags & GENHD_FL_UP)
1608 del_gendisk(disk);
1609 if (disk->queue)
1610 blk_cleanup_queue(disk->queue);
1611 put_disk(disk);
1612}
1613
1614/*
4156d998
AE
1615 * Read the complete header for the given rbd device.
1616 *
1617 * Returns a pointer to a dynamically-allocated buffer containing
1618 * the complete and validated header. Caller can pass the address
1619 * of a variable that will be filled in with the version of the
1620 * header object at the time it was read.
1621 *
1622 * Returns a pointer-coded errno if a failure occurs.
602adf40 1623 */
4156d998
AE
1624static struct rbd_image_header_ondisk *
1625rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
602adf40 1626{
4156d998 1627 struct rbd_image_header_ondisk *ondisk = NULL;
50f7c4c9 1628 u32 snap_count = 0;
4156d998
AE
1629 u64 names_size = 0;
1630 u32 want_count;
1631 int ret;
602adf40 1632
00f1f36f 1633 /*
4156d998
AE
1634 * The complete header will include an array of its 64-bit
1635 * snapshot ids, followed by the names of those snapshots as
1636 * a contiguous block of NUL-terminated strings. Note that
1637 * the number of snapshots could change by the time we read
1638 * it in, in which case we re-read it.
00f1f36f 1639 */
4156d998
AE
1640 do {
1641 size_t size;
1642
1643 kfree(ondisk);
1644
1645 size = sizeof (*ondisk);
1646 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1647 size += names_size;
1648 ondisk = kmalloc(size, GFP_KERNEL);
1649 if (!ondisk)
1650 return ERR_PTR(-ENOMEM);
1651
1652 ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
0bed54dc 1653 rbd_dev->header_name,
4156d998
AE
1654 0, size,
1655 (char *) ondisk, version);
1656
1657 if (ret < 0)
1658 goto out_err;
1659 if (WARN_ON((size_t) ret < size)) {
1660 ret = -ENXIO;
1661 pr_warning("short header read for image %s"
1662 " (want %zd got %d)\n",
1663 rbd_dev->image_name, size, ret);
1664 goto out_err;
1665 }
1666 if (!rbd_dev_ondisk_valid(ondisk)) {
1667 ret = -ENXIO;
1668 pr_warning("invalid header for image %s\n",
1669 rbd_dev->image_name);
1670 goto out_err;
81e759fb 1671 }
602adf40 1672
4156d998
AE
1673 names_size = le64_to_cpu(ondisk->snap_names_len);
1674 want_count = snap_count;
1675 snap_count = le32_to_cpu(ondisk->snap_count);
1676 } while (snap_count != want_count);
00f1f36f 1677
4156d998 1678 return ondisk;
00f1f36f 1679
4156d998
AE
1680out_err:
1681 kfree(ondisk);
1682
1683 return ERR_PTR(ret);
1684}
1685
1686/*
1687 * reload the ondisk the header
1688 */
1689static int rbd_read_header(struct rbd_device *rbd_dev,
1690 struct rbd_image_header *header)
1691{
1692 struct rbd_image_header_ondisk *ondisk;
1693 u64 ver = 0;
1694 int ret;
602adf40 1695
4156d998
AE
1696 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1697 if (IS_ERR(ondisk))
1698 return PTR_ERR(ondisk);
1699 ret = rbd_header_from_disk(header, ondisk);
1700 if (ret >= 0)
1701 header->obj_version = ver;
1702 kfree(ondisk);
1703
1704 return ret;
602adf40
YS
1705}
1706
dfc5606d
YS
1707static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1708{
1709 struct rbd_snap *snap;
a0593290 1710 struct rbd_snap *next;
dfc5606d 1711
a0593290 1712 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
14e7085d 1713 __rbd_remove_snap_dev(snap);
dfc5606d
YS
1714}
1715
602adf40
YS
/*
 * only read the first part of the ondisk header, without the snaps info
 *
 * Re-reads the on-disk header and swaps the new snapshot metadata
 * into rbd_dev->header under header_rwsem.  When the base image is
 * mapped (not a snapshot) a size change is propagated to the block
 * device capacity.  On success the header object version is stored
 * through @hver (if non-NULL).
 */
static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;
	struct rbd_image_header h;

	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	down_write(&rbd_dev->header_rwsem);

	/* resized? */
	if (rbd_dev->mapping.snap_id == CEPH_NOSNAP) {
		sector_t size = (sector_t) h.image_size / SECTOR_SIZE;

		if (size != (sector_t) rbd_dev->mapping.size) {
			dout("setting size to %llu sectors",
				(unsigned long long) size);
			rbd_dev->mapping.size = (u64) size;
			set_capacity(rbd_dev->disk, size);
		}
	}

	/* rbd_dev->header.object_prefix shouldn't change */
	kfree(rbd_dev->header.snap_sizes);
	kfree(rbd_dev->header.snap_names);
	/* osd requests may still refer to snapc */
	ceph_put_snap_context(rbd_dev->header.snapc);

	if (hver)
		*hver = h.obj_version;
	rbd_dev->header.obj_version = h.obj_version;
	rbd_dev->header.image_size = h.image_size;
	/* Take ownership of the freshly read snapshot metadata */
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	/* Free the extra copy of the object prefix */
	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
	kfree(h.object_prefix);

	ret = rbd_dev_snaps_update(rbd_dev);
	if (!ret)
		ret = rbd_dev_snaps_register(rbd_dev);

	up_write(&rbd_dev->header_rwsem);

	return ret;
}
1767
1fe5e993
AE
1768static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
1769{
1770 int ret;
1771
1772 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1773 ret = __rbd_refresh_header(rbd_dev, hver);
1774 mutex_unlock(&ctl_mutex);
1775
1776 return ret;
1777}
1778
602adf40
YS
1779static int rbd_init_disk(struct rbd_device *rbd_dev)
1780{
1781 struct gendisk *disk;
1782 struct request_queue *q;
593a9e7b 1783 u64 segment_size;
602adf40 1784
602adf40 1785 /* create gendisk info */
602adf40
YS
1786 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1787 if (!disk)
1fcdb8aa 1788 return -ENOMEM;
602adf40 1789
f0f8cef5 1790 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
de71a297 1791 rbd_dev->dev_id);
602adf40
YS
1792 disk->major = rbd_dev->major;
1793 disk->first_minor = 0;
1794 disk->fops = &rbd_bd_ops;
1795 disk->private_data = rbd_dev;
1796
1797 /* init rq */
602adf40
YS
1798 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1799 if (!q)
1800 goto out_disk;
029bcbd8 1801
593a9e7b
AE
1802 /* We use the default size, but let's be explicit about it. */
1803 blk_queue_physical_block_size(q, SECTOR_SIZE);
1804
029bcbd8 1805 /* set io sizes to object size */
593a9e7b
AE
1806 segment_size = rbd_obj_bytes(&rbd_dev->header);
1807 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1808 blk_queue_max_segment_size(q, segment_size);
1809 blk_queue_io_min(q, segment_size);
1810 blk_queue_io_opt(q, segment_size);
029bcbd8 1811
602adf40
YS
1812 blk_queue_merge_bvec(q, rbd_merge_bvec);
1813 disk->queue = q;
1814
1815 q->queuedata = rbd_dev;
1816
1817 rbd_dev->disk = disk;
602adf40 1818
12f02944
AE
1819 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
1820
602adf40 1821 return 0;
602adf40
YS
1822out_disk:
1823 put_disk(disk);
1fcdb8aa
AE
1824
1825 return -ENOMEM;
602adf40
YS
1826}
1827
dfc5606d
YS
1828/*
1829 sysfs
1830*/
1831
593a9e7b
AE
1832static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1833{
1834 return container_of(dev, struct rbd_device, dev);
1835}
1836
dfc5606d
YS
1837static ssize_t rbd_size_show(struct device *dev,
1838 struct device_attribute *attr, char *buf)
1839{
593a9e7b 1840 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
a51aa0c0
JD
1841 sector_t size;
1842
1843 down_read(&rbd_dev->header_rwsem);
1844 size = get_capacity(rbd_dev->disk);
1845 up_read(&rbd_dev->header_rwsem);
dfc5606d 1846
a51aa0c0 1847 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
dfc5606d
YS
1848}
1849
34b13184
AE
1850/*
1851 * Note this shows the features for whatever's mapped, which is not
1852 * necessarily the base image.
1853 */
1854static ssize_t rbd_features_show(struct device *dev,
1855 struct device_attribute *attr, char *buf)
1856{
1857 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1858
1859 return sprintf(buf, "0x%016llx\n",
1860 (unsigned long long) rbd_dev->mapping.features);
1861}
1862
dfc5606d
YS
1863static ssize_t rbd_major_show(struct device *dev,
1864 struct device_attribute *attr, char *buf)
1865{
593a9e7b 1866 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
602adf40 1867
dfc5606d
YS
1868 return sprintf(buf, "%d\n", rbd_dev->major);
1869}
1870
1871static ssize_t rbd_client_id_show(struct device *dev,
1872 struct device_attribute *attr, char *buf)
602adf40 1873{
593a9e7b 1874 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 1875
1dbb4399
AE
1876 return sprintf(buf, "client%lld\n",
1877 ceph_client_id(rbd_dev->rbd_client->client));
602adf40
YS
1878}
1879
dfc5606d
YS
1880static ssize_t rbd_pool_show(struct device *dev,
1881 struct device_attribute *attr, char *buf)
602adf40 1882{
593a9e7b 1883 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d
YS
1884
1885 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1886}
1887
9bb2f334
AE
1888static ssize_t rbd_pool_id_show(struct device *dev,
1889 struct device_attribute *attr, char *buf)
1890{
1891 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1892
1893 return sprintf(buf, "%d\n", rbd_dev->pool_id);
1894}
1895
dfc5606d
YS
1896static ssize_t rbd_name_show(struct device *dev,
1897 struct device_attribute *attr, char *buf)
1898{
593a9e7b 1899 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 1900
0bed54dc 1901 return sprintf(buf, "%s\n", rbd_dev->image_name);
dfc5606d
YS
1902}
1903
589d30e0
AE
1904static ssize_t rbd_image_id_show(struct device *dev,
1905 struct device_attribute *attr, char *buf)
1906{
1907 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1908
1909 return sprintf(buf, "%s\n", rbd_dev->image_id);
1910}
1911
34b13184
AE
1912/*
1913 * Shows the name of the currently-mapped snapshot (or
1914 * RBD_SNAP_HEAD_NAME for the base image).
1915 */
dfc5606d
YS
1916static ssize_t rbd_snap_show(struct device *dev,
1917 struct device_attribute *attr,
1918 char *buf)
1919{
593a9e7b 1920 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 1921
f84344f3 1922 return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name);
dfc5606d
YS
1923}
1924
1925static ssize_t rbd_image_refresh(struct device *dev,
1926 struct device_attribute *attr,
1927 const char *buf,
1928 size_t size)
1929{
593a9e7b 1930 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
b813623a 1931 int ret;
602adf40 1932
1fe5e993 1933 ret = rbd_refresh_header(rbd_dev, NULL);
b813623a
AE
1934
1935 return ret < 0 ? ret : size;
dfc5606d 1936}
602adf40 1937
dfc5606d 1938static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
34b13184 1939static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
dfc5606d
YS
1940static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
1941static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
1942static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
9bb2f334 1943static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
dfc5606d 1944static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
589d30e0 1945static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
dfc5606d
YS
1946static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
1947static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
dfc5606d
YS
1948
1949static struct attribute *rbd_attrs[] = {
1950 &dev_attr_size.attr,
34b13184 1951 &dev_attr_features.attr,
dfc5606d
YS
1952 &dev_attr_major.attr,
1953 &dev_attr_client_id.attr,
1954 &dev_attr_pool.attr,
9bb2f334 1955 &dev_attr_pool_id.attr,
dfc5606d 1956 &dev_attr_name.attr,
589d30e0 1957 &dev_attr_image_id.attr,
dfc5606d
YS
1958 &dev_attr_current_snap.attr,
1959 &dev_attr_refresh.attr,
dfc5606d
YS
1960 NULL
1961};
1962
1963static struct attribute_group rbd_attr_group = {
1964 .attrs = rbd_attrs,
1965};
1966
1967static const struct attribute_group *rbd_attr_groups[] = {
1968 &rbd_attr_group,
1969 NULL
1970};
1971
1972static void rbd_sysfs_dev_release(struct device *dev)
1973{
1974}
1975
1976static struct device_type rbd_device_type = {
1977 .name = "rbd",
1978 .groups = rbd_attr_groups,
1979 .release = rbd_sysfs_dev_release,
1980};
1981
1982
1983/*
1984 sysfs - snapshots
1985*/
1986
/* Snapshot size, in bytes. */
static ssize_t rbd_snap_size_show(struct device *dev,
				  struct device_attribute *attr,
				  char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
}

/* Rados snapshot id. */
static ssize_t rbd_snap_id_show(struct device *dev,
				struct device_attribute *attr,
				char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
}

/* Snapshot feature bits, shown as a hex mask. */
static ssize_t rbd_snap_features_show(struct device *dev,
				      struct device_attribute *attr,
				      char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "0x%016llx\n",
			(unsigned long long) snap->features);
}
2014
dfc5606d
YS
/* Per-snapshot sysfs attributes */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/* Device release callback: frees the rbd_snap and its name */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups = rbd_snap_attr_groups,
	.release = rbd_snap_dev_release,
};
2046
304f6808
AE
2047static bool rbd_snap_registered(struct rbd_snap *snap)
2048{
2049 bool ret = snap->dev.type == &rbd_snap_device_type;
2050 bool reg = device_is_registered(&snap->dev);
2051
2052 rbd_assert(!ret ^ reg);
2053
2054 return ret;
2055}
2056
14e7085d 2057static void __rbd_remove_snap_dev(struct rbd_snap *snap)
dfc5606d
YS
2058{
2059 list_del(&snap->node);
304f6808
AE
2060 if (device_is_registered(&snap->dev))
2061 device_unregister(&snap->dev);
dfc5606d
YS
2062}
2063
14e7085d 2064static int rbd_register_snap_dev(struct rbd_snap *snap,
dfc5606d
YS
2065 struct device *parent)
2066{
2067 struct device *dev = &snap->dev;
2068 int ret;
2069
2070 dev->type = &rbd_snap_device_type;
2071 dev->parent = parent;
2072 dev->release = rbd_snap_dev_release;
2073 dev_set_name(dev, "snap_%s", snap->name);
304f6808
AE
2074 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2075
dfc5606d
YS
2076 ret = device_register(dev);
2077
2078 return ret;
2079}
2080
4e891e0a 2081static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
c8d18425 2082 const char *snap_name,
34b13184
AE
2083 u64 snap_id, u64 snap_size,
2084 u64 snap_features)
dfc5606d 2085{
4e891e0a 2086 struct rbd_snap *snap;
dfc5606d 2087 int ret;
4e891e0a
AE
2088
2089 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
dfc5606d 2090 if (!snap)
4e891e0a
AE
2091 return ERR_PTR(-ENOMEM);
2092
2093 ret = -ENOMEM;
c8d18425 2094 snap->name = kstrdup(snap_name, GFP_KERNEL);
4e891e0a
AE
2095 if (!snap->name)
2096 goto err;
2097
c8d18425
AE
2098 snap->id = snap_id;
2099 snap->size = snap_size;
34b13184 2100 snap->features = snap_features;
4e891e0a
AE
2101
2102 return snap;
2103
dfc5606d
YS
2104err:
2105 kfree(snap->name);
2106 kfree(snap);
4e891e0a
AE
2107
2108 return ERR_PTR(ret);
dfc5606d
YS
2109}
2110
cd892126
AE
2111static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2112 u64 *snap_size, u64 *snap_features)
2113{
2114 char *snap_name;
2115
2116 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2117
2118 *snap_size = rbd_dev->header.snap_sizes[which];
2119 *snap_features = 0; /* No features for v1 */
2120
2121 /* Skip over names until we find the one we are looking for */
2122
2123 snap_name = rbd_dev->header.snap_names;
2124 while (which--)
2125 snap_name += strlen(snap_name) + 1;
2126
2127 return snap_name;
2128}
2129
dfc5606d 2130/*
35938150
AE
2131 * Scan the rbd device's current snapshot list and compare it to the
2132 * newly-received snapshot context. Remove any existing snapshots
2133 * not present in the new snapshot context. Add a new snapshot for
2134 * any snaphots in the snapshot context not in the current list.
2135 * And verify there are no changes to snapshots we already know
2136 * about.
2137 *
2138 * Assumes the snapshots in the snapshot context are sorted by
2139 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2140 * are also maintained in that order.)
dfc5606d 2141 */
304f6808 2142static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
dfc5606d 2143{
35938150
AE
2144 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2145 const u32 snap_count = snapc->num_snaps;
35938150
AE
2146 struct list_head *head = &rbd_dev->snaps;
2147 struct list_head *links = head->next;
2148 u32 index = 0;
dfc5606d 2149
9fcbb800 2150 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
35938150
AE
2151 while (index < snap_count || links != head) {
2152 u64 snap_id;
2153 struct rbd_snap *snap;
cd892126
AE
2154 char *snap_name;
2155 u64 snap_size = 0;
2156 u64 snap_features = 0;
dfc5606d 2157
35938150
AE
2158 snap_id = index < snap_count ? snapc->snaps[index]
2159 : CEPH_NOSNAP;
2160 snap = links != head ? list_entry(links, struct rbd_snap, node)
2161 : NULL;
aafb230e 2162 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
dfc5606d 2163
35938150
AE
2164 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2165 struct list_head *next = links->next;
dfc5606d 2166
35938150 2167 /* Existing snapshot not in the new snap context */
dfc5606d 2168
f84344f3
AE
2169 if (rbd_dev->mapping.snap_id == snap->id)
2170 rbd_dev->mapping.snap_exists = false;
35938150 2171 __rbd_remove_snap_dev(snap);
9fcbb800 2172 dout("%ssnap id %llu has been removed\n",
f84344f3
AE
2173 rbd_dev->mapping.snap_id == snap->id ?
2174 "mapped " : "",
9fcbb800 2175 (unsigned long long) snap->id);
35938150
AE
2176
2177 /* Done with this list entry; advance */
2178
2179 links = next;
dfc5606d
YS
2180 continue;
2181 }
35938150 2182
cd892126
AE
2183 snap_name = rbd_dev_v1_snap_info(rbd_dev, index,
2184 &snap_size, &snap_features);
2185 if (IS_ERR(snap_name))
2186 return PTR_ERR(snap_name);
2187
9fcbb800
AE
2188 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2189 (unsigned long long) snap_id);
35938150
AE
2190 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2191 struct rbd_snap *new_snap;
2192
2193 /* We haven't seen this snapshot before */
2194
c8d18425 2195 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
cd892126 2196 snap_id, snap_size, snap_features);
9fcbb800
AE
2197 if (IS_ERR(new_snap)) {
2198 int err = PTR_ERR(new_snap);
2199
2200 dout(" failed to add dev, error %d\n", err);
2201
2202 return err;
2203 }
35938150
AE
2204
2205 /* New goes before existing, or at end of list */
2206
9fcbb800 2207 dout(" added dev%s\n", snap ? "" : " at end\n");
35938150
AE
2208 if (snap)
2209 list_add_tail(&new_snap->node, &snap->node);
2210 else
523f3258 2211 list_add_tail(&new_snap->node, head);
35938150
AE
2212 } else {
2213 /* Already have this one */
2214
9fcbb800
AE
2215 dout(" already present\n");
2216
cd892126 2217 rbd_assert(snap->size == snap_size);
aafb230e 2218 rbd_assert(!strcmp(snap->name, snap_name));
cd892126 2219 rbd_assert(snap->features == snap_features);
35938150
AE
2220
2221 /* Done with this list entry; advance */
2222
2223 links = links->next;
dfc5606d 2224 }
35938150
AE
2225
2226 /* Advance to the next entry in the snapshot context */
2227
2228 index++;
dfc5606d 2229 }
9fcbb800 2230 dout("%s: done\n", __func__);
dfc5606d
YS
2231
2232 return 0;
2233}
2234
304f6808
AE
2235/*
2236 * Scan the list of snapshots and register the devices for any that
2237 * have not already been registered.
2238 */
2239static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2240{
2241 struct rbd_snap *snap;
2242 int ret = 0;
2243
2244 dout("%s called\n", __func__);
86ff77bb
AE
2245 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2246 return -EIO;
304f6808
AE
2247
2248 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2249 if (!rbd_snap_registered(snap)) {
2250 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2251 if (ret < 0)
2252 break;
2253 }
2254 }
2255 dout("%s: returning %d\n", __func__, ret);
2256
2257 return ret;
2258}
2259
dfc5606d
YS
2260static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2261{
dfc5606d 2262 struct device *dev;
cd789ab9 2263 int ret;
dfc5606d
YS
2264
2265 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
dfc5606d 2266
cd789ab9 2267 dev = &rbd_dev->dev;
dfc5606d
YS
2268 dev->bus = &rbd_bus_type;
2269 dev->type = &rbd_device_type;
2270 dev->parent = &rbd_root_dev;
2271 dev->release = rbd_dev_release;
de71a297 2272 dev_set_name(dev, "%d", rbd_dev->dev_id);
dfc5606d 2273 ret = device_register(dev);
dfc5606d 2274
dfc5606d 2275 mutex_unlock(&ctl_mutex);
cd789ab9 2276
dfc5606d 2277 return ret;
602adf40
YS
2278}
2279
dfc5606d
YS
2280static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2281{
2282 device_unregister(&rbd_dev->dev);
2283}
2284
59c2be1e
YS
2285static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2286{
2287 int ret, rc;
2288
2289 do {
0e6f322d 2290 ret = rbd_req_sync_watch(rbd_dev);
59c2be1e 2291 if (ret == -ERANGE) {
1fe5e993 2292 rc = rbd_refresh_header(rbd_dev, NULL);
59c2be1e
YS
2293 if (rc < 0)
2294 return rc;
2295 }
2296 } while (ret == -ERANGE);
2297
2298 return ret;
2299}
2300
e2839308 2301static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
1ddbe94e
AE
2302
2303/*
499afd5b
AE
2304 * Get a unique rbd identifier for the given new rbd_dev, and add
2305 * the rbd_dev to the global list. The minimum rbd id is 1.
1ddbe94e 2306 */
e2839308 2307static void rbd_dev_id_get(struct rbd_device *rbd_dev)
b7f23c36 2308{
e2839308 2309 rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
499afd5b
AE
2310
2311 spin_lock(&rbd_dev_list_lock);
2312 list_add_tail(&rbd_dev->node, &rbd_dev_list);
2313 spin_unlock(&rbd_dev_list_lock);
e2839308
AE
2314 dout("rbd_dev %p given dev id %llu\n", rbd_dev,
2315 (unsigned long long) rbd_dev->dev_id);
1ddbe94e 2316}
b7f23c36 2317
1ddbe94e 2318/*
499afd5b
AE
2319 * Remove an rbd_dev from the global list, and record that its
2320 * identifier is no longer in use.
1ddbe94e 2321 */
e2839308 2322static void rbd_dev_id_put(struct rbd_device *rbd_dev)
1ddbe94e 2323{
d184f6bf 2324 struct list_head *tmp;
de71a297 2325 int rbd_id = rbd_dev->dev_id;
d184f6bf
AE
2326 int max_id;
2327
aafb230e 2328 rbd_assert(rbd_id > 0);
499afd5b 2329
e2839308
AE
2330 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
2331 (unsigned long long) rbd_dev->dev_id);
499afd5b
AE
2332 spin_lock(&rbd_dev_list_lock);
2333 list_del_init(&rbd_dev->node);
d184f6bf
AE
2334
2335 /*
2336 * If the id being "put" is not the current maximum, there
2337 * is nothing special we need to do.
2338 */
e2839308 2339 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
d184f6bf
AE
2340 spin_unlock(&rbd_dev_list_lock);
2341 return;
2342 }
2343
2344 /*
2345 * We need to update the current maximum id. Search the
2346 * list to find out what it is. We're more likely to find
2347 * the maximum at the end, so search the list backward.
2348 */
2349 max_id = 0;
2350 list_for_each_prev(tmp, &rbd_dev_list) {
2351 struct rbd_device *rbd_dev;
2352
2353 rbd_dev = list_entry(tmp, struct rbd_device, node);
2354 if (rbd_id > max_id)
2355 max_id = rbd_id;
2356 }
499afd5b 2357 spin_unlock(&rbd_dev_list_lock);
b7f23c36 2358
1ddbe94e 2359 /*
e2839308 2360 * The max id could have been updated by rbd_dev_id_get(), in
d184f6bf
AE
2361 * which case it now accurately reflects the new maximum.
2362 * Be careful not to overwrite the maximum value in that
2363 * case.
1ddbe94e 2364 */
e2839308
AE
2365 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
2366 dout(" max dev id has been reset\n");
b7f23c36
AE
2367}
2368
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	static const char spaces[] = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* skip leading white space */

	return strcspn(*buf, spaces);	/* length of the token found */
}
2387
/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit.
 *
 * The *buf pointer will be updated to point beyond the end of the
 * found token.  Note that this occurs even if the token buffer is
 * too small to hold it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;	/* advance past the token even on truncation */

	return len;
}
2417
ea3352f4
AE
2418/*
2419 * Finds the next token in *buf, dynamically allocates a buffer big
2420 * enough to hold a copy of it, and copies the token into the new
2421 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2422 * that a duplicate buffer is created even for a zero-length token.
2423 *
2424 * Returns a pointer to the newly-allocated duplicate, or a null
2425 * pointer if memory for the duplicate was not available. If
2426 * the lenp argument is a non-null pointer, the length of the token
2427 * (not including the '\0') is returned in *lenp.
2428 *
2429 * If successful, the *buf pointer will be updated to point beyond
2430 * the end of the found token.
2431 *
2432 * Note: uses GFP_KERNEL for allocation.
2433 */
2434static inline char *dup_token(const char **buf, size_t *lenp)
2435{
2436 char *dup;
2437 size_t len;
2438
2439 len = next_token(buf);
2440 dup = kmalloc(len + 1, GFP_KERNEL);
2441 if (!dup)
2442 return NULL;
2443
2444 memcpy(dup, *buf, len);
2445 *(dup + len) = '\0';
2446 *buf += len;
2447
2448 if (lenp)
2449 *lenp = len;
2450
2451 return dup;
2452}
2453
a725f65e 2454/*
3feeb894
AE
2455 * This fills in the pool_name, image_name, image_name_len, rbd_dev,
2456 * rbd_md_name, and name fields of the given rbd_dev, based on the
2457 * list of monitor addresses and other options provided via
2458 * /sys/bus/rbd/add. Returns a pointer to a dynamically-allocated
2459 * copy of the snapshot name to map if successful, or a
2460 * pointer-coded error otherwise.
d22f76e7
AE
2461 *
2462 * Note: rbd_dev is assumed to have been initially zero-filled.
a725f65e 2463 */
3feeb894
AE
2464static char *rbd_add_parse_args(struct rbd_device *rbd_dev,
2465 const char *buf,
2466 const char **mon_addrs,
2467 size_t *mon_addrs_size,
2468 char *options,
2469 size_t options_size)
e28fff26 2470{
d22f76e7 2471 size_t len;
3feeb894
AE
2472 char *err_ptr = ERR_PTR(-EINVAL);
2473 char *snap_name;
e28fff26
AE
2474
2475 /* The first four tokens are required */
2476
7ef3214a
AE
2477 len = next_token(&buf);
2478 if (!len)
3feeb894 2479 return err_ptr;
5214ecc4 2480 *mon_addrs_size = len + 1;
7ef3214a
AE
2481 *mon_addrs = buf;
2482
2483 buf += len;
a725f65e 2484
e28fff26
AE
2485 len = copy_token(&buf, options, options_size);
2486 if (!len || len >= options_size)
3feeb894 2487 return err_ptr;
e28fff26 2488
3feeb894 2489 err_ptr = ERR_PTR(-ENOMEM);
d22f76e7
AE
2490 rbd_dev->pool_name = dup_token(&buf, NULL);
2491 if (!rbd_dev->pool_name)
d22f76e7 2492 goto out_err;
e28fff26 2493
0bed54dc
AE
2494 rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
2495 if (!rbd_dev->image_name)
bf3e5ae1 2496 goto out_err;
a725f65e 2497
3feeb894
AE
2498 /* Snapshot name is optional */
2499 len = next_token(&buf);
820a5f3e 2500 if (!len) {
3feeb894
AE
2501 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
2502 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
849b4260 2503 }
3feeb894
AE
2504 snap_name = kmalloc(len + 1, GFP_KERNEL);
2505 if (!snap_name)
2506 goto out_err;
2507 memcpy(snap_name, buf, len);
2508 *(snap_name + len) = '\0';
e28fff26 2509
3feeb894
AE
2510dout(" SNAP_NAME is <%s>, len is %zd\n", snap_name, len);
2511
2512 return snap_name;
d22f76e7
AE
2513
2514out_err:
0bed54dc 2515 kfree(rbd_dev->image_name);
d78fd7ae
AE
2516 rbd_dev->image_name = NULL;
2517 rbd_dev->image_name_len = 0;
d22f76e7
AE
2518 kfree(rbd_dev->pool_name);
2519 rbd_dev->pool_name = NULL;
2520
3feeb894 2521 return err_ptr;
a725f65e
AE
2522}
2523
589d30e0
AE
2524/*
2525 * An rbd format 2 image has a unique identifier, distinct from the
2526 * name given to it by the user. Internally, that identifier is
2527 * what's used to specify the names of objects related to the image.
2528 *
2529 * A special "rbd id" object is used to map an rbd image name to its
2530 * id. If that object doesn't exist, then there is no v2 rbd image
2531 * with the supplied name.
2532 *
2533 * This function will record the given rbd_dev's image_id field if
2534 * it can be determined, and in that case will return 0. If any
2535 * errors occur a negative errno will be returned and the rbd_dev's
2536 * image_id field will be unchanged (and should be NULL).
2537 */
2538static int rbd_dev_image_id(struct rbd_device *rbd_dev)
2539{
2540 int ret;
2541 size_t size;
2542 char *object_name;
2543 void *response;
2544 void *p;
2545
2546 /*
2547 * First, see if the format 2 image id file exists, and if
2548 * so, get the image's persistent id from it.
2549 */
2550 size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len;
2551 object_name = kmalloc(size, GFP_NOIO);
2552 if (!object_name)
2553 return -ENOMEM;
2554 sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name);
2555 dout("rbd id object name is %s\n", object_name);
2556
2557 /* Response will be an encoded string, which includes a length */
2558
2559 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
2560 response = kzalloc(size, GFP_NOIO);
2561 if (!response) {
2562 ret = -ENOMEM;
2563 goto out;
2564 }
2565
2566 ret = rbd_req_sync_exec(rbd_dev, object_name,
2567 "rbd", "get_id",
2568 NULL, 0,
2569 response, RBD_IMAGE_ID_LEN_MAX,
2570 CEPH_OSD_FLAG_READ, NULL);
2571 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2572 if (ret < 0)
2573 goto out;
2574
2575 p = response;
2576 rbd_dev->image_id = ceph_extract_encoded_string(&p,
2577 p + RBD_IMAGE_ID_LEN_MAX,
2578 &rbd_dev->image_id_len,
2579 GFP_NOIO);
2580 if (IS_ERR(rbd_dev->image_id)) {
2581 ret = PTR_ERR(rbd_dev->image_id);
2582 rbd_dev->image_id = NULL;
2583 } else {
2584 dout("image_id is %s\n", rbd_dev->image_id);
2585 }
2586out:
2587 kfree(response);
2588 kfree(object_name);
2589
2590 return ret;
2591}
2592
a30b71b9
AE
2593static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
2594{
2595 int ret;
2596 size_t size;
2597
2598 /* Version 1 images have no id; empty string is used */
2599
2600 rbd_dev->image_id = kstrdup("", GFP_KERNEL);
2601 if (!rbd_dev->image_id)
2602 return -ENOMEM;
2603 rbd_dev->image_id_len = 0;
2604
2605 /* Record the header object name for this rbd image. */
2606
2607 size = rbd_dev->image_name_len + sizeof (RBD_SUFFIX);
2608 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
2609 if (!rbd_dev->header_name) {
2610 ret = -ENOMEM;
2611 goto out_err;
2612 }
2613 sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
2614
2615 /* Populate rbd image metadata */
2616
2617 ret = rbd_read_header(rbd_dev, &rbd_dev->header);
2618 if (ret < 0)
2619 goto out_err;
2620 rbd_dev->image_format = 1;
2621
2622 dout("discovered version 1 image, header name is %s\n",
2623 rbd_dev->header_name);
2624
2625 return 0;
2626
2627out_err:
2628 kfree(rbd_dev->header_name);
2629 rbd_dev->header_name = NULL;
2630 kfree(rbd_dev->image_id);
2631 rbd_dev->image_id = NULL;
2632
2633 return ret;
2634}
2635
2636static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
2637{
2638 size_t size;
2639
2640 /*
2641 * Image id was filled in by the caller. Record the header
2642 * object name for this rbd image.
2643 */
2644 size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->image_id_len;
2645 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
2646 if (!rbd_dev->header_name)
2647 return -ENOMEM;
2648 sprintf(rbd_dev->header_name, "%s%s",
2649 RBD_HEADER_PREFIX, rbd_dev->image_id);
2650 rbd_dev->image_format = 2;
2651
2652 dout("discovered version 2 image, header name is %s\n",
2653 rbd_dev->header_name);
2654
2655 return -ENOTSUPP;
2656}
2657
/*
 * Probe for the existence of the header object for the given rbd
 * device.  For format 2 images this includes determining the image
 * id.
 */
static int rbd_dev_probe(struct rbd_device *rbd_dev)
{
	int ret;

	/*
	 * Get the id from the image id object.  If it's not a
	 * format 2 image, we'll get ENOENT back, and we'll assume
	 * it's a format 1 image.
	 */
	if (rbd_dev_image_id(rbd_dev))
		ret = rbd_dev_v1_probe(rbd_dev);
	else
		ret = rbd_dev_v2_probe(rbd_dev);
	if (ret)
		dout("probe failed, returning %d\n", ret);

	return ret;
}
2682
59c2be1e
YS
2683static ssize_t rbd_add(struct bus_type *bus,
2684 const char *buf,
2685 size_t count)
602adf40 2686{
cb8627c7
AE
2687 char *options;
2688 struct rbd_device *rbd_dev = NULL;
7ef3214a
AE
2689 const char *mon_addrs = NULL;
2690 size_t mon_addrs_size = 0;
27cc2594
AE
2691 struct ceph_osd_client *osdc;
2692 int rc = -ENOMEM;
3feeb894 2693 char *snap_name;
602adf40
YS
2694
2695 if (!try_module_get(THIS_MODULE))
2696 return -ENODEV;
2697
60571c7d 2698 options = kmalloc(count, GFP_KERNEL);
602adf40 2699 if (!options)
85ae8926 2700 goto err_out_mem;
cb8627c7
AE
2701 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2702 if (!rbd_dev)
85ae8926 2703 goto err_out_mem;
602adf40
YS
2704
2705 /* static rbd_device initialization */
2706 spin_lock_init(&rbd_dev->lock);
2707 INIT_LIST_HEAD(&rbd_dev->node);
dfc5606d 2708 INIT_LIST_HEAD(&rbd_dev->snaps);
c666601a 2709 init_rwsem(&rbd_dev->header_rwsem);
602adf40 2710
602adf40 2711 /* parse add command */
3feeb894
AE
2712 snap_name = rbd_add_parse_args(rbd_dev, buf,
2713 &mon_addrs, &mon_addrs_size, options, count);
2714 if (IS_ERR(snap_name)) {
2715 rc = PTR_ERR(snap_name);
85ae8926 2716 goto err_out_mem;
3feeb894 2717 }
e124a82f 2718
f8c38929
AE
2719 rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
2720 if (rc < 0)
85ae8926 2721 goto err_out_args;
602adf40 2722
602adf40 2723 /* pick the pool */
1dbb4399 2724 osdc = &rbd_dev->rbd_client->client->osdc;
602adf40
YS
2725 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2726 if (rc < 0)
2727 goto err_out_client;
9bb2f334 2728 rbd_dev->pool_id = rc;
602adf40 2729
a30b71b9
AE
2730 rc = rbd_dev_probe(rbd_dev);
2731 if (rc < 0)
05fd6f6f 2732 goto err_out_client;
a30b71b9 2733 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
05fd6f6f
AE
2734
2735 /* no need to lock here, as rbd_dev is not registered yet */
2736 rc = rbd_dev_snaps_update(rbd_dev);
2737 if (rc)
2738 goto err_out_header;
2739
2740 rc = rbd_dev_set_mapping(rbd_dev, snap_name);
2741 if (rc)
2742 goto err_out_header;
2743
85ae8926
AE
2744 /* generate unique id: find highest unique id, add one */
2745 rbd_dev_id_get(rbd_dev);
2746
2747 /* Fill in the device name, now that we have its id. */
2748 BUILD_BUG_ON(DEV_NAME_LEN
2749 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
2750 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
2751
2752 /* Get our block major device number. */
2753
27cc2594
AE
2754 rc = register_blkdev(0, rbd_dev->name);
2755 if (rc < 0)
85ae8926 2756 goto err_out_id;
27cc2594 2757 rbd_dev->major = rc;
602adf40 2758
0f308a31
AE
2759 /* Set up the blkdev mapping. */
2760
2761 rc = rbd_init_disk(rbd_dev);
dfc5606d 2762 if (rc)
766fc439
YS
2763 goto err_out_blkdev;
2764
0f308a31
AE
2765 rc = rbd_bus_add_dev(rbd_dev);
2766 if (rc)
2767 goto err_out_disk;
2768
32eec68d
AE
2769 /*
2770 * At this point cleanup in the event of an error is the job
2771 * of the sysfs code (initiated by rbd_bus_del_dev()).
32eec68d 2772 */
2ac4e75d 2773
4bb1f1ed 2774 down_write(&rbd_dev->header_rwsem);
5ed16177 2775 rc = rbd_dev_snaps_register(rbd_dev);
4bb1f1ed 2776 up_write(&rbd_dev->header_rwsem);
2ac4e75d
AE
2777 if (rc)
2778 goto err_out_bus;
2779
3ee4001e
AE
2780 rc = rbd_init_watch_dev(rbd_dev);
2781 if (rc)
2782 goto err_out_bus;
2783
2ac4e75d
AE
2784 /* Everything's ready. Announce the disk to the world. */
2785
2ac4e75d 2786 add_disk(rbd_dev->disk);
3ee4001e 2787
2ac4e75d
AE
2788 pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
2789 (unsigned long long) rbd_dev->mapping.size);
2790
602adf40
YS
2791 return count;
2792
766fc439 2793err_out_bus:
766fc439
YS
2794 /* this will also clean up rest of rbd_dev stuff */
2795
2796 rbd_bus_del_dev(rbd_dev);
2797 kfree(options);
766fc439
YS
2798 return rc;
2799
0f308a31
AE
2800err_out_disk:
2801 rbd_free_disk(rbd_dev);
602adf40
YS
2802err_out_blkdev:
2803 unregister_blkdev(rbd_dev->major, rbd_dev->name);
85ae8926
AE
2804err_out_id:
2805 rbd_dev_id_put(rbd_dev);
05fd6f6f
AE
2806err_out_header:
2807 rbd_header_free(&rbd_dev->header);
602adf40 2808err_out_client:
3fcf2581 2809 kfree(rbd_dev->header_name);
602adf40 2810 rbd_put_client(rbd_dev);
589d30e0 2811 kfree(rbd_dev->image_id);
85ae8926
AE
2812err_out_args:
2813 kfree(rbd_dev->mapping.snap_name);
2814 kfree(rbd_dev->image_name);
2815 kfree(rbd_dev->pool_name);
2816err_out_mem:
27cc2594 2817 kfree(rbd_dev);
cb8627c7 2818 kfree(options);
27cc2594 2819
602adf40
YS
2820 dout("Error adding device %s\n", buf);
2821 module_put(THIS_MODULE);
27cc2594
AE
2822
2823 return (ssize_t) rc;
602adf40
YS
2824}
2825
de71a297 2826static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
602adf40
YS
2827{
2828 struct list_head *tmp;
2829 struct rbd_device *rbd_dev;
2830
e124a82f 2831 spin_lock(&rbd_dev_list_lock);
602adf40
YS
2832 list_for_each(tmp, &rbd_dev_list) {
2833 rbd_dev = list_entry(tmp, struct rbd_device, node);
de71a297 2834 if (rbd_dev->dev_id == dev_id) {
e124a82f 2835 spin_unlock(&rbd_dev_list_lock);
602adf40 2836 return rbd_dev;
e124a82f 2837 }
602adf40 2838 }
e124a82f 2839 spin_unlock(&rbd_dev_list_lock);
602adf40
YS
2840 return NULL;
2841}
2842
dfc5606d 2843static void rbd_dev_release(struct device *dev)
602adf40 2844{
593a9e7b 2845 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
602adf40 2846
1dbb4399
AE
2847 if (rbd_dev->watch_request) {
2848 struct ceph_client *client = rbd_dev->rbd_client->client;
2849
2850 ceph_osdc_unregister_linger_request(&client->osdc,
59c2be1e 2851 rbd_dev->watch_request);
1dbb4399 2852 }
59c2be1e 2853 if (rbd_dev->watch_event)
070c633f 2854 rbd_req_sync_unwatch(rbd_dev);
59c2be1e 2855
602adf40
YS
2856 rbd_put_client(rbd_dev);
2857
2858 /* clean up and free blkdev */
2859 rbd_free_disk(rbd_dev);
2860 unregister_blkdev(rbd_dev->major, rbd_dev->name);
32eec68d 2861
2ac4e75d
AE
2862 /* release allocated disk header fields */
2863 rbd_header_free(&rbd_dev->header);
2864
32eec68d 2865 /* done with the id, and with the rbd_dev */
f84344f3 2866 kfree(rbd_dev->mapping.snap_name);
589d30e0 2867 kfree(rbd_dev->image_id);
0bed54dc 2868 kfree(rbd_dev->header_name);
d22f76e7 2869 kfree(rbd_dev->pool_name);
0bed54dc 2870 kfree(rbd_dev->image_name);
e2839308 2871 rbd_dev_id_put(rbd_dev);
602adf40
YS
2872 kfree(rbd_dev);
2873
2874 /* release module ref */
2875 module_put(THIS_MODULE);
602adf40
YS
2876}
2877
dfc5606d
YS
2878static ssize_t rbd_remove(struct bus_type *bus,
2879 const char *buf,
2880 size_t count)
602adf40
YS
2881{
2882 struct rbd_device *rbd_dev = NULL;
2883 int target_id, rc;
2884 unsigned long ul;
2885 int ret = count;
2886
2887 rc = strict_strtoul(buf, 10, &ul);
2888 if (rc)
2889 return rc;
2890
2891 /* convert to int; abort if we lost anything in the conversion */
2892 target_id = (int) ul;
2893 if (target_id != ul)
2894 return -EINVAL;
2895
2896 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2897
2898 rbd_dev = __rbd_get_dev(target_id);
2899 if (!rbd_dev) {
2900 ret = -ENOENT;
2901 goto done;
2902 }
2903
dfc5606d
YS
2904 __rbd_remove_all_snaps(rbd_dev);
2905 rbd_bus_del_dev(rbd_dev);
602adf40
YS
2906
2907done:
2908 mutex_unlock(&ctl_mutex);
aafb230e 2909
602adf40
YS
2910 return ret;
2911}
2912
602adf40
YS
2913/*
2914 * create control files in sysfs
dfc5606d 2915 * /sys/bus/rbd/...
602adf40
YS
2916 */
2917static int rbd_sysfs_init(void)
2918{
dfc5606d 2919 int ret;
602adf40 2920
fed4c143 2921 ret = device_register(&rbd_root_dev);
21079786 2922 if (ret < 0)
dfc5606d 2923 return ret;
602adf40 2924
fed4c143
AE
2925 ret = bus_register(&rbd_bus_type);
2926 if (ret < 0)
2927 device_unregister(&rbd_root_dev);
602adf40 2928
602adf40
YS
2929 return ret;
2930}
2931
2932static void rbd_sysfs_cleanup(void)
2933{
dfc5606d 2934 bus_unregister(&rbd_bus_type);
fed4c143 2935 device_unregister(&rbd_root_dev);
602adf40
YS
2936}
2937
2938int __init rbd_init(void)
2939{
2940 int rc;
2941
2942 rc = rbd_sysfs_init();
2943 if (rc)
2944 return rc;
f0f8cef5 2945 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
602adf40
YS
2946 return 0;
2947}
2948
2949void __exit rbd_exit(void)
2950{
2951 rbd_sysfs_cleanup();
2952}
2953
2954module_init(rbd_init);
2955module_exit(rbd_exit);
2956
2957MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
2958MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
2959MODULE_DESCRIPTION("rados block device");
2960
2961/* following authorship retained from original osdblk.c */
2962MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
2963
2964MODULE_LICENSE("GPL");
This page took 0.280129 seconds and 5 git commands to generate.