libceph: Unlock unprocessed pages in start_read() error path
[deliverable/linux.git] / drivers / block / rbd.c
CommitLineData
602adf40
YS
1/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
dfc5606d 24 For usage instructions, please refer to:
602adf40 25
dfc5606d 26 Documentation/ABI/testing/sysfs-bus-rbd
602adf40
YS
27
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
59c2be1e 34#include <linux/parser.h>
602adf40
YS
35
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
aafb230e
AE
44#define RBD_DEBUG /* Activate rbd_assert() calls */
45
593a9e7b
AE
46/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
df111be6
AE
55/* It might be useful to have this defined elsewhere too */
56
57#define U64_MAX ((u64) (~0ULL))
58
f0f8cef5
AE
59#define RBD_DRV_NAME "rbd"
60#define RBD_DRV_NAME_LONG "rbd (rados block device)"
602adf40
YS
61
62#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
63
d4b125e9
AE
64#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
65#define RBD_MAX_SNAP_NAME_LEN \
66 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
67
35d489f9 68#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
602adf40
YS
69#define RBD_MAX_OPT_LEN 1024
70
71#define RBD_SNAP_HEAD_NAME "-"
72
9e15b77d
AE
73/* This allows a single page to hold an image name sent by OSD */
74#define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1)
1e130199 75#define RBD_IMAGE_ID_LEN_MAX 64
9e15b77d 76
1e130199 77#define RBD_OBJ_PREFIX_LEN_MAX 64
589d30e0 78
d889140c
AE
79/* Feature bits */
80
81#define RBD_FEATURE_LAYERING 1
82
83/* Features supported by this (client software) implementation. */
84
85#define RBD_FEATURES_ALL (0)
86
81a89793
AE
87/*
88 * An RBD device name will be "rbd#", where the "rbd" comes from
89 * RBD_DRV_NAME above, and # is a unique integer identifier.
90 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
91 * enough to hold all possible device names.
92 */
602adf40 93#define DEV_NAME_LEN 32
81a89793 94#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
602adf40 95
cc0538b6 96#define RBD_READ_ONLY_DEFAULT false
59c2be1e 97
602adf40
YS
98/*
99 * block device image metadata (in-memory version)
100 */
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These four fields never change for a given rbd image */
	char *object_prefix;	/* prefix for data object names */
	u64 features;		/* RBD_FEATURE_* bits (0 for v1 images) */
	__u8 obj_order;		/* object size is 1 << obj_order bytes */
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;		/* mapped size in bytes */
	struct ceph_snap_context *snapc;
	char *snap_names;	/* packed, NUL-separated snapshot names */
	u64 *snap_sizes;	/* per-snapshot image size, same order as snapc */

	u64 obj_version;	/* version of the header object seen last */
};
117
0d7dbfce
AE
118/*
119 * An rbd image specification.
120 *
121 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
122 * identify an image.
123 */
/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  The remaining fields are human-readable names
 * resolved from those ids.  Reference counted via kref.
 */
struct rbd_spec {
	u64 pool_id;
	char *pool_name;

	char *image_id;
	size_t image_id_len;
	char *image_name;
	size_t image_name_len;

	u64 snap_id;
	char *snap_name;

	struct kref kref;
};
138
59c2be1e 139struct rbd_options {
cc0538b6 140 bool read_only;
602adf40
YS
141};
142
143/*
f0f8cef5 144 * an instance of the client. multiple devices may share an rbd client.
602adf40
YS
145 */
146struct rbd_client {
147 struct ceph_client *client;
148 struct kref kref;
149 struct list_head node;
150};
151
152/*
f0f8cef5 153 * a request completion status
602adf40 154 */
1fec7093
YS
155struct rbd_req_status {
156 int done;
157 int rc;
158 u64 bytes;
159};
160
161/*
162 * a collection of requests
163 */
/*
 * a collection of requests
 */
struct rbd_req_coll {
	int total;			/* number of sub-requests expected */
	int num_done;			/* sub-requests completed so far */
	struct kref kref;		/* one ref per outstanding sub-request */
	/* pre-C99 flexible array idiom; one status slot per sub-request */
	struct rbd_req_status status[0];
};
170
f0f8cef5
AE
171/*
172 * a single io request
173 */
/*
 * a single io request
 */
struct rbd_request {
	struct request *rq;		/* blk layer request */
	struct bio *bio;		/* cloned bio */
	struct page **pages;		/* list of used pages */
	u64 len;
	int coll_index;			/* slot in coll->status[] */
	struct rbd_req_coll *coll;	/* parent collection, may be NULL */
};
182
dfc5606d
YS
183struct rbd_snap {
184 struct device dev;
185 const char *name;
3591538f 186 u64 size;
dfc5606d
YS
187 struct list_head node;
188 u64 id;
34b13184 189 u64 features;
dfc5606d
YS
190};
191
f84344f3 192struct rbd_mapping {
99c1f08f 193 u64 size;
34b13184 194 u64 features;
f84344f3
AE
195 bool read_only;
196};
197
602adf40
YS
198/*
199 * a single device
200 */
/*
 * a single device
 */
struct rbd_device {
	int dev_id;		/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;	/* blkdev's gendisk and rq */

	u32 image_format;	/* Either 1 or 2 */
	struct rbd_client *rbd_client;

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;	/* queue lock */

	struct rbd_image_header header;
	bool exists;		/* mapped snapshot/head has been validated */
	struct rbd_spec *spec;	/* which image/snapshot is mapped */

	char *header_name;	/* name of the image's header object */

	struct ceph_osd_event *watch_event;
	struct ceph_osd_request *watch_request;

	struct rbd_spec *parent_spec;	/* layering: parent image, or NULL */
	u64 parent_overlap;		/* bytes overlapping with parent */

	/* protects updating the header */
	struct rw_semaphore header_rwsem;

	struct rbd_mapping mapping;	/* size/features/ro of what's mapped */

	struct list_head node;	/* entry on global rbd_dev_list */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
};
239
602adf40 240static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
e124a82f 241
602adf40 242static LIST_HEAD(rbd_dev_list); /* devices */
e124a82f
AE
243static DEFINE_SPINLOCK(rbd_dev_list_lock);
244
432b8587
AE
245static LIST_HEAD(rbd_client_list); /* clients */
246static DEFINE_SPINLOCK(rbd_client_list_lock);
602adf40 247
304f6808
AE
248static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
249static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
250
dfc5606d 251static void rbd_dev_release(struct device *dev);
41f38c2b 252static void rbd_remove_snap_dev(struct rbd_snap *snap);
dfc5606d 253
f0f8cef5
AE
254static ssize_t rbd_add(struct bus_type *bus, const char *buf,
255 size_t count);
256static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
257 size_t count);
258
259static struct bus_attribute rbd_bus_attrs[] = {
260 __ATTR(add, S_IWUSR, NULL, rbd_add),
261 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
262 __ATTR_NULL
263};
264
265static struct bus_type rbd_bus_type = {
266 .name = "rbd",
267 .bus_attrs = rbd_bus_attrs,
268};
269
/*
 * Release callback for the static rbd_root_dev below.  The root device
 * is never freed, so there is intentionally nothing to do here; the
 * driver core just requires a non-NULL ->release.
 */
static void rbd_root_dev_release(struct device *dev)
{
}
273
274static struct device rbd_root_dev = {
275 .init_name = "rbd",
276 .release = rbd_root_dev_release,
277};
278
aafb230e
AE
279#ifdef RBD_DEBUG
280#define rbd_assert(expr) \
281 if (unlikely(!(expr))) { \
282 printk(KERN_ERR "\nAssertion failure in %s() " \
283 "at line %d:\n\n" \
284 "\trbd_assert(%s);\n\n", \
285 __func__, __LINE__, #expr); \
286 BUG(); \
287 }
288#else /* !RBD_DEBUG */
289# define rbd_assert(expr) ((void) 0)
290#endif /* !RBD_DEBUG */
dfc5606d 291
dfc5606d
YS
/* Take a reference on the rbd device's embedded struct device. */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}
296
/* Drop a reference taken with rbd_get_dev(). */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}
602adf40 301
117973fb
AE
302static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
303static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
59c2be1e 304
602adf40
YS
/*
 * Block device open callback.
 *
 * Rejects writable opens of a read-only mapping with -EROFS, pins the
 * rbd device for the lifetime of the open, and propagates the mapping's
 * read-only state to the block layer.
 */
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	rbd_get_dev(rbd_dev);
	set_device_ro(bdev, rbd_dev->mapping.read_only);

	return 0;
}
317
dfc5606d
YS
/*
 * Block device release callback; drops the reference taken in
 * rbd_open().
 */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;

	rbd_put_dev(rbd_dev);

	return 0;
}
326
602adf40
YS
327static const struct block_device_operations rbd_bd_ops = {
328 .owner = THIS_MODULE,
329 .open = rbd_open,
dfc5606d 330 .release = rbd_release,
602adf40
YS
331};
332
333/*
334 * Initialize an rbd client instance.
43ae4701 335 * We own *ceph_opts.
602adf40 336 */
f8c38929 337static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
602adf40
YS
338{
339 struct rbd_client *rbdc;
340 int ret = -ENOMEM;
341
342 dout("rbd_client_create\n");
343 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
344 if (!rbdc)
345 goto out_opt;
346
347 kref_init(&rbdc->kref);
348 INIT_LIST_HEAD(&rbdc->node);
349
bc534d86
AE
350 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
351
43ae4701 352 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
602adf40 353 if (IS_ERR(rbdc->client))
bc534d86 354 goto out_mutex;
43ae4701 355 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
602adf40
YS
356
357 ret = ceph_open_session(rbdc->client);
358 if (ret < 0)
359 goto out_err;
360
432b8587 361 spin_lock(&rbd_client_list_lock);
602adf40 362 list_add_tail(&rbdc->node, &rbd_client_list);
432b8587 363 spin_unlock(&rbd_client_list_lock);
602adf40 364
bc534d86
AE
365 mutex_unlock(&ctl_mutex);
366
602adf40
YS
367 dout("rbd_client_create created %p\n", rbdc);
368 return rbdc;
369
370out_err:
371 ceph_destroy_client(rbdc->client);
bc534d86
AE
372out_mutex:
373 mutex_unlock(&ctl_mutex);
602adf40
YS
374 kfree(rbdc);
375out_opt:
43ae4701
AE
376 if (ceph_opts)
377 ceph_destroy_options(ceph_opts);
28f259b7 378 return ERR_PTR(ret);
602adf40
YS
379}
380
/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.  Returns NULL when no match is
 * found or when the options forbid sharing a client.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	/* CEPH_OPT_NOSHARE forces a dedicated client per mapping */
	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		/* ceph_compare_options() returns 0 on a match */
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			kref_get(&client_node->kref);
			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}
405
59c2be1e
YS
406/*
407 * mount options
408 */
409enum {
59c2be1e
YS
410 Opt_last_int,
411 /* int args above */
412 Opt_last_string,
413 /* string args above */
cc0538b6
AE
414 Opt_read_only,
415 Opt_read_write,
416 /* Boolean args above */
417 Opt_last_bool,
59c2be1e
YS
418};
419
43ae4701 420static match_table_t rbd_opts_tokens = {
59c2be1e
YS
421 /* int args above */
422 /* string args above */
be466c1c 423 {Opt_read_only, "read_only"},
cc0538b6
AE
424 {Opt_read_only, "ro"}, /* Alternate spelling */
425 {Opt_read_write, "read_write"},
426 {Opt_read_write, "rw"}, /* Alternate spelling */
427 /* Boolean args above */
59c2be1e
YS
428 {-1, NULL}
429};
430
/*
 * Parse a single mount-option token (callback for ceph_parse_options()).
 *
 * @c:       the option text, e.g. "read_only"
 * @private: the struct rbd_options being filled in
 *
 * Returns 0 on success, -EINVAL for an unknown token, or the error
 * from match_int() for a malformed integer argument.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* Tokens are grouped by argument type via the Opt_last_* sentinels */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* match_token() succeeded, so this cannot happen */
		rbd_assert(false);
		break;
	}
	return 0;
}
471
602adf40
YS
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.
 *
 * Consumes ceph_opts either way: an existing client means the options
 * are redundant and destroyed here; otherwise rbd_client_create()
 * takes ownership.  Returns the client or an ERR_PTR.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc)	/* using an existing client */
		ceph_destroy_options(ceph_opts);
	else
		rbdc = rbd_client_create(ceph_opts);

	return rbdc;
}
488
/*
 * Destroy ceph client
 *
 * Takes rbd_client_list_lock itself to unlink the client from the
 * list; the caller must NOT already hold it.
 */
/* kref release callback: unlink and free the client. */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
506
507/*
508 * Drop reference to ceph client node. If it's not referenced anymore, release
509 * it.
510 */
9d3997fd 511static void rbd_put_client(struct rbd_client *rbdc)
602adf40 512{
c53d5893
AE
513 if (rbdc)
514 kref_put(&rbdc->kref, rbd_client_release);
602adf40
YS
515}
516
1fec7093
YS
517/*
518 * Destroy requests collection
519 */
/*
 * Destroy requests collection (kref release callback for
 * struct rbd_req_coll).
 */
static void rbd_coll_release(struct kref *kref)
{
	struct rbd_req_coll *coll =
		container_of(kref, struct rbd_req_coll, kref);

	dout("rbd_coll_release %p\n", coll);
	kfree(coll);
}
602adf40 528
a30b71b9
AE
529static bool rbd_image_format_valid(u32 image_format)
530{
531 return image_format == 1 || image_format == 2;
532}
533
8e94af8e
AE
/*
 * Sanity-check a v1 on-disk image header before translating it.
 * Returns false if any field would lead to invalid I/O sizes or
 * size_t overflow when the in-memory header is built.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire the snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
572
602adf40
YS
573/*
574 * Create a new header structure, translate header format from the on-disk
575 * header.
576 */
577static int rbd_header_from_disk(struct rbd_image_header *header,
4156d998 578 struct rbd_image_header_ondisk *ondisk)
602adf40 579{
ccece235 580 u32 snap_count;
58c17b0e 581 size_t len;
d2bb24e5 582 size_t size;
621901d6 583 u32 i;
602adf40 584
6a52325f
AE
585 memset(header, 0, sizeof (*header));
586
103a150f
AE
587 snap_count = le32_to_cpu(ondisk->snap_count);
588
58c17b0e
AE
589 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
590 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
6a52325f 591 if (!header->object_prefix)
602adf40 592 return -ENOMEM;
58c17b0e
AE
593 memcpy(header->object_prefix, ondisk->object_prefix, len);
594 header->object_prefix[len] = '\0';
00f1f36f 595
602adf40 596 if (snap_count) {
f785cc1d
AE
597 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
598
621901d6
AE
599 /* Save a copy of the snapshot names */
600
f785cc1d
AE
601 if (snap_names_len > (u64) SIZE_MAX)
602 return -EIO;
603 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
602adf40 604 if (!header->snap_names)
6a52325f 605 goto out_err;
f785cc1d
AE
606 /*
607 * Note that rbd_dev_v1_header_read() guarantees
608 * the ondisk buffer we're working with has
609 * snap_names_len bytes beyond the end of the
610 * snapshot id array, this memcpy() is safe.
611 */
612 memcpy(header->snap_names, &ondisk->snaps[snap_count],
613 snap_names_len);
6a52325f 614
621901d6
AE
615 /* Record each snapshot's size */
616
d2bb24e5
AE
617 size = snap_count * sizeof (*header->snap_sizes);
618 header->snap_sizes = kmalloc(size, GFP_KERNEL);
602adf40 619 if (!header->snap_sizes)
6a52325f 620 goto out_err;
621901d6
AE
621 for (i = 0; i < snap_count; i++)
622 header->snap_sizes[i] =
623 le64_to_cpu(ondisk->snaps[i].image_size);
602adf40 624 } else {
ccece235 625 WARN_ON(ondisk->snap_names_len);
602adf40
YS
626 header->snap_names = NULL;
627 header->snap_sizes = NULL;
628 }
849b4260 629
34b13184 630 header->features = 0; /* No features support in v1 images */
602adf40
YS
631 header->obj_order = ondisk->options.order;
632 header->crypt_type = ondisk->options.crypt_type;
633 header->comp_type = ondisk->options.comp_type;
6a52325f 634
621901d6
AE
635 /* Allocate and fill in the snapshot context */
636
f84344f3 637 header->image_size = le64_to_cpu(ondisk->image_size);
6a52325f
AE
638 size = sizeof (struct ceph_snap_context);
639 size += snap_count * sizeof (header->snapc->snaps[0]);
640 header->snapc = kzalloc(size, GFP_KERNEL);
641 if (!header->snapc)
642 goto out_err;
602adf40
YS
643
644 atomic_set(&header->snapc->nref, 1);
505cbb9b 645 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
602adf40 646 header->snapc->num_snaps = snap_count;
621901d6
AE
647 for (i = 0; i < snap_count; i++)
648 header->snapc->snaps[i] =
649 le64_to_cpu(ondisk->snaps[i].id);
602adf40
YS
650
651 return 0;
652
6a52325f 653out_err:
849b4260 654 kfree(header->snap_sizes);
ccece235 655 header->snap_sizes = NULL;
602adf40 656 kfree(header->snap_names);
ccece235 657 header->snap_names = NULL;
6a52325f
AE
658 kfree(header->object_prefix);
659 header->object_prefix = NULL;
ccece235 660
00f1f36f 661 return -ENOMEM;
602adf40
YS
662}
663
9e15b77d
AE
/*
 * Map a snapshot id to its name.  CEPH_NOSNAP maps to the "head"
 * (non-snapshot) name; an unknown id yields NULL.
 */
static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
{
	struct rbd_snap *snap;

	if (snap_id == CEPH_NOSNAP)
		return RBD_SNAP_HEAD_NAME;

	list_for_each_entry(snap, &rbd_dev->snaps, node)
		if (snap_id == snap->id)
			return snap->name;

	return NULL;
}
677
/*
 * Look up a snapshot by name and, if found, record its id, size and
 * features in the device's spec/mapping.  Returns 0 on success or
 * -ENOENT if no snapshot has that name.
 */
static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
{

	struct rbd_snap *snap;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		if (!strcmp(snap_name, snap->name)) {
			rbd_dev->spec->snap_id = snap->id;
			rbd_dev->mapping.size = snap->size;
			rbd_dev->mapping.features = snap->features;

			return 0;
		}
	}

	return -ENOENT;
}
695
/*
 * Set up the device's mapping (size/features/read-only) for either the
 * image head or a named snapshot.  Snapshot mappings are forced
 * read-only.  Marks the device as existing on success.
 */
static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
{
	int ret;

	if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		/* Mapping the head: use the current image values */
		rbd_dev->spec->snap_id = CEPH_NOSNAP;
		rbd_dev->mapping.size = rbd_dev->header.image_size;
		rbd_dev->mapping.features = rbd_dev->header.features;
		ret = 0;
	} else {
		ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
		if (ret < 0)
			goto done;
		rbd_dev->mapping.read_only = true;
	}
	rbd_dev->exists = true;
done:
	return ret;
}
716
/*
 * Free everything owned by an in-memory image header and NULL the
 * pointers so a double free is harmless.
 */
static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	header->object_prefix = NULL;
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	ceph_put_snap_context(header->snapc);
	header->snapc = NULL;
}
728
65ccfe21 729static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
602adf40 730{
65ccfe21
AE
731 char *name;
732 u64 segment;
733 int ret;
602adf40 734
65ccfe21
AE
735 name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
736 if (!name)
737 return NULL;
738 segment = offset >> rbd_dev->header.obj_order;
739 ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
740 rbd_dev->header.object_prefix, segment);
741 if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
742 pr_err("error formatting segment name for #%llu (%d)\n",
743 segment, ret);
744 kfree(name);
745 name = NULL;
746 }
602adf40 747
65ccfe21
AE
748 return name;
749}
602adf40 750
65ccfe21
AE
751static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
752{
753 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
602adf40 754
65ccfe21
AE
755 return offset & (segment_size - 1);
756}
757
/*
 * Number of bytes of [offset, offset+length) that fall within the
 * segment containing offset — i.e. the length clipped to the end of
 * that segment.
 */
static u64 rbd_segment_length(struct rbd_device *rbd_dev,
				u64 offset, u64 length)
{
	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;

	offset &= segment_size - 1;

	/* offset + length must not wrap around u64 */
	rbd_assert(length <= U64_MAX - offset);
	if (offset + length > segment_size)
		length = segment_size - offset;

	return length;
}
771
1fec7093
YS
/*
 * Number of segments spanned by the byte range [ofs, ofs+len).
 * Returns 0 for an empty range and -ERANGE if ofs+len would overflow.
 */
static int rbd_get_num_segments(struct rbd_image_header *header,
				u64 ofs, u64 len)
{
	u64 start_seg;
	u64 end_seg;

	if (!len)
		return 0;
	/* Overflow check written to avoid computing ofs + len directly */
	if (len - 1 > U64_MAX - ofs)
		return -ERANGE;

	start_seg = ofs >> header->obj_order;
	end_seg = (ofs + len - 1) >> header->obj_order;

	return end_seg - start_seg + 1;
}
788
029bcbd8
JD
789/*
790 * returns the size of an object in the image
791 */
792static u64 rbd_obj_bytes(struct rbd_image_header *header)
793{
794 return 1 << header->obj_order;
795}
796
602adf40
YS
797/*
798 * bio helpers
799 */
800
801static void bio_chain_put(struct bio *chain)
802{
803 struct bio *tmp;
804
805 while (chain) {
806 tmp = chain;
807 chain = chain->bi_next;
808 bio_put(tmp);
809 }
810}
811
812/*
813 * zeros a bio chain, starting at specific offset
814 */
/*
 * zeros a bio chain, starting at specific offset
 *
 * Data before start_ofs (counted across the whole chain) is left
 * intact; every byte from start_ofs onward is cleared.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* running byte position within the chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* Zero only the tail of a straddling segment */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
838
839/*
f7760dad
AE
840 * Clone a portion of a bio, starting at the given byte offset
841 * and continuing for the number of bytes indicated.
602adf40 842 */
f7760dad
AE
/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.
 *
 * Returns the new bio or NULL on bad arguments or allocation failure.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;	/* byte offset into the first copied bvec */
	unsigned short end_idx;
	unsigned short vcnt;	/* number of bvecs in the clone */
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	resid = offset;
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	voff = resid;

	/* ...and the last affected segment */

	resid += len;
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	/* resid now holds the used length of the final bvec */
	vcnt = end_idx - idx + 1;

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
		vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		bio->bi_io_vec[0].bv_len -= voff;
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		/* Single bvec: its length is exactly the requested span */
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}
919
920/*
921 * Clone a portion of a bio chain, starting at the given byte offset
922 * into the first bio in the source chain and continuing for the
923 * number of bytes indicated. The result is another bio chain of
924 * exactly the given length, or a null pointer on error.
925 *
926 * The bio_src and offset parameters are both in-out. On entry they
927 * refer to the first source bio and the offset into that bio where
928 * the start of data to be cloned is located.
929 *
930 * On return, bio_src is updated to refer to the bio in the source
931 * chain that contains first un-cloned byte, and *offset will
932 * contain the offset of that byte within that bio.
933 */
/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					unsigned int *offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bi = *bio_src;
	unsigned int off = *offset;
	struct bio *chain = NULL;
	struct bio **end;	/* where to link the next clone */

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi)
			goto out_err;	/* EINVAL; ran out of bio's */
		/* Clone at most up to the end of the current source bio */
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		*end = bio;
		end = &bio->bi_next;

		off += bi_size;
		if (off == bi->bi_size) {
			/* Consumed this source bio; move to the next one */
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	/* Unwind any clones already made */
	bio_chain_put(chain);

	return NULL;
}
980
981/*
982 * helpers for osd request op vectors.
983 */
57cfc106
AE
/*
 * helpers for osd request op vectors.
 *
 * Allocate a zeroed vector of num_ops + 1 ops (the extra entry acts
 * as a terminator) and initialize the first op's code and payload
 * length.  Returns NULL on allocation failure; free with
 * rbd_destroy_ops().
 */
static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
					int opcode, u32 payload_len)
{
	struct ceph_osd_req_op *ops;

	ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
	if (!ops)
		return NULL;

	ops[0].op = opcode;

	/*
	 * op extent offset and length will be set later on
	 * in calc_raw_layout()
	 */
	ops[0].payload_len = payload_len;

	return ops;
}
1003
/* Free an op vector from rbd_create_rw_ops(); NULL is a no-op. */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
1008
1fec7093
YS
/*
 * Record completion of one sub-request in a collection and complete,
 * in order, any prefix of the block request that is now fully done.
 *
 * With no collection the whole request is ended at once.  Completion
 * must happen in index order, so a sub-request finishing early just
 * marks its slot; the contiguous done-prefix is flushed here.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, ret, (unsigned long long) len);

	if (!rq)
		return;

	if (!coll) {
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	/* queue_lock protects the collection's status array */
	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	max = min = coll->num_done;
	while (max < coll->total && coll->status[max].done)
		max++;

	for (i = min; i < max; i++) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		/* drop the reference held by this completed sub-request */
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
1046
/* Convenience wrapper: complete the sub-request described by req. */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
1052
602adf40
YS
1053/*
1054 * Send ceph osd request
1055 */
/*
 * Send ceph osd request
 *
 * Builds and submits an OSD request for object_name covering
 * [ofs, ofs+len).  Data is carried by either bio or pages.  If rbd_cb
 * is NULL the call is synchronous (waits for the reply and puts the
 * request); otherwise rbd_cb runs on completion.  If linger_req is
 * non-NULL the request is registered to linger (used for watches) and
 * returned through it.  On submission failure the sub-request is
 * completed with the error and resources are released here.
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *ops,
			  struct rbd_req_coll *coll,
			  int coll_index,
			  void (*rbd_cb)(struct ceph_osd_request *req,
					 struct ceph_msg *msg),
			  struct ceph_osd_request **linger_req,
			  u64 *ver)
{
	struct ceph_osd_request *req;
	struct ceph_file_layout *layout;
	int ret;
	u64 bno;
	struct timespec mtime = CURRENT_TIME;
	struct rbd_request *req_data;
	struct ceph_osd_request_head *reqhead;
	struct ceph_osd_client *osdc;

	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
	if (!req_data) {
		/* Still must report completion for this collection slot */
		if (coll)
			rbd_coll_end_req_index(rq, coll, coll_index,
					       -ENOMEM, len);
		return -ENOMEM;
	}

	if (coll) {
		req_data->coll = coll;
		req_data->coll_index = coll_index;
	}

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
	     object_name, (unsigned long long) ofs,
	     (unsigned long long) len, coll, coll_index);

	osdc = &rbd_dev->rbd_client->client->osdc;
	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
				      false, GFP_NOIO, pages, bio);
	if (!req) {
		ret = -ENOMEM;
		goto done_pages;
	}

	req->r_callback = rbd_cb;

	req_data->rq = rq;
	req_data->bio = bio;
	req_data->pages = pages;
	req_data->len = len;

	req->r_priv = req_data;

	reqhead = req->r_request->front.iov_base;
	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);

	/*
	 * NOTE(review): strncpy() does not guarantee NUL termination if
	 * object_name fills r_oid exactly — presumably names are always
	 * shorter than sizeof(r_oid); verify against callers.
	 */
	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
	req->r_oid_len = strlen(req->r_oid);

	layout = &req->r_file_layout;
	memset(layout, 0, sizeof(*layout));
	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_stripe_count = cpu_to_le32(1);
	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id);
	ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
				   req, ops);
	rbd_assert(ret == 0);

	ceph_osdc_build_request(req, ofs, &len,
				ops,
				snapc,
				&mtime,
				req->r_oid, req->r_oid_len);

	if (linger_req) {
		ceph_osdc_set_request_linger(osdc, req);
		*linger_req = req;
	}

	ret = ceph_osdc_start_request(osdc, req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* Synchronous: wait for the reply ourselves */
		ret = ceph_osdc_wait_request(osdc, req);
		if (ver)
			*ver = le64_to_cpu(req->r_reassert_version.version);
		dout("reassert_ver=%llu\n",
		     (unsigned long long)
			le64_to_cpu(req->r_reassert_version.version));
		ceph_osdc_put_request(req);
	}
	return ret;

done_err:
	bio_chain_put(req_data->bio);
	ceph_osdc_put_request(req);
done_pages:
	rbd_coll_end_req(req_data, ret, len);
	kfree(req_data);
	return ret;
}
1166
1167/*
1168 * Ceph osd op callback
1169 */
1170static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1171{
1172 struct rbd_request *req_data = req->r_priv;
1173 struct ceph_osd_reply_head *replyhead;
1174 struct ceph_osd_op *op;
1175 __s32 rc;
1176 u64 bytes;
1177 int read_op;
1178
1179 /* parse reply */
1180 replyhead = msg->front.iov_base;
1181 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
1182 op = (void *)(replyhead + 1);
1183 rc = le32_to_cpu(replyhead->result);
1184 bytes = le64_to_cpu(op->extent.length);
895cfcc8 1185 read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
602adf40 1186
bd919d45
AE
1187 dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
1188 (unsigned long long) bytes, read_op, (int) rc);
602adf40
YS
1189
1190 if (rc == -ENOENT && read_op) {
1191 zero_bio_chain(req_data->bio, 0);
1192 rc = 0;
1193 } else if (rc == 0 && read_op && bytes < req_data->len) {
1194 zero_bio_chain(req_data->bio, bytes);
1195 bytes = req_data->len;
1196 }
1197
1fec7093 1198 rbd_coll_end_req(req_data, rc, bytes);
602adf40
YS
1199
1200 if (req_data->bio)
1201 bio_chain_put(req_data->bio);
1202
1203 ceph_osdc_put_request(req);
1204 kfree(req_data);
1205}
1206
/* Minimal completion callback: just drop the request reference. */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1211
602adf40
YS
1212/*
1213 * Do a synchronous ceph osd operation
1214 */
0ce1a794 1215static int rbd_req_sync_op(struct rbd_device *rbd_dev,
602adf40
YS
1216 struct ceph_snap_context *snapc,
1217 u64 snapid,
602adf40 1218 int flags,
913d2fdc 1219 struct ceph_osd_req_op *ops,
aded07ea 1220 const char *object_name,
f8d4de6e
AE
1221 u64 ofs, u64 inbound_size,
1222 char *inbound,
59c2be1e
YS
1223 struct ceph_osd_request **linger_req,
1224 u64 *ver)
602adf40
YS
1225{
1226 int ret;
1227 struct page **pages;
1228 int num_pages;
913d2fdc 1229
aafb230e 1230 rbd_assert(ops != NULL);
602adf40 1231
f8d4de6e 1232 num_pages = calc_pages_for(ofs, inbound_size);
602adf40 1233 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
b8d0638a
DC
1234 if (IS_ERR(pages))
1235 return PTR_ERR(pages);
602adf40 1236
0ce1a794 1237 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
f8d4de6e 1238 object_name, ofs, inbound_size, NULL,
602adf40
YS
1239 pages, num_pages,
1240 flags,
1241 ops,
1fec7093 1242 NULL, 0,
59c2be1e
YS
1243 NULL,
1244 linger_req, ver);
602adf40 1245 if (ret < 0)
913d2fdc 1246 goto done;
602adf40 1247
f8d4de6e
AE
1248 if ((flags & CEPH_OSD_FLAG_READ) && inbound)
1249 ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);
602adf40 1250
602adf40
YS
1251done:
1252 ceph_release_page_vector(pages, num_pages);
1253 return ret;
1254}
1255
1256/*
1257 * Do an asynchronous ceph osd operation
1258 */
1259static int rbd_do_op(struct request *rq,
0ce1a794 1260 struct rbd_device *rbd_dev,
602adf40 1261 struct ceph_snap_context *snapc,
602adf40 1262 u64 ofs, u64 len,
1fec7093
YS
1263 struct bio *bio,
1264 struct rbd_req_coll *coll,
1265 int coll_index)
602adf40
YS
1266{
1267 char *seg_name;
1268 u64 seg_ofs;
1269 u64 seg_len;
1270 int ret;
1271 struct ceph_osd_req_op *ops;
1272 u32 payload_len;
ff2e4bb5
AE
1273 int opcode;
1274 int flags;
4634246d 1275 u64 snapid;
602adf40 1276
65ccfe21 1277 seg_name = rbd_segment_name(rbd_dev, ofs);
602adf40
YS
1278 if (!seg_name)
1279 return -ENOMEM;
65ccfe21
AE
1280 seg_len = rbd_segment_length(rbd_dev, ofs, len);
1281 seg_ofs = rbd_segment_offset(rbd_dev, ofs);
602adf40 1282
ff2e4bb5
AE
1283 if (rq_data_dir(rq) == WRITE) {
1284 opcode = CEPH_OSD_OP_WRITE;
1285 flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
4634246d 1286 snapid = CEPH_NOSNAP;
ff2e4bb5
AE
1287 payload_len = seg_len;
1288 } else {
1289 opcode = CEPH_OSD_OP_READ;
1290 flags = CEPH_OSD_FLAG_READ;
4634246d 1291 snapc = NULL;
0d7dbfce 1292 snapid = rbd_dev->spec->snap_id;
ff2e4bb5
AE
1293 payload_len = 0;
1294 }
602adf40 1295
57cfc106
AE
1296 ret = -ENOMEM;
1297 ops = rbd_create_rw_ops(1, opcode, payload_len);
1298 if (!ops)
602adf40
YS
1299 goto done;
1300
1301 /* we've taken care of segment sizes earlier when we
1302 cloned the bios. We should never have a segment
1303 truncated at this point */
aafb230e 1304 rbd_assert(seg_len == len);
602adf40
YS
1305
1306 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1307 seg_name, seg_ofs, seg_len,
1308 bio,
1309 NULL, 0,
1310 flags,
1311 ops,
1fec7093 1312 coll, coll_index,
59c2be1e 1313 rbd_req_cb, 0, NULL);
11f77002
SW
1314
1315 rbd_destroy_ops(ops);
602adf40
YS
1316done:
1317 kfree(seg_name);
1318 return ret;
1319}
1320
602adf40
YS
1321/*
1322 * Request sync osd read
1323 */
0ce1a794 1324static int rbd_req_sync_read(struct rbd_device *rbd_dev,
602adf40 1325 u64 snapid,
aded07ea 1326 const char *object_name,
602adf40 1327 u64 ofs, u64 len,
59c2be1e
YS
1328 char *buf,
1329 u64 *ver)
602adf40 1330{
913d2fdc
AE
1331 struct ceph_osd_req_op *ops;
1332 int ret;
1333
1334 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1335 if (!ops)
1336 return -ENOMEM;
1337
1338 ret = rbd_req_sync_op(rbd_dev, NULL,
b06e6a6b 1339 snapid,
602adf40 1340 CEPH_OSD_FLAG_READ,
913d2fdc
AE
1341 ops, object_name, ofs, len, buf, NULL, ver);
1342 rbd_destroy_ops(ops);
1343
1344 return ret;
602adf40
YS
1345}
1346
1347/*
59c2be1e
YS
1348 * Request sync osd watch
1349 */
0ce1a794 1350static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
59c2be1e 1351 u64 ver,
7f0a24d8 1352 u64 notify_id)
59c2be1e
YS
1353{
1354 struct ceph_osd_req_op *ops;
11f77002
SW
1355 int ret;
1356
57cfc106
AE
1357 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
1358 if (!ops)
1359 return -ENOMEM;
59c2be1e 1360
a71b891b 1361 ops[0].watch.ver = cpu_to_le64(ver);
59c2be1e
YS
1362 ops[0].watch.cookie = notify_id;
1363 ops[0].watch.flag = 0;
1364
0ce1a794 1365 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
7f0a24d8 1366 rbd_dev->header_name, 0, 0, NULL,
ad4f232f 1367 NULL, 0,
59c2be1e
YS
1368 CEPH_OSD_FLAG_READ,
1369 ops,
1fec7093 1370 NULL, 0,
59c2be1e
YS
1371 rbd_simple_req_cb, 0, NULL);
1372
1373 rbd_destroy_ops(ops);
1374 return ret;
1375}
1376
1377static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1378{
0ce1a794 1379 struct rbd_device *rbd_dev = (struct rbd_device *)data;
a71b891b 1380 u64 hver;
13143d2d
SW
1381 int rc;
1382
0ce1a794 1383 if (!rbd_dev)
59c2be1e
YS
1384 return;
1385
bd919d45
AE
1386 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1387 rbd_dev->header_name, (unsigned long long) notify_id,
1388 (unsigned int) opcode);
117973fb 1389 rc = rbd_dev_refresh(rbd_dev, &hver);
13143d2d 1390 if (rc)
f0f8cef5 1391 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
0ce1a794 1392 " update snaps: %d\n", rbd_dev->major, rc);
59c2be1e 1393
7f0a24d8 1394 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
59c2be1e
YS
1395}
1396
1397/*
1398 * Request sync osd watch
1399 */
0e6f322d 1400static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
59c2be1e
YS
1401{
1402 struct ceph_osd_req_op *ops;
0ce1a794 1403 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
57cfc106 1404 int ret;
59c2be1e 1405
57cfc106
AE
1406 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1407 if (!ops)
1408 return -ENOMEM;
59c2be1e
YS
1409
1410 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
0ce1a794 1411 (void *)rbd_dev, &rbd_dev->watch_event);
59c2be1e
YS
1412 if (ret < 0)
1413 goto fail;
1414
0e6f322d 1415 ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
0ce1a794 1416 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
59c2be1e
YS
1417 ops[0].watch.flag = 1;
1418
0ce1a794 1419 ret = rbd_req_sync_op(rbd_dev, NULL,
59c2be1e 1420 CEPH_NOSNAP,
59c2be1e
YS
1421 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1422 ops,
0e6f322d
AE
1423 rbd_dev->header_name,
1424 0, 0, NULL,
0ce1a794 1425 &rbd_dev->watch_request, NULL);
59c2be1e
YS
1426
1427 if (ret < 0)
1428 goto fail_event;
1429
1430 rbd_destroy_ops(ops);
1431 return 0;
1432
1433fail_event:
0ce1a794
AE
1434 ceph_osdc_cancel_event(rbd_dev->watch_event);
1435 rbd_dev->watch_event = NULL;
59c2be1e
YS
1436fail:
1437 rbd_destroy_ops(ops);
1438 return ret;
1439}
1440
79e3057c
YS
1441/*
1442 * Request sync osd unwatch
1443 */
070c633f 1444static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
79e3057c
YS
1445{
1446 struct ceph_osd_req_op *ops;
57cfc106 1447 int ret;
79e3057c 1448
57cfc106
AE
1449 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1450 if (!ops)
1451 return -ENOMEM;
79e3057c
YS
1452
1453 ops[0].watch.ver = 0;
0ce1a794 1454 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
79e3057c
YS
1455 ops[0].watch.flag = 0;
1456
0ce1a794 1457 ret = rbd_req_sync_op(rbd_dev, NULL,
79e3057c 1458 CEPH_NOSNAP,
79e3057c
YS
1459 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1460 ops,
070c633f
AE
1461 rbd_dev->header_name,
1462 0, 0, NULL, NULL, NULL);
1463
79e3057c
YS
1464
1465 rbd_destroy_ops(ops);
0ce1a794
AE
1466 ceph_osdc_cancel_event(rbd_dev->watch_event);
1467 rbd_dev->watch_event = NULL;
79e3057c
YS
1468 return ret;
1469}
1470
602adf40 1471/*
3cb4a687 1472 * Synchronous osd object method call
602adf40 1473 */
0ce1a794 1474static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
aded07ea
AE
1475 const char *object_name,
1476 const char *class_name,
1477 const char *method_name,
3cb4a687
AE
1478 const char *outbound,
1479 size_t outbound_size,
f8d4de6e
AE
1480 char *inbound,
1481 size_t inbound_size,
3cb4a687 1482 int flags,
59c2be1e 1483 u64 *ver)
602adf40
YS
1484{
1485 struct ceph_osd_req_op *ops;
aded07ea
AE
1486 int class_name_len = strlen(class_name);
1487 int method_name_len = strlen(method_name);
3cb4a687 1488 int payload_size;
57cfc106
AE
1489 int ret;
1490
3cb4a687
AE
1491 /*
1492 * Any input parameters required by the method we're calling
1493 * will be sent along with the class and method names as
1494 * part of the message payload. That data and its size are
1495 * supplied via the indata and indata_len fields (named from
1496 * the perspective of the server side) in the OSD request
1497 * operation.
1498 */
1499 payload_size = class_name_len + method_name_len + outbound_size;
1500 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
57cfc106
AE
1501 if (!ops)
1502 return -ENOMEM;
602adf40 1503
aded07ea
AE
1504 ops[0].cls.class_name = class_name;
1505 ops[0].cls.class_len = (__u8) class_name_len;
1506 ops[0].cls.method_name = method_name;
1507 ops[0].cls.method_len = (__u8) method_name_len;
602adf40 1508 ops[0].cls.argc = 0;
3cb4a687
AE
1509 ops[0].cls.indata = outbound;
1510 ops[0].cls.indata_len = outbound_size;
602adf40 1511
0ce1a794 1512 ret = rbd_req_sync_op(rbd_dev, NULL,
602adf40 1513 CEPH_NOSNAP,
3cb4a687 1514 flags, ops,
f8d4de6e
AE
1515 object_name, 0, inbound_size, inbound,
1516 NULL, ver);
602adf40
YS
1517
1518 rbd_destroy_ops(ops);
1519
1520 dout("cls_exec returned %d\n", ret);
1521 return ret;
1522}
1523
1fec7093
YS
1524static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1525{
1526 struct rbd_req_coll *coll =
1527 kzalloc(sizeof(struct rbd_req_coll) +
1528 sizeof(struct rbd_req_status) * num_reqs,
1529 GFP_ATOMIC);
1530
1531 if (!coll)
1532 return NULL;
1533 coll->total = num_reqs;
1534 kref_init(&coll->kref);
1535 return coll;
1536}
1537
602adf40
YS
1538/*
1539 * block device queue callback
1540 */
1541static void rbd_rq_fn(struct request_queue *q)
1542{
1543 struct rbd_device *rbd_dev = q->queuedata;
1544 struct request *rq;
602adf40 1545
00f1f36f 1546 while ((rq = blk_fetch_request(q))) {
602adf40 1547 struct bio *bio;
602adf40 1548 bool do_write;
bd919d45 1549 unsigned int size;
602adf40 1550 u64 ofs;
1fec7093
YS
1551 int num_segs, cur_seg = 0;
1552 struct rbd_req_coll *coll;
d1d25646 1553 struct ceph_snap_context *snapc;
f7760dad 1554 unsigned int bio_offset;
602adf40 1555
602adf40
YS
1556 dout("fetched request\n");
1557
1558 /* filter out block requests we don't understand */
1559 if ((rq->cmd_type != REQ_TYPE_FS)) {
1560 __blk_end_request_all(rq, 0);
00f1f36f 1561 continue;
602adf40
YS
1562 }
1563
1564 /* deduce our operation (read, write) */
1565 do_write = (rq_data_dir(rq) == WRITE);
f84344f3 1566 if (do_write && rbd_dev->mapping.read_only) {
602adf40 1567 __blk_end_request_all(rq, -EROFS);
00f1f36f 1568 continue;
602adf40
YS
1569 }
1570
1571 spin_unlock_irq(q->queue_lock);
1572
d1d25646 1573 down_read(&rbd_dev->header_rwsem);
e88a36ec 1574
daba5fdb 1575 if (!rbd_dev->exists) {
0d7dbfce 1576 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
e88a36ec 1577 up_read(&rbd_dev->header_rwsem);
d1d25646
JD
1578 dout("request for non-existent snapshot");
1579 spin_lock_irq(q->queue_lock);
1580 __blk_end_request_all(rq, -ENXIO);
1581 continue;
e88a36ec
JD
1582 }
1583
d1d25646
JD
1584 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1585
1586 up_read(&rbd_dev->header_rwsem);
1587
f7760dad
AE
1588 size = blk_rq_bytes(rq);
1589 ofs = blk_rq_pos(rq) * SECTOR_SIZE;
1590 bio = rq->bio;
1591
602adf40
YS
1592 dout("%s 0x%x bytes at 0x%llx\n",
1593 do_write ? "write" : "read",
bd919d45 1594 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
602adf40 1595
1fec7093 1596 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
df111be6
AE
1597 if (num_segs <= 0) {
1598 spin_lock_irq(q->queue_lock);
1599 __blk_end_request_all(rq, num_segs);
1600 ceph_put_snap_context(snapc);
1601 continue;
1602 }
1fec7093
YS
1603 coll = rbd_alloc_coll(num_segs);
1604 if (!coll) {
1605 spin_lock_irq(q->queue_lock);
1606 __blk_end_request_all(rq, -ENOMEM);
d1d25646 1607 ceph_put_snap_context(snapc);
00f1f36f 1608 continue;
1fec7093
YS
1609 }
1610
f7760dad 1611 bio_offset = 0;
602adf40 1612 do {
f7760dad
AE
1613 u64 limit = rbd_segment_length(rbd_dev, ofs, size);
1614 unsigned int chain_size;
1615 struct bio *bio_chain;
1616
1617 BUG_ON(limit > (u64) UINT_MAX);
1618 chain_size = (unsigned int) limit;
bd919d45 1619 dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
f7760dad 1620
1fec7093 1621 kref_get(&coll->kref);
f7760dad
AE
1622
1623 /* Pass a cloned bio chain via an osd request */
1624
1625 bio_chain = bio_chain_clone_range(&bio,
1626 &bio_offset, chain_size,
1627 GFP_ATOMIC);
1628 if (bio_chain)
4634246d 1629 (void) rbd_do_op(rq, rbd_dev, snapc,
f7760dad
AE
1630 ofs, chain_size,
1631 bio_chain, coll, cur_seg);
4634246d 1632 else
1fec7093 1633 rbd_coll_end_req_index(rq, coll, cur_seg,
f7760dad
AE
1634 -ENOMEM, chain_size);
1635 size -= chain_size;
1636 ofs += chain_size;
602adf40 1637
1fec7093 1638 cur_seg++;
602adf40 1639 } while (size > 0);
1fec7093 1640 kref_put(&coll->kref, rbd_coll_release);
602adf40 1641
602adf40 1642 spin_lock_irq(q->queue_lock);
d1d25646
JD
1643
1644 ceph_put_snap_context(snapc);
602adf40
YS
1645 }
1646}
1647
1648/*
1649 * a queue callback. Makes sure that we don't create a bio that spans across
1650 * multiple osd objects. One exception would be with a single page bios,
f7760dad 1651 * which we handle later at bio_chain_clone_range()
602adf40
YS
1652 */
1653static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1654 struct bio_vec *bvec)
1655{
1656 struct rbd_device *rbd_dev = q->queuedata;
e5cfeed2
AE
1657 sector_t sector_offset;
1658 sector_t sectors_per_obj;
1659 sector_t obj_sector_offset;
1660 int ret;
1661
1662 /*
1663 * Find how far into its rbd object the partition-relative
1664 * bio start sector is to offset relative to the enclosing
1665 * device.
1666 */
1667 sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
1668 sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1669 obj_sector_offset = sector_offset & (sectors_per_obj - 1);
1670
1671 /*
1672 * Compute the number of bytes from that offset to the end
1673 * of the object. Account for what's already used by the bio.
1674 */
1675 ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
1676 if (ret > bmd->bi_size)
1677 ret -= bmd->bi_size;
1678 else
1679 ret = 0;
1680
1681 /*
1682 * Don't send back more than was asked for. And if the bio
1683 * was empty, let the whole thing through because: "Note
1684 * that a block device *must* allow a single page to be
1685 * added to an empty bio."
1686 */
1687 rbd_assert(bvec->bv_len <= PAGE_SIZE);
1688 if (ret > (int) bvec->bv_len || !bmd->bi_size)
1689 ret = (int) bvec->bv_len;
1690
1691 return ret;
602adf40
YS
1692}
1693
1694static void rbd_free_disk(struct rbd_device *rbd_dev)
1695{
1696 struct gendisk *disk = rbd_dev->disk;
1697
1698 if (!disk)
1699 return;
1700
602adf40
YS
1701 if (disk->flags & GENHD_FL_UP)
1702 del_gendisk(disk);
1703 if (disk->queue)
1704 blk_cleanup_queue(disk->queue);
1705 put_disk(disk);
1706}
1707
1708/*
4156d998
AE
1709 * Read the complete header for the given rbd device.
1710 *
1711 * Returns a pointer to a dynamically-allocated buffer containing
1712 * the complete and validated header. Caller can pass the address
1713 * of a variable that will be filled in with the version of the
1714 * header object at the time it was read.
1715 *
1716 * Returns a pointer-coded errno if a failure occurs.
602adf40 1717 */
4156d998
AE
1718static struct rbd_image_header_ondisk *
1719rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
602adf40 1720{
4156d998 1721 struct rbd_image_header_ondisk *ondisk = NULL;
50f7c4c9 1722 u32 snap_count = 0;
4156d998
AE
1723 u64 names_size = 0;
1724 u32 want_count;
1725 int ret;
602adf40 1726
00f1f36f 1727 /*
4156d998
AE
1728 * The complete header will include an array of its 64-bit
1729 * snapshot ids, followed by the names of those snapshots as
1730 * a contiguous block of NUL-terminated strings. Note that
1731 * the number of snapshots could change by the time we read
1732 * it in, in which case we re-read it.
00f1f36f 1733 */
4156d998
AE
1734 do {
1735 size_t size;
1736
1737 kfree(ondisk);
1738
1739 size = sizeof (*ondisk);
1740 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1741 size += names_size;
1742 ondisk = kmalloc(size, GFP_KERNEL);
1743 if (!ondisk)
1744 return ERR_PTR(-ENOMEM);
1745
1746 ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
0bed54dc 1747 rbd_dev->header_name,
4156d998
AE
1748 0, size,
1749 (char *) ondisk, version);
1750
1751 if (ret < 0)
1752 goto out_err;
1753 if (WARN_ON((size_t) ret < size)) {
1754 ret = -ENXIO;
1755 pr_warning("short header read for image %s"
1756 " (want %zd got %d)\n",
0d7dbfce 1757 rbd_dev->spec->image_name, size, ret);
4156d998
AE
1758 goto out_err;
1759 }
1760 if (!rbd_dev_ondisk_valid(ondisk)) {
1761 ret = -ENXIO;
1762 pr_warning("invalid header for image %s\n",
0d7dbfce 1763 rbd_dev->spec->image_name);
4156d998 1764 goto out_err;
81e759fb 1765 }
602adf40 1766
4156d998
AE
1767 names_size = le64_to_cpu(ondisk->snap_names_len);
1768 want_count = snap_count;
1769 snap_count = le32_to_cpu(ondisk->snap_count);
1770 } while (snap_count != want_count);
00f1f36f 1771
4156d998 1772 return ondisk;
00f1f36f 1773
4156d998
AE
1774out_err:
1775 kfree(ondisk);
1776
1777 return ERR_PTR(ret);
1778}
1779
1780/*
1781 * reload the ondisk the header
1782 */
1783static int rbd_read_header(struct rbd_device *rbd_dev,
1784 struct rbd_image_header *header)
1785{
1786 struct rbd_image_header_ondisk *ondisk;
1787 u64 ver = 0;
1788 int ret;
602adf40 1789
4156d998
AE
1790 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1791 if (IS_ERR(ondisk))
1792 return PTR_ERR(ondisk);
1793 ret = rbd_header_from_disk(header, ondisk);
1794 if (ret >= 0)
1795 header->obj_version = ver;
1796 kfree(ondisk);
1797
1798 return ret;
602adf40
YS
1799}
1800
41f38c2b 1801static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
dfc5606d
YS
1802{
1803 struct rbd_snap *snap;
a0593290 1804 struct rbd_snap *next;
dfc5606d 1805
a0593290 1806 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
41f38c2b 1807 rbd_remove_snap_dev(snap);
dfc5606d
YS
1808}
1809
9478554a
AE
1810static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
1811{
1812 sector_t size;
1813
0d7dbfce 1814 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
9478554a
AE
1815 return;
1816
1817 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
1818 dout("setting size to %llu sectors", (unsigned long long) size);
1819 rbd_dev->mapping.size = (u64) size;
1820 set_capacity(rbd_dev->disk, size);
1821}
1822
602adf40
YS
1823/*
1824 * only read the first part of the ondisk header, without the snaps info
1825 */
117973fb 1826static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
602adf40
YS
1827{
1828 int ret;
1829 struct rbd_image_header h;
602adf40
YS
1830
1831 ret = rbd_read_header(rbd_dev, &h);
1832 if (ret < 0)
1833 return ret;
1834
a51aa0c0
JD
1835 down_write(&rbd_dev->header_rwsem);
1836
9478554a
AE
1837 /* Update image size, and check for resize of mapped image */
1838 rbd_dev->header.image_size = h.image_size;
1839 rbd_update_mapping_size(rbd_dev);
9db4b3e3 1840
849b4260 1841 /* rbd_dev->header.object_prefix shouldn't change */
602adf40 1842 kfree(rbd_dev->header.snap_sizes);
849b4260 1843 kfree(rbd_dev->header.snap_names);
d1d25646
JD
1844 /* osd requests may still refer to snapc */
1845 ceph_put_snap_context(rbd_dev->header.snapc);
602adf40 1846
b813623a
AE
1847 if (hver)
1848 *hver = h.obj_version;
a71b891b 1849 rbd_dev->header.obj_version = h.obj_version;
93a24e08 1850 rbd_dev->header.image_size = h.image_size;
602adf40
YS
1851 rbd_dev->header.snapc = h.snapc;
1852 rbd_dev->header.snap_names = h.snap_names;
1853 rbd_dev->header.snap_sizes = h.snap_sizes;
849b4260
AE
1854 /* Free the extra copy of the object prefix */
1855 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1856 kfree(h.object_prefix);
1857
304f6808
AE
1858 ret = rbd_dev_snaps_update(rbd_dev);
1859 if (!ret)
1860 ret = rbd_dev_snaps_register(rbd_dev);
dfc5606d 1861
c666601a 1862 up_write(&rbd_dev->header_rwsem);
602adf40 1863
dfc5606d 1864 return ret;
602adf40
YS
1865}
1866
117973fb 1867static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
1fe5e993
AE
1868{
1869 int ret;
1870
117973fb 1871 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1fe5e993 1872 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
117973fb
AE
1873 if (rbd_dev->image_format == 1)
1874 ret = rbd_dev_v1_refresh(rbd_dev, hver);
1875 else
1876 ret = rbd_dev_v2_refresh(rbd_dev, hver);
1fe5e993
AE
1877 mutex_unlock(&ctl_mutex);
1878
1879 return ret;
1880}
1881
602adf40
YS
1882static int rbd_init_disk(struct rbd_device *rbd_dev)
1883{
1884 struct gendisk *disk;
1885 struct request_queue *q;
593a9e7b 1886 u64 segment_size;
602adf40 1887
602adf40 1888 /* create gendisk info */
602adf40
YS
1889 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1890 if (!disk)
1fcdb8aa 1891 return -ENOMEM;
602adf40 1892
f0f8cef5 1893 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
de71a297 1894 rbd_dev->dev_id);
602adf40
YS
1895 disk->major = rbd_dev->major;
1896 disk->first_minor = 0;
1897 disk->fops = &rbd_bd_ops;
1898 disk->private_data = rbd_dev;
1899
1900 /* init rq */
602adf40
YS
1901 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1902 if (!q)
1903 goto out_disk;
029bcbd8 1904
593a9e7b
AE
1905 /* We use the default size, but let's be explicit about it. */
1906 blk_queue_physical_block_size(q, SECTOR_SIZE);
1907
029bcbd8 1908 /* set io sizes to object size */
593a9e7b
AE
1909 segment_size = rbd_obj_bytes(&rbd_dev->header);
1910 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1911 blk_queue_max_segment_size(q, segment_size);
1912 blk_queue_io_min(q, segment_size);
1913 blk_queue_io_opt(q, segment_size);
029bcbd8 1914
602adf40
YS
1915 blk_queue_merge_bvec(q, rbd_merge_bvec);
1916 disk->queue = q;
1917
1918 q->queuedata = rbd_dev;
1919
1920 rbd_dev->disk = disk;
602adf40 1921
12f02944
AE
1922 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
1923
602adf40 1924 return 0;
602adf40
YS
1925out_disk:
1926 put_disk(disk);
1fcdb8aa
AE
1927
1928 return -ENOMEM;
602adf40
YS
1929}
1930
dfc5606d
YS
1931/*
1932 sysfs
1933*/
1934
593a9e7b
AE
1935static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1936{
1937 return container_of(dev, struct rbd_device, dev);
1938}
1939
dfc5606d
YS
1940static ssize_t rbd_size_show(struct device *dev,
1941 struct device_attribute *attr, char *buf)
1942{
593a9e7b 1943 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
a51aa0c0
JD
1944 sector_t size;
1945
1946 down_read(&rbd_dev->header_rwsem);
1947 size = get_capacity(rbd_dev->disk);
1948 up_read(&rbd_dev->header_rwsem);
dfc5606d 1949
a51aa0c0 1950 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
dfc5606d
YS
1951}
1952
34b13184
AE
1953/*
1954 * Note this shows the features for whatever's mapped, which is not
1955 * necessarily the base image.
1956 */
1957static ssize_t rbd_features_show(struct device *dev,
1958 struct device_attribute *attr, char *buf)
1959{
1960 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1961
1962 return sprintf(buf, "0x%016llx\n",
1963 (unsigned long long) rbd_dev->mapping.features);
1964}
1965
dfc5606d
YS
1966static ssize_t rbd_major_show(struct device *dev,
1967 struct device_attribute *attr, char *buf)
1968{
593a9e7b 1969 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
602adf40 1970
dfc5606d
YS
1971 return sprintf(buf, "%d\n", rbd_dev->major);
1972}
1973
1974static ssize_t rbd_client_id_show(struct device *dev,
1975 struct device_attribute *attr, char *buf)
602adf40 1976{
593a9e7b 1977 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 1978
1dbb4399
AE
1979 return sprintf(buf, "client%lld\n",
1980 ceph_client_id(rbd_dev->rbd_client->client));
602adf40
YS
1981}
1982
dfc5606d
YS
1983static ssize_t rbd_pool_show(struct device *dev,
1984 struct device_attribute *attr, char *buf)
602adf40 1985{
593a9e7b 1986 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 1987
0d7dbfce 1988 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
dfc5606d
YS
1989}
1990
9bb2f334
AE
1991static ssize_t rbd_pool_id_show(struct device *dev,
1992 struct device_attribute *attr, char *buf)
1993{
1994 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1995
0d7dbfce
AE
1996 return sprintf(buf, "%llu\n",
1997 (unsigned long long) rbd_dev->spec->pool_id);
9bb2f334
AE
1998}
1999
dfc5606d
YS
2000static ssize_t rbd_name_show(struct device *dev,
2001 struct device_attribute *attr, char *buf)
2002{
593a9e7b 2003 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2004
a92ffdf8
AE
2005 if (rbd_dev->spec->image_name)
2006 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2007
2008 return sprintf(buf, "(unknown)\n");
dfc5606d
YS
2009}
2010
589d30e0
AE
2011static ssize_t rbd_image_id_show(struct device *dev,
2012 struct device_attribute *attr, char *buf)
2013{
2014 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2015
0d7dbfce 2016 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
589d30e0
AE
2017}
2018
34b13184
AE
2019/*
2020 * Shows the name of the currently-mapped snapshot (or
2021 * RBD_SNAP_HEAD_NAME for the base image).
2022 */
dfc5606d
YS
2023static ssize_t rbd_snap_show(struct device *dev,
2024 struct device_attribute *attr,
2025 char *buf)
2026{
593a9e7b 2027 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2028
0d7dbfce 2029 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
dfc5606d
YS
2030}
2031
86b00e0d
AE
2032/*
2033 * For an rbd v2 image, shows the pool id, image id, and snapshot id
2034 * for the parent image. If there is no parent, simply shows
2035 * "(no parent image)".
2036 */
2037static ssize_t rbd_parent_show(struct device *dev,
2038 struct device_attribute *attr,
2039 char *buf)
2040{
2041 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2042 struct rbd_spec *spec = rbd_dev->parent_spec;
2043 int count;
2044 char *bufp = buf;
2045
2046 if (!spec)
2047 return sprintf(buf, "(no parent image)\n");
2048
2049 count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
2050 (unsigned long long) spec->pool_id, spec->pool_name);
2051 if (count < 0)
2052 return count;
2053 bufp += count;
2054
2055 count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
2056 spec->image_name ? spec->image_name : "(unknown)");
2057 if (count < 0)
2058 return count;
2059 bufp += count;
2060
2061 count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
2062 (unsigned long long) spec->snap_id, spec->snap_name);
2063 if (count < 0)
2064 return count;
2065 bufp += count;
2066
2067 count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
2068 if (count < 0)
2069 return count;
2070 bufp += count;
2071
2072 return (ssize_t) (bufp - buf);
2073}
2074
dfc5606d
YS
2075static ssize_t rbd_image_refresh(struct device *dev,
2076 struct device_attribute *attr,
2077 const char *buf,
2078 size_t size)
2079{
593a9e7b 2080 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
b813623a 2081 int ret;
602adf40 2082
117973fb 2083 ret = rbd_dev_refresh(rbd_dev, NULL);
b813623a
AE
2084
2085 return ret < 0 ? ret : size;
dfc5606d 2086}
602adf40 2087
dfc5606d 2088static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
34b13184 2089static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
dfc5606d
YS
2090static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
2091static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
2092static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
9bb2f334 2093static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
dfc5606d 2094static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
589d30e0 2095static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
dfc5606d
YS
2096static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
2097static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
86b00e0d 2098static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
dfc5606d
YS
2099
2100static struct attribute *rbd_attrs[] = {
2101 &dev_attr_size.attr,
34b13184 2102 &dev_attr_features.attr,
dfc5606d
YS
2103 &dev_attr_major.attr,
2104 &dev_attr_client_id.attr,
2105 &dev_attr_pool.attr,
9bb2f334 2106 &dev_attr_pool_id.attr,
dfc5606d 2107 &dev_attr_name.attr,
589d30e0 2108 &dev_attr_image_id.attr,
dfc5606d 2109 &dev_attr_current_snap.attr,
86b00e0d 2110 &dev_attr_parent.attr,
dfc5606d 2111 &dev_attr_refresh.attr,
dfc5606d
YS
2112 NULL
2113};
2114
2115static struct attribute_group rbd_attr_group = {
2116 .attrs = rbd_attrs,
2117};
2118
2119static const struct attribute_group *rbd_attr_groups[] = {
2120 &rbd_attr_group,
2121 NULL
2122};
2123
2124static void rbd_sysfs_dev_release(struct device *dev)
2125{
2126}
2127
2128static struct device_type rbd_device_type = {
2129 .name = "rbd",
2130 .groups = rbd_attr_groups,
2131 .release = rbd_sysfs_dev_release,
2132};
2133
2134
2135/*
2136 sysfs - snapshots
2137*/
2138
2139static ssize_t rbd_snap_size_show(struct device *dev,
2140 struct device_attribute *attr,
2141 char *buf)
2142{
2143 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2144
3591538f 2145 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
dfc5606d
YS
2146}
2147
2148static ssize_t rbd_snap_id_show(struct device *dev,
2149 struct device_attribute *attr,
2150 char *buf)
2151{
2152 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2153
3591538f 2154 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
dfc5606d
YS
2155}
2156
34b13184
AE
2157static ssize_t rbd_snap_features_show(struct device *dev,
2158 struct device_attribute *attr,
2159 char *buf)
2160{
2161 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2162
2163 return sprintf(buf, "0x%016llx\n",
2164 (unsigned long long) snap->features);
2165}
2166
dfc5606d
YS
2167static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2168static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
34b13184 2169static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
dfc5606d
YS
2170
2171static struct attribute *rbd_snap_attrs[] = {
2172 &dev_attr_snap_size.attr,
2173 &dev_attr_snap_id.attr,
34b13184 2174 &dev_attr_snap_features.attr,
dfc5606d
YS
2175 NULL,
2176};
2177
2178static struct attribute_group rbd_snap_attr_group = {
2179 .attrs = rbd_snap_attrs,
2180};
2181
2182static void rbd_snap_dev_release(struct device *dev)
2183{
2184 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2185 kfree(snap->name);
2186 kfree(snap);
2187}
2188
2189static const struct attribute_group *rbd_snap_attr_groups[] = {
2190 &rbd_snap_attr_group,
2191 NULL
2192};
2193
2194static struct device_type rbd_snap_device_type = {
2195 .groups = rbd_snap_attr_groups,
2196 .release = rbd_snap_dev_release,
2197};
2198
8b8fb99c
AE
2199static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
2200{
2201 kref_get(&spec->kref);
2202
2203 return spec;
2204}
2205
2206static void rbd_spec_free(struct kref *kref);
2207static void rbd_spec_put(struct rbd_spec *spec)
2208{
2209 if (spec)
2210 kref_put(&spec->kref, rbd_spec_free);
2211}
2212
2213static struct rbd_spec *rbd_spec_alloc(void)
2214{
2215 struct rbd_spec *spec;
2216
2217 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
2218 if (!spec)
2219 return NULL;
2220 kref_init(&spec->kref);
2221
2222 rbd_spec_put(rbd_spec_get(spec)); /* TEMPORARY */
2223
2224 return spec;
2225}
2226
2227static void rbd_spec_free(struct kref *kref)
2228{
2229 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
2230
2231 kfree(spec->pool_name);
2232 kfree(spec->image_id);
2233 kfree(spec->image_name);
2234 kfree(spec->snap_name);
2235 kfree(spec);
2236}
2237
c53d5893
AE
2238struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2239 struct rbd_spec *spec)
2240{
2241 struct rbd_device *rbd_dev;
2242
2243 rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2244 if (!rbd_dev)
2245 return NULL;
2246
2247 spin_lock_init(&rbd_dev->lock);
2248 INIT_LIST_HEAD(&rbd_dev->node);
2249 INIT_LIST_HEAD(&rbd_dev->snaps);
2250 init_rwsem(&rbd_dev->header_rwsem);
2251
2252 rbd_dev->spec = spec;
2253 rbd_dev->rbd_client = rbdc;
2254
2255 return rbd_dev;
2256}
2257
2258static void rbd_dev_destroy(struct rbd_device *rbd_dev)
2259{
86b00e0d 2260 rbd_spec_put(rbd_dev->parent_spec);
c53d5893
AE
2261 kfree(rbd_dev->header_name);
2262 rbd_put_client(rbd_dev->rbd_client);
2263 rbd_spec_put(rbd_dev->spec);
2264 kfree(rbd_dev);
2265}
2266
304f6808
AE
2267static bool rbd_snap_registered(struct rbd_snap *snap)
2268{
2269 bool ret = snap->dev.type == &rbd_snap_device_type;
2270 bool reg = device_is_registered(&snap->dev);
2271
2272 rbd_assert(!ret ^ reg);
2273
2274 return ret;
2275}
2276
41f38c2b 2277static void rbd_remove_snap_dev(struct rbd_snap *snap)
dfc5606d
YS
2278{
2279 list_del(&snap->node);
304f6808
AE
2280 if (device_is_registered(&snap->dev))
2281 device_unregister(&snap->dev);
dfc5606d
YS
2282}
2283
14e7085d 2284static int rbd_register_snap_dev(struct rbd_snap *snap,
dfc5606d
YS
2285 struct device *parent)
2286{
2287 struct device *dev = &snap->dev;
2288 int ret;
2289
2290 dev->type = &rbd_snap_device_type;
2291 dev->parent = parent;
2292 dev->release = rbd_snap_dev_release;
d4b125e9 2293 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
304f6808
AE
2294 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2295
dfc5606d
YS
2296 ret = device_register(dev);
2297
2298 return ret;
2299}
2300
4e891e0a 2301static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
c8d18425 2302 const char *snap_name,
34b13184
AE
2303 u64 snap_id, u64 snap_size,
2304 u64 snap_features)
dfc5606d 2305{
4e891e0a 2306 struct rbd_snap *snap;
dfc5606d 2307 int ret;
4e891e0a
AE
2308
2309 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
dfc5606d 2310 if (!snap)
4e891e0a
AE
2311 return ERR_PTR(-ENOMEM);
2312
2313 ret = -ENOMEM;
c8d18425 2314 snap->name = kstrdup(snap_name, GFP_KERNEL);
4e891e0a
AE
2315 if (!snap->name)
2316 goto err;
2317
c8d18425
AE
2318 snap->id = snap_id;
2319 snap->size = snap_size;
34b13184 2320 snap->features = snap_features;
4e891e0a
AE
2321
2322 return snap;
2323
dfc5606d
YS
2324err:
2325 kfree(snap->name);
2326 kfree(snap);
4e891e0a
AE
2327
2328 return ERR_PTR(ret);
dfc5606d
YS
2329}
2330
cd892126
AE
2331static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2332 u64 *snap_size, u64 *snap_features)
2333{
2334 char *snap_name;
2335
2336 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2337
2338 *snap_size = rbd_dev->header.snap_sizes[which];
2339 *snap_features = 0; /* No features for v1 */
2340
2341 /* Skip over names until we find the one we are looking for */
2342
2343 snap_name = rbd_dev->header.snap_names;
2344 while (which--)
2345 snap_name += strlen(snap_name) + 1;
2346
2347 return snap_name;
2348}
2349
9d475de5
AE
2350/*
2351 * Get the size and object order for an image snapshot, or if
2352 * snap_id is CEPH_NOSNAP, gets this information for the base
2353 * image.
2354 */
2355static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
2356 u8 *order, u64 *snap_size)
2357{
2358 __le64 snapid = cpu_to_le64(snap_id);
2359 int ret;
2360 struct {
2361 u8 order;
2362 __le64 size;
2363 } __attribute__ ((packed)) size_buf = { 0 };
2364
2365 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2366 "rbd", "get_size",
2367 (char *) &snapid, sizeof (snapid),
2368 (char *) &size_buf, sizeof (size_buf),
2369 CEPH_OSD_FLAG_READ, NULL);
2370 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2371 if (ret < 0)
2372 return ret;
2373
2374 *order = size_buf.order;
2375 *snap_size = le64_to_cpu(size_buf.size);
2376
2377 dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
2378 (unsigned long long) snap_id, (unsigned int) *order,
2379 (unsigned long long) *snap_size);
2380
2381 return 0;
2382}
2383
2384static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
2385{
2386 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
2387 &rbd_dev->header.obj_order,
2388 &rbd_dev->header.image_size);
2389}
2390
1e130199
AE
2391static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
2392{
2393 void *reply_buf;
2394 int ret;
2395 void *p;
2396
2397 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
2398 if (!reply_buf)
2399 return -ENOMEM;
2400
2401 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2402 "rbd", "get_object_prefix",
2403 NULL, 0,
2404 reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
2405 CEPH_OSD_FLAG_READ, NULL);
2406 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2407 if (ret < 0)
2408 goto out;
a0ea3a40 2409 ret = 0; /* rbd_req_sync_exec() can return positive */
1e130199
AE
2410
2411 p = reply_buf;
2412 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
2413 p + RBD_OBJ_PREFIX_LEN_MAX,
2414 NULL, GFP_NOIO);
2415
2416 if (IS_ERR(rbd_dev->header.object_prefix)) {
2417 ret = PTR_ERR(rbd_dev->header.object_prefix);
2418 rbd_dev->header.object_prefix = NULL;
2419 } else {
2420 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
2421 }
2422
2423out:
2424 kfree(reply_buf);
2425
2426 return ret;
2427}
2428
b1b5402a
AE
2429static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2430 u64 *snap_features)
2431{
2432 __le64 snapid = cpu_to_le64(snap_id);
2433 struct {
2434 __le64 features;
2435 __le64 incompat;
2436 } features_buf = { 0 };
d889140c 2437 u64 incompat;
b1b5402a
AE
2438 int ret;
2439
2440 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2441 "rbd", "get_features",
2442 (char *) &snapid, sizeof (snapid),
2443 (char *) &features_buf, sizeof (features_buf),
2444 CEPH_OSD_FLAG_READ, NULL);
2445 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2446 if (ret < 0)
2447 return ret;
d889140c
AE
2448
2449 incompat = le64_to_cpu(features_buf.incompat);
2450 if (incompat & ~RBD_FEATURES_ALL)
2451 return -ENOTSUPP;
2452
b1b5402a
AE
2453 *snap_features = le64_to_cpu(features_buf.features);
2454
2455 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
2456 (unsigned long long) snap_id,
2457 (unsigned long long) *snap_features,
2458 (unsigned long long) le64_to_cpu(features_buf.incompat));
2459
2460 return 0;
2461}
2462
2463static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2464{
2465 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2466 &rbd_dev->header.features);
2467}
2468
86b00e0d
AE
2469static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
2470{
2471 struct rbd_spec *parent_spec;
2472 size_t size;
2473 void *reply_buf = NULL;
2474 __le64 snapid;
2475 void *p;
2476 void *end;
2477 char *image_id;
2478 u64 overlap;
2479 size_t len = 0;
2480 int ret;
2481
2482 parent_spec = rbd_spec_alloc();
2483 if (!parent_spec)
2484 return -ENOMEM;
2485
2486 size = sizeof (__le64) + /* pool_id */
2487 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
2488 sizeof (__le64) + /* snap_id */
2489 sizeof (__le64); /* overlap */
2490 reply_buf = kmalloc(size, GFP_KERNEL);
2491 if (!reply_buf) {
2492 ret = -ENOMEM;
2493 goto out_err;
2494 }
2495
2496 snapid = cpu_to_le64(CEPH_NOSNAP);
2497 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2498 "rbd", "get_parent",
2499 (char *) &snapid, sizeof (snapid),
2500 (char *) reply_buf, size,
2501 CEPH_OSD_FLAG_READ, NULL);
2502 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2503 if (ret < 0)
2504 goto out_err;
2505
2506 ret = -ERANGE;
2507 p = reply_buf;
2508 end = (char *) reply_buf + size;
2509 ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
2510 if (parent_spec->pool_id == CEPH_NOPOOL)
2511 goto out; /* No parent? No problem. */
2512
2513 image_id = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
2514 if (IS_ERR(image_id)) {
2515 ret = PTR_ERR(image_id);
2516 goto out_err;
2517 }
2518 parent_spec->image_id = image_id;
9e15b77d 2519 parent_spec->image_id_len = len;
86b00e0d
AE
2520 ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
2521 ceph_decode_64_safe(&p, end, overlap, out_err);
2522
2523 rbd_dev->parent_overlap = overlap;
2524 rbd_dev->parent_spec = parent_spec;
2525 parent_spec = NULL; /* rbd_dev now owns this */
2526out:
2527 ret = 0;
2528out_err:
2529 kfree(reply_buf);
2530 rbd_spec_put(parent_spec);
2531
2532 return ret;
2533}
2534
9e15b77d
AE
2535static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
2536{
2537 size_t image_id_size;
2538 char *image_id;
2539 void *p;
2540 void *end;
2541 size_t size;
2542 void *reply_buf = NULL;
2543 size_t len = 0;
2544 char *image_name = NULL;
2545 int ret;
2546
2547 rbd_assert(!rbd_dev->spec->image_name);
2548
2549 image_id_size = sizeof (__le32) + rbd_dev->spec->image_id_len;
2550 image_id = kmalloc(image_id_size, GFP_KERNEL);
2551 if (!image_id)
2552 return NULL;
2553
2554 p = image_id;
2555 end = (char *) image_id + image_id_size;
2556 ceph_encode_string(&p, end, rbd_dev->spec->image_id,
2557 (u32) rbd_dev->spec->image_id_len);
2558
2559 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
2560 reply_buf = kmalloc(size, GFP_KERNEL);
2561 if (!reply_buf)
2562 goto out;
2563
2564 ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY,
2565 "rbd", "dir_get_name",
2566 image_id, image_id_size,
2567 (char *) reply_buf, size,
2568 CEPH_OSD_FLAG_READ, NULL);
2569 if (ret < 0)
2570 goto out;
2571 p = reply_buf;
2572 end = (char *) reply_buf + size;
2573 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
2574 if (IS_ERR(image_name))
2575 image_name = NULL;
2576 else
2577 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
2578out:
2579 kfree(reply_buf);
2580 kfree(image_id);
2581
2582 return image_name;
2583}
2584
2585/*
2586 * When a parent image gets probed, we only have the pool, image,
2587 * and snapshot ids but not the names of any of them. This call
2588 * is made later to fill in those names. It has to be done after
2589 * rbd_dev_snaps_update() has completed because some of the
2590 * information (in particular, snapshot name) is not available
2591 * until then.
2592 */
2593static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
2594{
2595 struct ceph_osd_client *osdc;
2596 const char *name;
2597 void *reply_buf = NULL;
2598 int ret;
2599
2600 if (rbd_dev->spec->pool_name)
2601 return 0; /* Already have the names */
2602
2603 /* Look up the pool name */
2604
2605 osdc = &rbd_dev->rbd_client->client->osdc;
2606 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
2607 if (!name)
2608 return -EIO; /* pool id too large (>= 2^31) */
2609
2610 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
2611 if (!rbd_dev->spec->pool_name)
2612 return -ENOMEM;
2613
2614 /* Fetch the image name; tolerate failure here */
2615
2616 name = rbd_dev_image_name(rbd_dev);
2617 if (name) {
2618 rbd_dev->spec->image_name_len = strlen(name);
2619 rbd_dev->spec->image_name = (char *) name;
2620 } else {
2621 pr_warning(RBD_DRV_NAME "%d "
2622 "unable to get image name for image id %s\n",
2623 rbd_dev->major, rbd_dev->spec->image_id);
2624 }
2625
2626 /* Look up the snapshot name. */
2627
2628 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
2629 if (!name) {
2630 ret = -EIO;
2631 goto out_err;
2632 }
2633 rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
2634 if(!rbd_dev->spec->snap_name)
2635 goto out_err;
2636
2637 return 0;
2638out_err:
2639 kfree(reply_buf);
2640 kfree(rbd_dev->spec->pool_name);
2641 rbd_dev->spec->pool_name = NULL;
2642
2643 return ret;
2644}
2645
6e14b1a6 2646static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
35d489f9
AE
2647{
2648 size_t size;
2649 int ret;
2650 void *reply_buf;
2651 void *p;
2652 void *end;
2653 u64 seq;
2654 u32 snap_count;
2655 struct ceph_snap_context *snapc;
2656 u32 i;
2657
2658 /*
2659 * We'll need room for the seq value (maximum snapshot id),
2660 * snapshot count, and array of that many snapshot ids.
2661 * For now we have a fixed upper limit on the number we're
2662 * prepared to receive.
2663 */
2664 size = sizeof (__le64) + sizeof (__le32) +
2665 RBD_MAX_SNAP_COUNT * sizeof (__le64);
2666 reply_buf = kzalloc(size, GFP_KERNEL);
2667 if (!reply_buf)
2668 return -ENOMEM;
2669
2670 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2671 "rbd", "get_snapcontext",
2672 NULL, 0,
2673 reply_buf, size,
6e14b1a6 2674 CEPH_OSD_FLAG_READ, ver);
35d489f9
AE
2675 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2676 if (ret < 0)
2677 goto out;
2678
2679 ret = -ERANGE;
2680 p = reply_buf;
2681 end = (char *) reply_buf + size;
2682 ceph_decode_64_safe(&p, end, seq, out);
2683 ceph_decode_32_safe(&p, end, snap_count, out);
2684
2685 /*
2686 * Make sure the reported number of snapshot ids wouldn't go
2687 * beyond the end of our buffer. But before checking that,
2688 * make sure the computed size of the snapshot context we
2689 * allocate is representable in a size_t.
2690 */
2691 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
2692 / sizeof (u64)) {
2693 ret = -EINVAL;
2694 goto out;
2695 }
2696 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
2697 goto out;
2698
2699 size = sizeof (struct ceph_snap_context) +
2700 snap_count * sizeof (snapc->snaps[0]);
2701 snapc = kmalloc(size, GFP_KERNEL);
2702 if (!snapc) {
2703 ret = -ENOMEM;
2704 goto out;
2705 }
2706
2707 atomic_set(&snapc->nref, 1);
2708 snapc->seq = seq;
2709 snapc->num_snaps = snap_count;
2710 for (i = 0; i < snap_count; i++)
2711 snapc->snaps[i] = ceph_decode_64(&p);
2712
2713 rbd_dev->header.snapc = snapc;
2714
2715 dout(" snap context seq = %llu, snap_count = %u\n",
2716 (unsigned long long) seq, (unsigned int) snap_count);
2717
2718out:
2719 kfree(reply_buf);
2720
2721 return 0;
2722}
2723
b8b1e2db
AE
2724static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
2725{
2726 size_t size;
2727 void *reply_buf;
2728 __le64 snap_id;
2729 int ret;
2730 void *p;
2731 void *end;
b8b1e2db
AE
2732 char *snap_name;
2733
2734 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
2735 reply_buf = kmalloc(size, GFP_KERNEL);
2736 if (!reply_buf)
2737 return ERR_PTR(-ENOMEM);
2738
2739 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
2740 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2741 "rbd", "get_snapshot_name",
2742 (char *) &snap_id, sizeof (snap_id),
2743 reply_buf, size,
2744 CEPH_OSD_FLAG_READ, NULL);
2745 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2746 if (ret < 0)
2747 goto out;
2748
2749 p = reply_buf;
2750 end = (char *) reply_buf + size;
e5c35534 2751 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
b8b1e2db
AE
2752 if (IS_ERR(snap_name)) {
2753 ret = PTR_ERR(snap_name);
2754 goto out;
2755 } else {
2756 dout(" snap_id 0x%016llx snap_name = %s\n",
2757 (unsigned long long) le64_to_cpu(snap_id), snap_name);
2758 }
2759 kfree(reply_buf);
2760
2761 return snap_name;
2762out:
2763 kfree(reply_buf);
2764
2765 return ERR_PTR(ret);
2766}
2767
2768static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2769 u64 *snap_size, u64 *snap_features)
2770{
2771 __le64 snap_id;
2772 u8 order;
2773 int ret;
2774
2775 snap_id = rbd_dev->header.snapc->snaps[which];
2776 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2777 if (ret)
2778 return ERR_PTR(ret);
2779 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2780 if (ret)
2781 return ERR_PTR(ret);
2782
2783 return rbd_dev_v2_snap_name(rbd_dev, which);
2784}
2785
2786static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2787 u64 *snap_size, u64 *snap_features)
2788{
2789 if (rbd_dev->image_format == 1)
2790 return rbd_dev_v1_snap_info(rbd_dev, which,
2791 snap_size, snap_features);
2792 if (rbd_dev->image_format == 2)
2793 return rbd_dev_v2_snap_info(rbd_dev, which,
2794 snap_size, snap_features);
2795 return ERR_PTR(-EINVAL);
2796}
2797
117973fb
AE
2798static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2799{
2800 int ret;
2801 __u8 obj_order;
2802
2803 down_write(&rbd_dev->header_rwsem);
2804
2805 /* Grab old order first, to see if it changes */
2806
2807 obj_order = rbd_dev->header.obj_order,
2808 ret = rbd_dev_v2_image_size(rbd_dev);
2809 if (ret)
2810 goto out;
2811 if (rbd_dev->header.obj_order != obj_order) {
2812 ret = -EIO;
2813 goto out;
2814 }
2815 rbd_update_mapping_size(rbd_dev);
2816
2817 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2818 dout("rbd_dev_v2_snap_context returned %d\n", ret);
2819 if (ret)
2820 goto out;
2821 ret = rbd_dev_snaps_update(rbd_dev);
2822 dout("rbd_dev_snaps_update returned %d\n", ret);
2823 if (ret)
2824 goto out;
2825 ret = rbd_dev_snaps_register(rbd_dev);
2826 dout("rbd_dev_snaps_register returned %d\n", ret);
2827out:
2828 up_write(&rbd_dev->header_rwsem);
2829
2830 return ret;
2831}
2832
dfc5606d 2833/*
35938150
AE
2834 * Scan the rbd device's current snapshot list and compare it to the
2835 * newly-received snapshot context. Remove any existing snapshots
2836 * not present in the new snapshot context. Add a new snapshot for
2837 * any snaphots in the snapshot context not in the current list.
2838 * And verify there are no changes to snapshots we already know
2839 * about.
2840 *
2841 * Assumes the snapshots in the snapshot context are sorted by
2842 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2843 * are also maintained in that order.)
dfc5606d 2844 */
304f6808 2845static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
dfc5606d 2846{
35938150
AE
2847 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2848 const u32 snap_count = snapc->num_snaps;
35938150
AE
2849 struct list_head *head = &rbd_dev->snaps;
2850 struct list_head *links = head->next;
2851 u32 index = 0;
dfc5606d 2852
9fcbb800 2853 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
35938150
AE
2854 while (index < snap_count || links != head) {
2855 u64 snap_id;
2856 struct rbd_snap *snap;
cd892126
AE
2857 char *snap_name;
2858 u64 snap_size = 0;
2859 u64 snap_features = 0;
dfc5606d 2860
35938150
AE
2861 snap_id = index < snap_count ? snapc->snaps[index]
2862 : CEPH_NOSNAP;
2863 snap = links != head ? list_entry(links, struct rbd_snap, node)
2864 : NULL;
aafb230e 2865 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
dfc5606d 2866
35938150
AE
2867 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2868 struct list_head *next = links->next;
dfc5606d 2869
35938150 2870 /* Existing snapshot not in the new snap context */
dfc5606d 2871
0d7dbfce 2872 if (rbd_dev->spec->snap_id == snap->id)
daba5fdb 2873 rbd_dev->exists = false;
41f38c2b 2874 rbd_remove_snap_dev(snap);
9fcbb800 2875 dout("%ssnap id %llu has been removed\n",
0d7dbfce
AE
2876 rbd_dev->spec->snap_id == snap->id ?
2877 "mapped " : "",
9fcbb800 2878 (unsigned long long) snap->id);
35938150
AE
2879
2880 /* Done with this list entry; advance */
2881
2882 links = next;
dfc5606d
YS
2883 continue;
2884 }
35938150 2885
b8b1e2db
AE
2886 snap_name = rbd_dev_snap_info(rbd_dev, index,
2887 &snap_size, &snap_features);
cd892126
AE
2888 if (IS_ERR(snap_name))
2889 return PTR_ERR(snap_name);
2890
9fcbb800
AE
2891 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2892 (unsigned long long) snap_id);
35938150
AE
2893 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2894 struct rbd_snap *new_snap;
2895
2896 /* We haven't seen this snapshot before */
2897
c8d18425 2898 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
cd892126 2899 snap_id, snap_size, snap_features);
9fcbb800
AE
2900 if (IS_ERR(new_snap)) {
2901 int err = PTR_ERR(new_snap);
2902
2903 dout(" failed to add dev, error %d\n", err);
2904
2905 return err;
2906 }
35938150
AE
2907
2908 /* New goes before existing, or at end of list */
2909
9fcbb800 2910 dout(" added dev%s\n", snap ? "" : " at end\n");
35938150
AE
2911 if (snap)
2912 list_add_tail(&new_snap->node, &snap->node);
2913 else
523f3258 2914 list_add_tail(&new_snap->node, head);
35938150
AE
2915 } else {
2916 /* Already have this one */
2917
9fcbb800
AE
2918 dout(" already present\n");
2919
cd892126 2920 rbd_assert(snap->size == snap_size);
aafb230e 2921 rbd_assert(!strcmp(snap->name, snap_name));
cd892126 2922 rbd_assert(snap->features == snap_features);
35938150
AE
2923
2924 /* Done with this list entry; advance */
2925
2926 links = links->next;
dfc5606d 2927 }
35938150
AE
2928
2929 /* Advance to the next entry in the snapshot context */
2930
2931 index++;
dfc5606d 2932 }
9fcbb800 2933 dout("%s: done\n", __func__);
dfc5606d
YS
2934
2935 return 0;
2936}
2937
304f6808
AE
2938/*
2939 * Scan the list of snapshots and register the devices for any that
2940 * have not already been registered.
2941 */
2942static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2943{
2944 struct rbd_snap *snap;
2945 int ret = 0;
2946
2947 dout("%s called\n", __func__);
86ff77bb
AE
2948 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2949 return -EIO;
304f6808
AE
2950
2951 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2952 if (!rbd_snap_registered(snap)) {
2953 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2954 if (ret < 0)
2955 break;
2956 }
2957 }
2958 dout("%s: returning %d\n", __func__, ret);
2959
2960 return ret;
2961}
2962
dfc5606d
YS
2963static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2964{
dfc5606d 2965 struct device *dev;
cd789ab9 2966 int ret;
dfc5606d
YS
2967
2968 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
dfc5606d 2969
cd789ab9 2970 dev = &rbd_dev->dev;
dfc5606d
YS
2971 dev->bus = &rbd_bus_type;
2972 dev->type = &rbd_device_type;
2973 dev->parent = &rbd_root_dev;
2974 dev->release = rbd_dev_release;
de71a297 2975 dev_set_name(dev, "%d", rbd_dev->dev_id);
dfc5606d 2976 ret = device_register(dev);
dfc5606d 2977
dfc5606d 2978 mutex_unlock(&ctl_mutex);
cd789ab9 2979
dfc5606d 2980 return ret;
602adf40
YS
2981}
2982
dfc5606d
YS
2983static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2984{
2985 device_unregister(&rbd_dev->dev);
2986}
2987
59c2be1e
YS
2988static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2989{
2990 int ret, rc;
2991
2992 do {
0e6f322d 2993 ret = rbd_req_sync_watch(rbd_dev);
59c2be1e 2994 if (ret == -ERANGE) {
117973fb 2995 rc = rbd_dev_refresh(rbd_dev, NULL);
59c2be1e
YS
2996 if (rc < 0)
2997 return rc;
2998 }
2999 } while (ret == -ERANGE);
3000
3001 return ret;
3002}
3003
e2839308 3004static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
1ddbe94e
AE
3005
3006/*
499afd5b
AE
3007 * Get a unique rbd identifier for the given new rbd_dev, and add
3008 * the rbd_dev to the global list. The minimum rbd id is 1.
1ddbe94e 3009 */
e2839308 3010static void rbd_dev_id_get(struct rbd_device *rbd_dev)
b7f23c36 3011{
e2839308 3012 rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
499afd5b
AE
3013
3014 spin_lock(&rbd_dev_list_lock);
3015 list_add_tail(&rbd_dev->node, &rbd_dev_list);
3016 spin_unlock(&rbd_dev_list_lock);
e2839308
AE
3017 dout("rbd_dev %p given dev id %llu\n", rbd_dev,
3018 (unsigned long long) rbd_dev->dev_id);
1ddbe94e 3019}
b7f23c36 3020
1ddbe94e 3021/*
499afd5b
AE
3022 * Remove an rbd_dev from the global list, and record that its
3023 * identifier is no longer in use.
1ddbe94e 3024 */
e2839308 3025static void rbd_dev_id_put(struct rbd_device *rbd_dev)
1ddbe94e 3026{
d184f6bf 3027 struct list_head *tmp;
de71a297 3028 int rbd_id = rbd_dev->dev_id;
d184f6bf
AE
3029 int max_id;
3030
aafb230e 3031 rbd_assert(rbd_id > 0);
499afd5b 3032
e2839308
AE
3033 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
3034 (unsigned long long) rbd_dev->dev_id);
499afd5b
AE
3035 spin_lock(&rbd_dev_list_lock);
3036 list_del_init(&rbd_dev->node);
d184f6bf
AE
3037
3038 /*
3039 * If the id being "put" is not the current maximum, there
3040 * is nothing special we need to do.
3041 */
e2839308 3042 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
d184f6bf
AE
3043 spin_unlock(&rbd_dev_list_lock);
3044 return;
3045 }
3046
3047 /*
3048 * We need to update the current maximum id. Search the
3049 * list to find out what it is. We're more likely to find
3050 * the maximum at the end, so search the list backward.
3051 */
3052 max_id = 0;
3053 list_for_each_prev(tmp, &rbd_dev_list) {
3054 struct rbd_device *rbd_dev;
3055
3056 rbd_dev = list_entry(tmp, struct rbd_device, node);
b213e0b1
AE
3057 if (rbd_dev->dev_id > max_id)
3058 max_id = rbd_dev->dev_id;
d184f6bf 3059 }
499afd5b 3060 spin_unlock(&rbd_dev_list_lock);
b7f23c36 3061
1ddbe94e 3062 /*
e2839308 3063 * The max id could have been updated by rbd_dev_id_get(), in
d184f6bf
AE
3064 * which case it now accurately reflects the new maximum.
3065 * Be careful not to overwrite the maximum value in that
3066 * case.
1ddbe94e 3067 */
e2839308
AE
3068 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
3069 dout(" max dev id has been reset\n");
b7f23c36
AE
3070}
3071
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/* Characters for which isspace() is nonzero in the C/POSIX locales */
	static const char spaces[] = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* skip to start of token */

	return strcspn(*buf, spaces);	/* length of the token */
}
3090
/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit.
 *
 * The *buf pointer will be updated to point beyond the end of the
 * found token.  Note that this occurs even if the token buffer is
 * too small to hold it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
3120
ea3352f4
AE
3121/*
3122 * Finds the next token in *buf, dynamically allocates a buffer big
3123 * enough to hold a copy of it, and copies the token into the new
3124 * buffer. The copy is guaranteed to be terminated with '\0'. Note
3125 * that a duplicate buffer is created even for a zero-length token.
3126 *
3127 * Returns a pointer to the newly-allocated duplicate, or a null
3128 * pointer if memory for the duplicate was not available. If
3129 * the lenp argument is a non-null pointer, the length of the token
3130 * (not including the '\0') is returned in *lenp.
3131 *
3132 * If successful, the *buf pointer will be updated to point beyond
3133 * the end of the found token.
3134 *
3135 * Note: uses GFP_KERNEL for allocation.
3136 */
3137static inline char *dup_token(const char **buf, size_t *lenp)
3138{
3139 char *dup;
3140 size_t len;
3141
3142 len = next_token(buf);
3143 dup = kmalloc(len + 1, GFP_KERNEL);
3144 if (!dup)
3145 return NULL;
3146
3147 memcpy(dup, *buf, len);
3148 *(dup + len) = '\0';
3149 *buf += len;
3150
3151 if (lenp)
3152 *lenp = len;
3153
3154 return dup;
3155}
3156
/*
 * Parse the options provided for an "rbd add" (i.e., rbd image
 * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
 * and the data written is passed here via a NUL-terminated buffer.
 * Returns 0 if successful or an error code otherwise.
 *
 * The information extracted from these options is recorded in
 * the other parameters which return dynamically-allocated
 * structures:
 *  ceph_opts
 *      The address of a pointer that will refer to a ceph options
 *      structure.  Caller must release the returned pointer using
 *      ceph_destroy_options() when it is no longer needed.
 *  rbd_opts
 *      Address of an rbd options pointer.  Fully initialized by
 *      this function; caller must release with kfree().
 *  spec
 *      Address of an rbd image specification pointer.  Fully
 *      initialized by this function based on parsed options.
 *      Caller must release with rbd_spec_put().
 *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
 * where:
 *  <mon_addrs>
 *      A comma-separated list of one or more monitor addresses.
 *      A monitor address is an ip address, optionally followed
 *      by a port number (separated by a colon).
 *        I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *      A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *      The name of the rados pool containing the rbd image.
 *  <image_name>
 *      The name of the image in that pool to map.
 *  <snap_id>
 *      An optional snapshot id.  If provided, the mapping will
 *      present data from the image at the time that snapshot was
 *      created.  The image head is used if no snapshot id is
 *      provided.  Snapshot mappings are always read-only.
 */
859c31df 3198static int rbd_add_parse_args(const char *buf,
dc79b113 3199 struct ceph_options **ceph_opts,
859c31df
AE
3200 struct rbd_options **opts,
3201 struct rbd_spec **rbd_spec)
e28fff26 3202{
d22f76e7 3203 size_t len;
859c31df 3204 char *options;
0ddebc0c
AE
3205 const char *mon_addrs;
3206 size_t mon_addrs_size;
859c31df 3207 struct rbd_spec *spec = NULL;
4e9afeba 3208 struct rbd_options *rbd_opts = NULL;
859c31df 3209 struct ceph_options *copts;
dc79b113 3210 int ret;
e28fff26
AE
3211
3212 /* The first four tokens are required */
3213
7ef3214a
AE
3214 len = next_token(&buf);
3215 if (!len)
dc79b113 3216 return -EINVAL; /* Missing monitor address(es) */
0ddebc0c 3217 mon_addrs = buf;
f28e565a 3218 mon_addrs_size = len + 1;
7ef3214a 3219 buf += len;
a725f65e 3220
dc79b113 3221 ret = -EINVAL;
f28e565a
AE
3222 options = dup_token(&buf, NULL);
3223 if (!options)
dc79b113 3224 return -ENOMEM;
f28e565a
AE
3225 if (!*options)
3226 goto out_err; /* Missing options */
e28fff26 3227
859c31df
AE
3228 spec = rbd_spec_alloc();
3229 if (!spec)
f28e565a 3230 goto out_mem;
859c31df
AE
3231
3232 spec->pool_name = dup_token(&buf, NULL);
3233 if (!spec->pool_name)
3234 goto out_mem;
3235 if (!*spec->pool_name)
f28e565a 3236 goto out_err; /* Missing pool name */
e28fff26 3237
859c31df
AE
3238 spec->image_name = dup_token(&buf, &spec->image_name_len);
3239 if (!spec->image_name)
f28e565a 3240 goto out_mem;
859c31df 3241 if (!*spec->image_name)
f28e565a 3242 goto out_err; /* Missing image name */
d4b125e9 3243
f28e565a
AE
3244 /*
3245 * Snapshot name is optional; default is to use "-"
3246 * (indicating the head/no snapshot).
3247 */
3feeb894 3248 len = next_token(&buf);
820a5f3e 3249 if (!len) {
3feeb894
AE
3250 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
3251 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
f28e565a 3252 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
dc79b113 3253 ret = -ENAMETOOLONG;
f28e565a 3254 goto out_err;
849b4260 3255 }
859c31df
AE
3256 spec->snap_name = kmalloc(len + 1, GFP_KERNEL);
3257 if (!spec->snap_name)
f28e565a 3258 goto out_mem;
859c31df
AE
3259 memcpy(spec->snap_name, buf, len);
3260 *(spec->snap_name + len) = '\0';
e5c35534 3261
0ddebc0c 3262 /* Initialize all rbd options to the defaults */
e28fff26 3263
4e9afeba
AE
3264 rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
3265 if (!rbd_opts)
3266 goto out_mem;
3267
3268 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
d22f76e7 3269
859c31df 3270 copts = ceph_parse_options(options, mon_addrs,
0ddebc0c 3271 mon_addrs + mon_addrs_size - 1,
4e9afeba 3272 parse_rbd_opts_token, rbd_opts);
859c31df
AE
3273 if (IS_ERR(copts)) {
3274 ret = PTR_ERR(copts);
dc79b113
AE
3275 goto out_err;
3276 }
859c31df
AE
3277 kfree(options);
3278
3279 *ceph_opts = copts;
4e9afeba 3280 *opts = rbd_opts;
859c31df 3281 *rbd_spec = spec;
0ddebc0c 3282
dc79b113 3283 return 0;
f28e565a 3284out_mem:
dc79b113 3285 ret = -ENOMEM;
d22f76e7 3286out_err:
859c31df
AE
3287 kfree(rbd_opts);
3288 rbd_spec_put(spec);
f28e565a 3289 kfree(options);
d22f76e7 3290
dc79b113 3291 return ret;
a725f65e
AE
3292}
3293
/*
 * An rbd format 2 image has a unique identifier, distinct from the
 * name given to it by the user.  Internally, that identifier is
 * what's used to specify the names of objects related to the image.
 *
 * A special "rbd id" object is used to map an rbd image name to its
 * id.  If that object doesn't exist, then there is no v2 rbd image
 * with the supplied name.
 *
 * This function will record the given rbd_dev's image_id field if
 * it can be determined, and in that case will return 0.  If any
 * errors occur a negative errno will be returned and the rbd_dev's
 * image_id field will be unchanged (and should be NULL).
 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;	/* name of the "rbd id" object to query */
	void *response;		/* raw reply buffer from the OSD */
	void *p;		/* decode cursor into the reply */

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.
	 */
	if (rbd_dev->spec->image_id)
		return 0;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	/* sizeof () includes the '\0', covering the concatenated string */
	size = sizeof (RBD_ID_PREFIX) + rbd_dev->spec->image_name_len;
	/* GFP_NOIO -- presumably to avoid recursing into block I/O; TODO confirm */
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* Invoke the "get_id" method of the "rbd" object class on the OSD */
	ret = rbd_req_sync_exec(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	/* Decode the length-prefixed string into a fresh allocation */
	p = response;
	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						&rbd_dev->spec->image_id_len,
						GFP_NOIO);
	if (IS_ERR(rbd_dev->spec->image_id)) {
		ret = PTR_ERR(rbd_dev->spec->image_id);
		/* keep the documented "unchanged (NULL)" contract on error */
		rbd_dev->spec->image_id = NULL;
	} else {
		dout("image_id is %s\n", rbd_dev->spec->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
3371
a30b71b9
AE
3372static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3373{
3374 int ret;
3375 size_t size;
3376
3377 /* Version 1 images have no id; empty string is used */
3378
0d7dbfce
AE
3379 rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
3380 if (!rbd_dev->spec->image_id)
a30b71b9 3381 return -ENOMEM;
0d7dbfce 3382 rbd_dev->spec->image_id_len = 0;
a30b71b9
AE
3383
3384 /* Record the header object name for this rbd image. */
3385
0d7dbfce 3386 size = rbd_dev->spec->image_name_len + sizeof (RBD_SUFFIX);
a30b71b9
AE
3387 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3388 if (!rbd_dev->header_name) {
3389 ret = -ENOMEM;
3390 goto out_err;
3391 }
0d7dbfce
AE
3392 sprintf(rbd_dev->header_name, "%s%s",
3393 rbd_dev->spec->image_name, RBD_SUFFIX);
a30b71b9
AE
3394
3395 /* Populate rbd image metadata */
3396
3397 ret = rbd_read_header(rbd_dev, &rbd_dev->header);
3398 if (ret < 0)
3399 goto out_err;
86b00e0d
AE
3400
3401 /* Version 1 images have no parent (no layering) */
3402
3403 rbd_dev->parent_spec = NULL;
3404 rbd_dev->parent_overlap = 0;
3405
a30b71b9
AE
3406 rbd_dev->image_format = 1;
3407
3408 dout("discovered version 1 image, header name is %s\n",
3409 rbd_dev->header_name);
3410
3411 return 0;
3412
3413out_err:
3414 kfree(rbd_dev->header_name);
3415 rbd_dev->header_name = NULL;
0d7dbfce
AE
3416 kfree(rbd_dev->spec->image_id);
3417 rbd_dev->spec->image_id = NULL;
a30b71b9
AE
3418
3419 return ret;
3420}
3421
/*
 * Finish probing an image known (by image id) to use format 2:
 * record the header object name, then fetch the image metadata
 * (size/order, object prefix, features, optional parent info, and
 * the snapshot context) from the OSDs.  On error, everything
 * populated here is released and reset to NULL/0.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->spec->image_id_len;
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->spec->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* If the image supports layering, get the parent info */

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)	/* note: tested with "ret", not "ret < 0" as above */
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	/* Undo all fields populated above (kfree/rbd_spec_put accept NULL) */
	rbd_dev->parent_overlap = 0;
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3494
83a06263
AE
/*
 * Second phase of an image probe: update snapshots and the mapping,
 * allocate a device id, register the block device and sysfs
 * entries, start the header watch, and finally announce the disk.
 * Called from rbd_dev_probe() after the image header was read.
 */
static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
{
	int ret;

	/* no need to lock here, as rbd_dev is not registered yet */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_probe_update_spec(rbd_dev);
	if (ret)
		goto err_out_snaps;

	ret = rbd_dev_set_mapping(rbd_dev);
	if (ret)
		goto err_out_snaps;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	ret = register_blkdev(0, rbd_dev->name);	/* 0 => dynamic major */
	if (ret < 0)
		goto err_out_id;
	rbd_dev->major = ret;

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_bus_add_dev(rbd_dev);
	if (ret)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */
	down_write(&rbd_dev->header_rwsem);
	ret = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (ret)
		goto err_out_bus;

	ret = rbd_init_watch_dev(rbd_dev);
	if (ret)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return ret;	/* ret is 0 here */
err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);

	return ret;
err_out_disk:
	/* Pre-bus-registration failures unwind manually, newest first */
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);

	return ret;
}
3576
/*
 * Probe for the existence of the header object for the given rbd
 * device.  For format 2 images this includes determining the image
 * id.
 */
3582static int rbd_dev_probe(struct rbd_device *rbd_dev)
3583{
3584 int ret;
3585
3586 /*
3587 * Get the id from the image id object. If it's not a
3588 * format 2 image, we'll get ENOENT back, and we'll assume
3589 * it's a format 1 image.
3590 */
3591 ret = rbd_dev_image_id(rbd_dev);
3592 if (ret)
3593 ret = rbd_dev_v1_probe(rbd_dev);
3594 else
3595 ret = rbd_dev_v2_probe(rbd_dev);
83a06263 3596 if (ret) {
a30b71b9
AE
3597 dout("probe failed, returning %d\n", ret);
3598
83a06263
AE
3599 return ret;
3600 }
3601
3602 ret = rbd_dev_probe_finish(rbd_dev);
3603 if (ret)
3604 rbd_header_free(&rbd_dev->header);
3605
a30b71b9
AE
3606 return ret;
3607}
3608
59c2be1e
YS
3609static ssize_t rbd_add(struct bus_type *bus,
3610 const char *buf,
3611 size_t count)
602adf40 3612{
cb8627c7 3613 struct rbd_device *rbd_dev = NULL;
dc79b113 3614 struct ceph_options *ceph_opts = NULL;
4e9afeba 3615 struct rbd_options *rbd_opts = NULL;
859c31df 3616 struct rbd_spec *spec = NULL;
9d3997fd 3617 struct rbd_client *rbdc;
27cc2594
AE
3618 struct ceph_osd_client *osdc;
3619 int rc = -ENOMEM;
602adf40
YS
3620
3621 if (!try_module_get(THIS_MODULE))
3622 return -ENODEV;
3623
602adf40 3624 /* parse add command */
859c31df 3625 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
dc79b113 3626 if (rc < 0)
bd4ba655 3627 goto err_out_module;
78cea76e 3628
9d3997fd
AE
3629 rbdc = rbd_get_client(ceph_opts);
3630 if (IS_ERR(rbdc)) {
3631 rc = PTR_ERR(rbdc);
0ddebc0c 3632 goto err_out_args;
9d3997fd 3633 }
c53d5893 3634 ceph_opts = NULL; /* rbd_dev client now owns this */
602adf40 3635
602adf40 3636 /* pick the pool */
9d3997fd 3637 osdc = &rbdc->client->osdc;
859c31df 3638 rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
602adf40
YS
3639 if (rc < 0)
3640 goto err_out_client;
859c31df
AE
3641 spec->pool_id = (u64) rc;
3642
c53d5893 3643 rbd_dev = rbd_dev_create(rbdc, spec);
bd4ba655
AE
3644 if (!rbd_dev)
3645 goto err_out_client;
c53d5893
AE
3646 rbdc = NULL; /* rbd_dev now owns this */
3647 spec = NULL; /* rbd_dev now owns this */
602adf40 3648
bd4ba655 3649 rbd_dev->mapping.read_only = rbd_opts->read_only;
c53d5893
AE
3650 kfree(rbd_opts);
3651 rbd_opts = NULL; /* done with this */
bd4ba655 3652
a30b71b9
AE
3653 rc = rbd_dev_probe(rbd_dev);
3654 if (rc < 0)
c53d5893 3655 goto err_out_rbd_dev;
05fd6f6f 3656
602adf40 3657 return count;
c53d5893
AE
3658err_out_rbd_dev:
3659 rbd_dev_destroy(rbd_dev);
bd4ba655 3660err_out_client:
9d3997fd 3661 rbd_put_client(rbdc);
0ddebc0c 3662err_out_args:
78cea76e
AE
3663 if (ceph_opts)
3664 ceph_destroy_options(ceph_opts);
4e9afeba 3665 kfree(rbd_opts);
859c31df 3666 rbd_spec_put(spec);
bd4ba655
AE
3667err_out_module:
3668 module_put(THIS_MODULE);
27cc2594 3669
602adf40 3670 dout("Error adding device %s\n", buf);
27cc2594
AE
3671
3672 return (ssize_t) rc;
602adf40
YS
3673}
3674
de71a297 3675static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
602adf40
YS
3676{
3677 struct list_head *tmp;
3678 struct rbd_device *rbd_dev;
3679
e124a82f 3680 spin_lock(&rbd_dev_list_lock);
602adf40
YS
3681 list_for_each(tmp, &rbd_dev_list) {
3682 rbd_dev = list_entry(tmp, struct rbd_device, node);
de71a297 3683 if (rbd_dev->dev_id == dev_id) {
e124a82f 3684 spin_unlock(&rbd_dev_list_lock);
602adf40 3685 return rbd_dev;
e124a82f 3686 }
602adf40 3687 }
e124a82f 3688 spin_unlock(&rbd_dev_list_lock);
602adf40
YS
3689 return NULL;
3690}
3691
/*
 * Release callback for the rbd device's embedded struct device;
 * runs when the last reference is dropped (the comments in
 * rbd_dev_probe_finish() indicate rbd_bus_del_dev() initiates
 * this).  Tears down the header watch, the block device, the
 * header fields, and the rbd_dev itself, then drops the module
 * reference taken in rbd_add().
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Stop the lingering watch request on the header object */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);


	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	rbd_dev_id_put(rbd_dev);
	rbd_assert(rbd_dev->rbd_client != NULL);
	rbd_dev_destroy(rbd_dev);	/* also drops the client reference */

	/* release module ref */
	module_put(THIS_MODULE);
}
3721
dfc5606d
YS
3722static ssize_t rbd_remove(struct bus_type *bus,
3723 const char *buf,
3724 size_t count)
602adf40
YS
3725{
3726 struct rbd_device *rbd_dev = NULL;
3727 int target_id, rc;
3728 unsigned long ul;
3729 int ret = count;
3730
3731 rc = strict_strtoul(buf, 10, &ul);
3732 if (rc)
3733 return rc;
3734
3735 /* convert to int; abort if we lost anything in the conversion */
3736 target_id = (int) ul;
3737 if (target_id != ul)
3738 return -EINVAL;
3739
3740 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3741
3742 rbd_dev = __rbd_get_dev(target_id);
3743 if (!rbd_dev) {
3744 ret = -ENOENT;
3745 goto done;
3746 }
3747
41f38c2b 3748 rbd_remove_all_snaps(rbd_dev);
dfc5606d 3749 rbd_bus_del_dev(rbd_dev);
602adf40
YS
3750
3751done:
3752 mutex_unlock(&ctl_mutex);
aafb230e 3753
602adf40
YS
3754 return ret;
3755}
3756
602adf40
YS
3757/*
3758 * create control files in sysfs
dfc5606d 3759 * /sys/bus/rbd/...
602adf40
YS
3760 */
3761static int rbd_sysfs_init(void)
3762{
dfc5606d 3763 int ret;
602adf40 3764
fed4c143 3765 ret = device_register(&rbd_root_dev);
21079786 3766 if (ret < 0)
dfc5606d 3767 return ret;
602adf40 3768
fed4c143
AE
3769 ret = bus_register(&rbd_bus_type);
3770 if (ret < 0)
3771 device_unregister(&rbd_root_dev);
602adf40 3772
602adf40
YS
3773 return ret;
3774}
3775
static void rbd_sysfs_cleanup(void)
{
	/* Unregister in the reverse order of rbd_sysfs_init() */
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
3781
3782int __init rbd_init(void)
3783{
3784 int rc;
3785
3786 rc = rbd_sysfs_init();
3787 if (rc)
3788 return rc;
f0f8cef5 3789 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
602adf40
YS
3790 return 0;
3791}
3792
/* Module exit: remove the sysfs bus/device created by rbd_init(). */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
3797
3798module_init(rbd_init);
3799module_exit(rbd_exit);
3800
3801MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
3802MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
3803MODULE_DESCRIPTION("rados block device");
3804
3805/* following authorship retained from original osdblk.c */
3806MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
3807
3808MODULE_LICENSE("GPL");
This page took 0.404124 seconds and 5 git commands to generate.