rbd: do not allow remove of mounted-on image
[deliverable/linux.git] / drivers / block / rbd.c
CommitLineData
602adf40
YS
1/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
dfc5606d 24 For usage instructions, please refer to:
602adf40 25
dfc5606d 26 Documentation/ABI/testing/sysfs-bus-rbd
602adf40
YS
27
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
59c2be1e 34#include <linux/parser.h>
602adf40
YS
35
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
aafb230e
AE
44#define RBD_DEBUG /* Activate rbd_assert() calls */
45
593a9e7b
AE
46/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
df111be6
AE
55/* It might be useful to have this defined elsewhere too */
56
57#define U64_MAX ((u64) (~0ULL))
58
f0f8cef5
AE
59#define RBD_DRV_NAME "rbd"
60#define RBD_DRV_NAME_LONG "rbd (rados block device)"
602adf40
YS
61
62#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
63
d4b125e9
AE
64#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
65#define RBD_MAX_SNAP_NAME_LEN \
66 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
67
35d489f9 68#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
602adf40
YS
69#define RBD_MAX_OPT_LEN 1024
70
71#define RBD_SNAP_HEAD_NAME "-"
72
9e15b77d
AE
73/* This allows a single page to hold an image name sent by OSD */
74#define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1)
1e130199 75#define RBD_IMAGE_ID_LEN_MAX 64
9e15b77d 76
1e130199 77#define RBD_OBJ_PREFIX_LEN_MAX 64
589d30e0 78
d889140c
AE
79/* Feature bits */
80
81#define RBD_FEATURE_LAYERING 1
82
83/* Features supported by this (client software) implementation. */
84
85#define RBD_FEATURES_ALL (0)
86
81a89793
AE
87/*
88 * An RBD device name will be "rbd#", where the "rbd" comes from
89 * RBD_DRV_NAME above, and # is a unique integer identifier.
90 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
91 * enough to hold all possible device names.
92 */
602adf40 93#define DEV_NAME_LEN 32
81a89793 94#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
602adf40 95
cc0538b6 96#define RBD_READ_ONLY_DEFAULT false
59c2be1e 97
602adf40
YS
98/*
99 * block device image metadata (in-memory version)
100 */
struct rbd_image_header {
	/* These four fields never change for a given rbd image */
	char *object_prefix;	/* prefix for data object names */
	u64 features;		/* feature bit mask (0 for v1 images) */
	__u8 obj_order;		/* log2 of the object (segment) size */
	__u8 crypt_type;	/* on-disk crypt type byte */
	__u8 comp_type;		/* on-disk compression type byte */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;		/* image size in bytes */
	struct ceph_snap_context *snapc;	/* snapshot ids, newest first */
	char *snap_names;	/* NUL-separated snapshot name buffer */
	u64 *snap_sizes;	/* per-snapshot image sizes, same order */

	u64 obj_version;	/* header object version last read */
};
117
0d7dbfce
AE
118/*
119 * An rbd image specification.
120 *
121 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
122 * identify an image.
123 */
struct rbd_spec {
	u64 pool_id;		/* id of the pool holding the image */
	char *pool_name;	/* pool name (may be looked up from id) */

	char *image_id;		/* unique image id (format 2) */
	size_t image_id_len;
	char *image_name;	/* user-visible image name */
	size_t image_name_len;

	u64 snap_id;		/* CEPH_NOSNAP when mapping the head */
	char *snap_name;	/* RBD_SNAP_HEAD_NAME for the head */

	struct kref kref;	/* specs are shared and refcounted */
};
138
/* Per-mapping options parsed from the "add" command. */
struct rbd_options {
	bool	read_only;	/* map the image read-only */
};
142
143/*
f0f8cef5 144 * an instance of the client. multiple devices may share an rbd client.
602adf40
YS
145 */
146struct rbd_client {
147 struct ceph_client *client;
148 struct kref kref;
149 struct list_head node;
150};
151
152/*
f0f8cef5 153 * a request completion status
602adf40 154 */
1fec7093
YS
155struct rbd_req_status {
156 int done;
157 int rc;
158 u64 bytes;
159};
160
161/*
162 * a collection of requests
163 */
164struct rbd_req_coll {
165 int total;
166 int num_done;
167 struct kref kref;
168 struct rbd_req_status status[0];
602adf40
YS
169};
170
f0f8cef5
AE
171/*
172 * a single io request
173 */
174struct rbd_request {
175 struct request *rq; /* blk layer request */
176 struct bio *bio; /* cloned bio */
177 struct page **pages; /* list of used pages */
178 u64 len;
179 int coll_index;
180 struct rbd_req_coll *coll;
181};
182
dfc5606d
YS
183struct rbd_snap {
184 struct device dev;
185 const char *name;
3591538f 186 u64 size;
dfc5606d
YS
187 struct list_head node;
188 u64 id;
34b13184 189 u64 features;
dfc5606d
YS
190};
191
f84344f3 192struct rbd_mapping {
99c1f08f 193 u64 size;
34b13184 194 u64 features;
f84344f3
AE
195 bool read_only;
196};
197
602adf40
YS
198/*
199 * a single device
200 */
struct rbd_device {
	int dev_id;		/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;	/* blkdev's gendisk and rq */

	u32 image_format;	/* Either 1 or 2 */
	struct rbd_client *rbd_client;	/* shared ceph client handle */

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;	/* queue lock */

	struct rbd_image_header header;	/* current image metadata */
	bool exists;		/* false once mapped snapshot disappears */
	struct rbd_spec *spec;	/* identifies the mapped image/snap */

	char *header_name;	/* name of the image's header object */

	struct ceph_osd_event *watch_event;	/* header watch registration */
	struct ceph_osd_request *watch_request;

	struct rbd_spec *parent_spec;	/* layering: parent image, if any */
	u64 parent_overlap;		/* bytes shared with the parent */

	/* protects updating the header */
	struct rw_semaphore header_rwsem;

	struct rbd_mapping mapping;	/* what is currently mapped */

	struct list_head node;		/* entry on rbd_dev_list */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
	unsigned long open_count;	/* opens in flight; guarded by ctl_mutex */
};
240
602adf40 241static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
e124a82f 242
602adf40 243static LIST_HEAD(rbd_dev_list); /* devices */
e124a82f
AE
244static DEFINE_SPINLOCK(rbd_dev_list_lock);
245
432b8587
AE
246static LIST_HEAD(rbd_client_list); /* clients */
247static DEFINE_SPINLOCK(rbd_client_list_lock);
602adf40 248
304f6808
AE
249static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
250static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
251
dfc5606d 252static void rbd_dev_release(struct device *dev);
41f38c2b 253static void rbd_remove_snap_dev(struct rbd_snap *snap);
dfc5606d 254
f0f8cef5
AE
static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);

/* /sys/bus/rbd/{add,remove}: the control interface for mapping images */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

/* Empty release: rbd_root_dev is static, nothing to free. */
static void rbd_root_dev_release(struct device *dev)
{
}

/* Parent device for all rbd devices in sysfs. */
static struct device rbd_root_dev = {
	.init_name =	"rbd",
	.release =	rbd_root_dev_release,
};
279
aafb230e
AE
#ifdef RBD_DEBUG
/*
 * Assert that an invariant holds; on failure, log the expression and
 * BUG().  Wrapped in do { } while (0) so that it behaves as a single
 * statement (a bare "if" here would capture a following "else").
 */
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
dfc5606d 292
dfc5606d
YS
/* Take a reference on the rbd device's embedded struct device. */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}

/* Drop a reference taken by rbd_get_dev(). */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}
602adf40 302
117973fb
AE
303static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
304static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
59c2be1e 305
602adf40
YS
/*
 * Block device open method.  Refuses writable opens of read-only
 * mappings; otherwise pins the device and bumps open_count under
 * ctl_mutex (taken nested, since open can occur during an add).
 */
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	rbd_get_dev(rbd_dev);
	set_device_ro(bdev, rbd_dev->mapping.read_only);
	/* open_count lets rbd_remove() reject removal of an in-use device */
	rbd_dev->open_count++;
	mutex_unlock(&ctl_mutex);

	return 0;
}
321
dfc5606d
YS
/*
 * Block device release method: undoes rbd_open() — drops the
 * open_count and the device reference, both under ctl_mutex.
 */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	rbd_assert(rbd_dev->open_count > 0);
	rbd_dev->open_count--;
	rbd_put_dev(rbd_dev);
	mutex_unlock(&ctl_mutex);

	return 0;
}
334
602adf40
YS
/* Block device methods for /dev/rbdN. */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
340
341/*
342 * Initialize an rbd client instance.
43ae4701 343 * We own *ceph_opts.
602adf40 344 */
f8c38929 345static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
602adf40
YS
346{
347 struct rbd_client *rbdc;
348 int ret = -ENOMEM;
349
350 dout("rbd_client_create\n");
351 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
352 if (!rbdc)
353 goto out_opt;
354
355 kref_init(&rbdc->kref);
356 INIT_LIST_HEAD(&rbdc->node);
357
bc534d86
AE
358 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
359
43ae4701 360 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
602adf40 361 if (IS_ERR(rbdc->client))
bc534d86 362 goto out_mutex;
43ae4701 363 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
602adf40
YS
364
365 ret = ceph_open_session(rbdc->client);
366 if (ret < 0)
367 goto out_err;
368
432b8587 369 spin_lock(&rbd_client_list_lock);
602adf40 370 list_add_tail(&rbdc->node, &rbd_client_list);
432b8587 371 spin_unlock(&rbd_client_list_lock);
602adf40 372
bc534d86
AE
373 mutex_unlock(&ctl_mutex);
374
602adf40
YS
375 dout("rbd_client_create created %p\n", rbdc);
376 return rbdc;
377
378out_err:
379 ceph_destroy_client(rbdc->client);
bc534d86
AE
380out_mutex:
381 mutex_unlock(&ctl_mutex);
602adf40
YS
382 kfree(rbdc);
383out_opt:
43ae4701
AE
384 if (ceph_opts)
385 ceph_destroy_options(ceph_opts);
28f259b7 386 return ERR_PTR(ret);
602adf40
YS
387}
388
389/*
1f7ba331
AE
390 * Find a ceph client with specific addr and configuration. If
391 * found, bump its reference count.
602adf40 392 */
1f7ba331 393static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
602adf40
YS
394{
395 struct rbd_client *client_node;
1f7ba331 396 bool found = false;
602adf40 397
43ae4701 398 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
602adf40
YS
399 return NULL;
400
1f7ba331
AE
401 spin_lock(&rbd_client_list_lock);
402 list_for_each_entry(client_node, &rbd_client_list, node) {
403 if (!ceph_compare_options(ceph_opts, client_node->client)) {
404 kref_get(&client_node->kref);
405 found = true;
406 break;
407 }
408 }
409 spin_unlock(&rbd_client_list_lock);
410
411 return found ? client_node : NULL;
602adf40
YS
412}
413
59c2be1e
YS
414/*
415 * mount options
416 */
/*
 * Mount-option token ids.  The enum is partitioned by the Opt_last_*
 * sentinels: tokens below Opt_last_int take an integer argument,
 * tokens between Opt_last_int and Opt_last_string take a string,
 * and tokens between Opt_last_string and Opt_last_bool are Boolean
 * flags.  Currently only Boolean options exist.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};
438
/*
 * Parse a single option token (callback for ceph_parse_options()).
 * @c: the option text; @private: the struct rbd_options to fill in.
 * Returns 0 on success or a negative errno for a bad option.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/*
	 * Extract the argument according to the token's class; only
	 * Boolean tokens exist today, so the int/string branches are
	 * currently just debug plumbing for future options.
	 */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* match_token() succeeded, so this can't happen */
		rbd_assert(false);
		break;
	}
	return 0;
}
479
602adf40
YS
480/*
481 * Get a ceph client with specific addr and configuration, if one does
482 * not exist create it.
483 */
9d3997fd 484static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
602adf40 485{
f8c38929 486 struct rbd_client *rbdc;
59c2be1e 487
1f7ba331 488 rbdc = rbd_client_find(ceph_opts);
9d3997fd 489 if (rbdc) /* using an existing client */
43ae4701 490 ceph_destroy_options(ceph_opts);
9d3997fd 491 else
f8c38929 492 rbdc = rbd_client_create(ceph_opts);
602adf40 493
9d3997fd 494 return rbdc;
602adf40
YS
495}
496
497/*
498 * Destroy ceph client
d23a4b3f 499 *
432b8587 500 * Caller must hold rbd_client_list_lock.
602adf40
YS
501 */
/*
 * Destroy ceph client (kref release callback).  Unlinks the client
 * from rbd_client_list under rbd_client_list_lock, then tears down
 * the underlying ceph client and frees the wrapper.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
514
515/*
516 * Drop reference to ceph client node. If it's not referenced anymore, release
517 * it.
518 */
9d3997fd 519static void rbd_put_client(struct rbd_client *rbdc)
602adf40 520{
c53d5893
AE
521 if (rbdc)
522 kref_put(&rbdc->kref, rbd_client_release);
602adf40
YS
523}
524
1fec7093
YS
525/*
526 * Destroy requests collection
527 */
/*
 * Destroy requests collection (kref release callback); the status
 * array is allocated inline, so a single kfree suffices.
 */
static void rbd_coll_release(struct kref *kref)
{
	struct rbd_req_coll *coll =
		container_of(kref, struct rbd_req_coll, kref);

	dout("rbd_coll_release %p\n", coll);
	kfree(coll);
}
602adf40 536
a30b71b9
AE
537static bool rbd_image_format_valid(u32 image_format)
538{
539 return image_format == 1 || image_format == 2;
540}
541
8e94af8e
AE
/*
 * Sanity-check a format 1 on-disk image header before trusting any
 * of its fields.  Returns false for a header that is malformed or
 * whose snapshot metadata could not fit in memory.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire the snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
580
602adf40
YS
581/*
582 * Create a new header structure, translate header format from the on-disk
583 * header.
584 */
585static int rbd_header_from_disk(struct rbd_image_header *header,
4156d998 586 struct rbd_image_header_ondisk *ondisk)
602adf40 587{
ccece235 588 u32 snap_count;
58c17b0e 589 size_t len;
d2bb24e5 590 size_t size;
621901d6 591 u32 i;
602adf40 592
6a52325f
AE
593 memset(header, 0, sizeof (*header));
594
103a150f
AE
595 snap_count = le32_to_cpu(ondisk->snap_count);
596
58c17b0e
AE
597 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
598 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
6a52325f 599 if (!header->object_prefix)
602adf40 600 return -ENOMEM;
58c17b0e
AE
601 memcpy(header->object_prefix, ondisk->object_prefix, len);
602 header->object_prefix[len] = '\0';
00f1f36f 603
602adf40 604 if (snap_count) {
f785cc1d
AE
605 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
606
621901d6
AE
607 /* Save a copy of the snapshot names */
608
f785cc1d
AE
609 if (snap_names_len > (u64) SIZE_MAX)
610 return -EIO;
611 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
602adf40 612 if (!header->snap_names)
6a52325f 613 goto out_err;
f785cc1d
AE
614 /*
615 * Note that rbd_dev_v1_header_read() guarantees
616 * the ondisk buffer we're working with has
617 * snap_names_len bytes beyond the end of the
618 * snapshot id array, this memcpy() is safe.
619 */
620 memcpy(header->snap_names, &ondisk->snaps[snap_count],
621 snap_names_len);
6a52325f 622
621901d6
AE
623 /* Record each snapshot's size */
624
d2bb24e5
AE
625 size = snap_count * sizeof (*header->snap_sizes);
626 header->snap_sizes = kmalloc(size, GFP_KERNEL);
602adf40 627 if (!header->snap_sizes)
6a52325f 628 goto out_err;
621901d6
AE
629 for (i = 0; i < snap_count; i++)
630 header->snap_sizes[i] =
631 le64_to_cpu(ondisk->snaps[i].image_size);
602adf40 632 } else {
ccece235 633 WARN_ON(ondisk->snap_names_len);
602adf40
YS
634 header->snap_names = NULL;
635 header->snap_sizes = NULL;
636 }
849b4260 637
34b13184 638 header->features = 0; /* No features support in v1 images */
602adf40
YS
639 header->obj_order = ondisk->options.order;
640 header->crypt_type = ondisk->options.crypt_type;
641 header->comp_type = ondisk->options.comp_type;
6a52325f 642
621901d6
AE
643 /* Allocate and fill in the snapshot context */
644
f84344f3 645 header->image_size = le64_to_cpu(ondisk->image_size);
6a52325f
AE
646 size = sizeof (struct ceph_snap_context);
647 size += snap_count * sizeof (header->snapc->snaps[0]);
648 header->snapc = kzalloc(size, GFP_KERNEL);
649 if (!header->snapc)
650 goto out_err;
602adf40
YS
651
652 atomic_set(&header->snapc->nref, 1);
505cbb9b 653 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
602adf40 654 header->snapc->num_snaps = snap_count;
621901d6
AE
655 for (i = 0; i < snap_count; i++)
656 header->snapc->snaps[i] =
657 le64_to_cpu(ondisk->snaps[i].id);
602adf40
YS
658
659 return 0;
660
6a52325f 661out_err:
849b4260 662 kfree(header->snap_sizes);
ccece235 663 header->snap_sizes = NULL;
602adf40 664 kfree(header->snap_names);
ccece235 665 header->snap_names = NULL;
6a52325f
AE
666 kfree(header->object_prefix);
667 header->object_prefix = NULL;
ccece235 668
00f1f36f 669 return -ENOMEM;
602adf40
YS
670}
671
9e15b77d
AE
672static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
673{
674 struct rbd_snap *snap;
675
676 if (snap_id == CEPH_NOSNAP)
677 return RBD_SNAP_HEAD_NAME;
678
679 list_for_each_entry(snap, &rbd_dev->snaps, node)
680 if (snap_id == snap->id)
681 return snap->name;
682
683 return NULL;
684}
685
8836b995 686static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
602adf40 687{
602adf40 688
e86924a8 689 struct rbd_snap *snap;
602adf40 690
e86924a8
AE
691 list_for_each_entry(snap, &rbd_dev->snaps, node) {
692 if (!strcmp(snap_name, snap->name)) {
0d7dbfce 693 rbd_dev->spec->snap_id = snap->id;
e86924a8 694 rbd_dev->mapping.size = snap->size;
34b13184 695 rbd_dev->mapping.features = snap->features;
602adf40 696
e86924a8 697 return 0;
00f1f36f 698 }
00f1f36f 699 }
e86924a8 700
00f1f36f 701 return -ENOENT;
602adf40
YS
702}
703
819d52bf 704static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
602adf40 705{
78dc447d 706 int ret;
602adf40 707
0d7dbfce 708 if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
cc9d734c 709 sizeof (RBD_SNAP_HEAD_NAME))) {
0d7dbfce 710 rbd_dev->spec->snap_id = CEPH_NOSNAP;
99c1f08f 711 rbd_dev->mapping.size = rbd_dev->header.image_size;
34b13184 712 rbd_dev->mapping.features = rbd_dev->header.features;
e86924a8 713 ret = 0;
602adf40 714 } else {
0d7dbfce 715 ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
602adf40
YS
716 if (ret < 0)
717 goto done;
f84344f3 718 rbd_dev->mapping.read_only = true;
602adf40 719 }
daba5fdb 720 rbd_dev->exists = true;
602adf40 721done:
602adf40
YS
722 return ret;
723}
724
/*
 * Release everything rbd_header_from_disk() allocated, NULLing each
 * pointer so a repeated free (or a later refresh) is safe.
 */
static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	header->object_prefix = NULL;
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	ceph_put_snap_context(header->snapc);
	header->snapc = NULL;
}
736
65ccfe21 737static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
602adf40 738{
65ccfe21
AE
739 char *name;
740 u64 segment;
741 int ret;
602adf40 742
65ccfe21
AE
743 name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
744 if (!name)
745 return NULL;
746 segment = offset >> rbd_dev->header.obj_order;
747 ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
748 rbd_dev->header.object_prefix, segment);
749 if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
750 pr_err("error formatting segment name for #%llu (%d)\n",
751 segment, ret);
752 kfree(name);
753 name = NULL;
754 }
602adf40 755
65ccfe21
AE
756 return name;
757}
602adf40 758
65ccfe21
AE
759static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
760{
761 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
602adf40 762
65ccfe21
AE
763 return offset & (segment_size - 1);
764}
765
/*
 * Clamp an I/O of the given offset/length so it does not cross a
 * segment boundary: returns the number of bytes from "offset" that
 * lie within the same segment (at most "length").
 */
static u64 rbd_segment_length(struct rbd_device *rbd_dev,
			      u64 offset, u64 length)
{
	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;

	offset &= segment_size - 1;

	rbd_assert(length <= U64_MAX - offset);	/* no u64 overflow below */
	if (offset + length > segment_size)
		length = segment_size - offset;

	return length;
}
779
1fec7093
YS
/*
 * Number of segments an [ofs, ofs+len) range spans.  Returns 0 for
 * an empty range and -ERANGE if ofs+len would overflow a u64.
 */
static int rbd_get_num_segments(struct rbd_image_header *header,
				u64 ofs, u64 len)
{
	u64 start_seg;
	u64 end_seg;

	if (!len)
		return 0;
	if (len - 1 > U64_MAX - ofs)
		return -ERANGE;

	start_seg = ofs >> header->obj_order;
	end_seg = (ofs + len - 1) >> header->obj_order;

	return end_seg - start_seg + 1;
}
796
029bcbd8
JD
797/*
798 * returns the size of an object in the image
799 */
800static u64 rbd_obj_bytes(struct rbd_image_header *header)
801{
802 return 1 << header->obj_order;
803}
804
602adf40
YS
805/*
806 * bio helpers
807 */
808
809static void bio_chain_put(struct bio *chain)
810{
811 struct bio *tmp;
812
813 while (chain) {
814 tmp = chain;
815 chain = chain->bi_next;
816 bio_put(tmp);
817 }
818}
819
820/*
821 * zeros a bio chain, starting at specific offset
822 */
/*
 * zeros a bio chain, starting at specific offset
 * (byte offsets at or beyond start_ofs are cleared in every segment;
 * segments entirely before start_ofs are left untouched)
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* byte position within the whole chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* zero from start_ofs (or segment start) on */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
846
847/*
f7760dad
AE
848 * Clone a portion of a bio, starting at the given byte offset
849 * and continuing for the number of bytes indicated.
602adf40 850 */
f7760dad
AE
/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.
 *
 * Returns the new bio or NULL on bad arguments / allocation failure.
 * Ownership of the clone passes to the caller (bio_put() to release).
 */
static struct bio *bio_clone_range(struct bio *bio_src,
				   unsigned int offset,
				   unsigned int len,
				   gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;		/* offset into first affected bio_vec */
	unsigned short end_idx;
	unsigned short vcnt;
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	resid = offset;
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	voff = resid;

	/* ...and the last affected segment */

	resid += len;
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	vcnt = end_idx - idx + 1;

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
	       vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		bio->bi_io_vec[0].bv_len -= voff;
		/* resid is what remains of len within the last segment */
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}
927
928/*
929 * Clone a portion of a bio chain, starting at the given byte offset
930 * into the first bio in the source chain and continuing for the
931 * number of bytes indicated. The result is another bio chain of
932 * exactly the given length, or a null pointer on error.
933 *
934 * The bio_src and offset parameters are both in-out. On entry they
935 * refer to the first source bio and the offset into that bio where
936 * the start of data to be cloned is located.
937 *
938 * On return, bio_src is updated to refer to the bio in the source
939 * chain that contains first un-cloned byte, and *offset will
940 * contain the offset of that byte within that bio.
941 */
/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					 unsigned int *offset,
					 unsigned int len,
					 gfp_t gfpmask)
{
	struct bio *bi = *bio_src;
	unsigned int off = *offset;
	struct bio *chain = NULL;
	struct bio **end;	/* where to link the next clone */

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi)
			goto out_err;	/* EINVAL; ran out of bio's */
		/* clone at most to the end of the current source bio */
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		*end = bio;
		end = &bio->bi_next;

		off += bi_size;
		if (off == bi->bi_size) {
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	/* release any clones built so far */
	bio_chain_put(chain);

	return NULL;
}
988
989/*
990 * helpers for osd request op vectors.
991 */
57cfc106
AE
/*
 * helpers for osd request op vectors.
 *
 * Allocate a zeroed op vector with room for num_ops ops plus a
 * terminating entry, initializing only the first op's code and
 * payload length.  Caller frees with rbd_destroy_ops().
 */
static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
						 int opcode, u32 payload_len)
{
	struct ceph_osd_req_op *ops;

	ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
	if (!ops)
		return NULL;

	ops[0].op = opcode;

	/*
	 * op extent offset and length will be set later on
	 * in calc_raw_layout()
	 */
	ops[0].payload_len = payload_len;

	return ops;
}
1011
/* Free an op vector allocated by rbd_create_rw_ops(). */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
1016
1fec7093
YS
/*
 * Record completion of sub-request "index" of a collection and, under
 * the queue lock, complete toward the block layer every contiguous
 * run of finished sub-requests starting at num_done (completions must
 * be reported to blk in order).  With no collection, the whole blk
 * request is ended directly.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, ret, (unsigned long long) len);

	if (!rq)
		return;

	if (!coll) {
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	max = min = coll->num_done;
	/* advance max past every consecutively-completed slot */
	while (max < coll->total && coll->status[max].done)
		max++;

	for (i = min; i < max; i++) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		/* drop the reference held for this sub-request */
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
1054
/* Convenience wrapper: complete the slot recorded in the rbd_request. */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
1060
602adf40
YS
1061/*
1062 * Send ceph osd request
1063 */
/*
 * Send ceph osd request
 *
 * Builds and submits one OSD request for the given object.  The data
 * payload comes from either "bio" or "pages".  When rbd_cb is NULL the
 * call is synchronous: we wait for the reply, optionally report the
 * reassert version through *ver, and drop the request.  With a
 * callback, completion is handled asynchronously and req_data is
 * freed by the callback path.  When linger_req is non-NULL the request
 * is registered to linger (resent across OSD map changes) and returned
 * through *linger_req.
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *ops,
			  struct rbd_req_coll *coll,
			  int coll_index,
			  void (*rbd_cb)(struct ceph_osd_request *req,
					 struct ceph_msg *msg),
			  struct ceph_osd_request **linger_req,
			  u64 *ver)
{
	struct ceph_osd_request *req;
	struct ceph_file_layout *layout;
	int ret;
	u64 bno;
	struct timespec mtime = CURRENT_TIME;
	struct rbd_request *req_data;
	struct ceph_osd_request_head *reqhead;
	struct ceph_osd_client *osdc;

	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
	if (!req_data) {
		/* still must complete our slot so the collection drains */
		if (coll)
			rbd_coll_end_req_index(rq, coll, coll_index,
					       -ENOMEM, len);
		return -ENOMEM;
	}

	if (coll) {
		req_data->coll = coll;
		req_data->coll_index = coll_index;
	}

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
	     object_name, (unsigned long long) ofs,
	     (unsigned long long) len, coll, coll_index);

	osdc = &rbd_dev->rbd_client->client->osdc;
	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
				      false, GFP_NOIO, pages, bio);
	if (!req) {
		ret = -ENOMEM;
		goto done_pages;
	}

	req->r_callback = rbd_cb;

	req_data->rq = rq;
	req_data->bio = bio;
	req_data->pages = pages;
	req_data->len = len;

	req->r_priv = req_data;

	reqhead = req->r_request->front.iov_base;
	/* the head snapid is fixed up later; snapid is used for layout calc */
	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);

	/* NOTE(review): strncpy does not guarantee NUL-termination; this
	 * relies on object names being shorter than r_oid — verify. */
	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
	req->r_oid_len = strlen(req->r_oid);

	layout = &req->r_file_layout;
	memset(layout, 0, sizeof(*layout));
	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_stripe_count = cpu_to_le32(1);
	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id);
	ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
				   req, ops);
	rbd_assert(ret == 0);

	ceph_osdc_build_request(req, ofs, &len,
				ops,
				snapc,
				&mtime,
				req->r_oid, req->r_oid_len);

	if (linger_req) {
		ceph_osdc_set_request_linger(osdc, req);
		*linger_req = req;
	}

	ret = ceph_osdc_start_request(osdc, req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* synchronous: wait for the reply and clean up here */
		ret = ceph_osdc_wait_request(osdc, req);
		if (ver)
			*ver = le64_to_cpu(req->r_reassert_version.version);
		dout("reassert_ver=%llu\n",
		     (unsigned long long)
				le64_to_cpu(req->r_reassert_version.version));
		ceph_osdc_put_request(req);
	}
	return ret;

done_err:
	bio_chain_put(req_data->bio);
	ceph_osdc_put_request(req);
done_pages:
	rbd_coll_end_req(req_data, ret, len);
	kfree(req_data);
	return ret;
}
1174
1175/*
1176 * Ceph osd op callback
1177 */
1178static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1179{
1180 struct rbd_request *req_data = req->r_priv;
1181 struct ceph_osd_reply_head *replyhead;
1182 struct ceph_osd_op *op;
1183 __s32 rc;
1184 u64 bytes;
1185 int read_op;
1186
1187 /* parse reply */
1188 replyhead = msg->front.iov_base;
1189 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
1190 op = (void *)(replyhead + 1);
1191 rc = le32_to_cpu(replyhead->result);
1192 bytes = le64_to_cpu(op->extent.length);
895cfcc8 1193 read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
602adf40 1194
bd919d45
AE
1195 dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
1196 (unsigned long long) bytes, read_op, (int) rc);
602adf40
YS
1197
1198 if (rc == -ENOENT && read_op) {
1199 zero_bio_chain(req_data->bio, 0);
1200 rc = 0;
1201 } else if (rc == 0 && read_op && bytes < req_data->len) {
1202 zero_bio_chain(req_data->bio, bytes);
1203 bytes = req_data->len;
1204 }
1205
1fec7093 1206 rbd_coll_end_req(req_data, rc, bytes);
602adf40
YS
1207
1208 if (req_data->bio)
1209 bio_chain_put(req_data->bio);
1210
1211 ceph_osdc_put_request(req);
1212 kfree(req_data);
1213}
1214
59c2be1e
YS
/* Minimal completion callback: just drop the request reference. */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1219
602adf40
YS
1220/*
1221 * Do a synchronous ceph osd operation
1222 */
0ce1a794 1223static int rbd_req_sync_op(struct rbd_device *rbd_dev,
602adf40
YS
1224 struct ceph_snap_context *snapc,
1225 u64 snapid,
602adf40 1226 int flags,
913d2fdc 1227 struct ceph_osd_req_op *ops,
aded07ea 1228 const char *object_name,
f8d4de6e
AE
1229 u64 ofs, u64 inbound_size,
1230 char *inbound,
59c2be1e
YS
1231 struct ceph_osd_request **linger_req,
1232 u64 *ver)
602adf40
YS
1233{
1234 int ret;
1235 struct page **pages;
1236 int num_pages;
913d2fdc 1237
aafb230e 1238 rbd_assert(ops != NULL);
602adf40 1239
f8d4de6e 1240 num_pages = calc_pages_for(ofs, inbound_size);
602adf40 1241 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
b8d0638a
DC
1242 if (IS_ERR(pages))
1243 return PTR_ERR(pages);
602adf40 1244
0ce1a794 1245 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
f8d4de6e 1246 object_name, ofs, inbound_size, NULL,
602adf40
YS
1247 pages, num_pages,
1248 flags,
1249 ops,
1fec7093 1250 NULL, 0,
59c2be1e
YS
1251 NULL,
1252 linger_req, ver);
602adf40 1253 if (ret < 0)
913d2fdc 1254 goto done;
602adf40 1255
f8d4de6e
AE
1256 if ((flags & CEPH_OSD_FLAG_READ) && inbound)
1257 ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);
602adf40 1258
602adf40
YS
1259done:
1260 ceph_release_page_vector(pages, num_pages);
1261 return ret;
1262}
1263
1264/*
1265 * Do an asynchronous ceph osd operation
1266 */
1267static int rbd_do_op(struct request *rq,
0ce1a794 1268 struct rbd_device *rbd_dev,
602adf40 1269 struct ceph_snap_context *snapc,
602adf40 1270 u64 ofs, u64 len,
1fec7093
YS
1271 struct bio *bio,
1272 struct rbd_req_coll *coll,
1273 int coll_index)
602adf40
YS
1274{
1275 char *seg_name;
1276 u64 seg_ofs;
1277 u64 seg_len;
1278 int ret;
1279 struct ceph_osd_req_op *ops;
1280 u32 payload_len;
ff2e4bb5
AE
1281 int opcode;
1282 int flags;
4634246d 1283 u64 snapid;
602adf40 1284
65ccfe21 1285 seg_name = rbd_segment_name(rbd_dev, ofs);
602adf40
YS
1286 if (!seg_name)
1287 return -ENOMEM;
65ccfe21
AE
1288 seg_len = rbd_segment_length(rbd_dev, ofs, len);
1289 seg_ofs = rbd_segment_offset(rbd_dev, ofs);
602adf40 1290
ff2e4bb5
AE
1291 if (rq_data_dir(rq) == WRITE) {
1292 opcode = CEPH_OSD_OP_WRITE;
1293 flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
4634246d 1294 snapid = CEPH_NOSNAP;
ff2e4bb5
AE
1295 payload_len = seg_len;
1296 } else {
1297 opcode = CEPH_OSD_OP_READ;
1298 flags = CEPH_OSD_FLAG_READ;
4634246d 1299 snapc = NULL;
0d7dbfce 1300 snapid = rbd_dev->spec->snap_id;
ff2e4bb5
AE
1301 payload_len = 0;
1302 }
602adf40 1303
57cfc106
AE
1304 ret = -ENOMEM;
1305 ops = rbd_create_rw_ops(1, opcode, payload_len);
1306 if (!ops)
602adf40
YS
1307 goto done;
1308
1309 /* we've taken care of segment sizes earlier when we
1310 cloned the bios. We should never have a segment
1311 truncated at this point */
aafb230e 1312 rbd_assert(seg_len == len);
602adf40
YS
1313
1314 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1315 seg_name, seg_ofs, seg_len,
1316 bio,
1317 NULL, 0,
1318 flags,
1319 ops,
1fec7093 1320 coll, coll_index,
59c2be1e 1321 rbd_req_cb, 0, NULL);
11f77002
SW
1322
1323 rbd_destroy_ops(ops);
602adf40
YS
1324done:
1325 kfree(seg_name);
1326 return ret;
1327}
1328
602adf40
YS
1329/*
1330 * Request sync osd read
1331 */
0ce1a794 1332static int rbd_req_sync_read(struct rbd_device *rbd_dev,
602adf40 1333 u64 snapid,
aded07ea 1334 const char *object_name,
602adf40 1335 u64 ofs, u64 len,
59c2be1e
YS
1336 char *buf,
1337 u64 *ver)
602adf40 1338{
913d2fdc
AE
1339 struct ceph_osd_req_op *ops;
1340 int ret;
1341
1342 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1343 if (!ops)
1344 return -ENOMEM;
1345
1346 ret = rbd_req_sync_op(rbd_dev, NULL,
b06e6a6b 1347 snapid,
602adf40 1348 CEPH_OSD_FLAG_READ,
913d2fdc
AE
1349 ops, object_name, ofs, len, buf, NULL, ver);
1350 rbd_destroy_ops(ops);
1351
1352 return ret;
602adf40
YS
1353}
1354
1355/*
59c2be1e
YS
1356 * Request sync osd watch
1357 */
0ce1a794 1358static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
59c2be1e 1359 u64 ver,
7f0a24d8 1360 u64 notify_id)
59c2be1e
YS
1361{
1362 struct ceph_osd_req_op *ops;
11f77002
SW
1363 int ret;
1364
57cfc106
AE
1365 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
1366 if (!ops)
1367 return -ENOMEM;
59c2be1e 1368
a71b891b 1369 ops[0].watch.ver = cpu_to_le64(ver);
59c2be1e
YS
1370 ops[0].watch.cookie = notify_id;
1371 ops[0].watch.flag = 0;
1372
0ce1a794 1373 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
7f0a24d8 1374 rbd_dev->header_name, 0, 0, NULL,
ad4f232f 1375 NULL, 0,
59c2be1e
YS
1376 CEPH_OSD_FLAG_READ,
1377 ops,
1fec7093 1378 NULL, 0,
59c2be1e
YS
1379 rbd_simple_req_cb, 0, NULL);
1380
1381 rbd_destroy_ops(ops);
1382 return ret;
1383}
1384
1385static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1386{
0ce1a794 1387 struct rbd_device *rbd_dev = (struct rbd_device *)data;
a71b891b 1388 u64 hver;
13143d2d
SW
1389 int rc;
1390
0ce1a794 1391 if (!rbd_dev)
59c2be1e
YS
1392 return;
1393
bd919d45
AE
1394 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1395 rbd_dev->header_name, (unsigned long long) notify_id,
1396 (unsigned int) opcode);
117973fb 1397 rc = rbd_dev_refresh(rbd_dev, &hver);
13143d2d 1398 if (rc)
f0f8cef5 1399 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
0ce1a794 1400 " update snaps: %d\n", rbd_dev->major, rc);
59c2be1e 1401
7f0a24d8 1402 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
59c2be1e
YS
1403}
1404
1405/*
1406 * Request sync osd watch
1407 */
0e6f322d 1408static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
59c2be1e
YS
1409{
1410 struct ceph_osd_req_op *ops;
0ce1a794 1411 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
57cfc106 1412 int ret;
59c2be1e 1413
57cfc106
AE
1414 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1415 if (!ops)
1416 return -ENOMEM;
59c2be1e
YS
1417
1418 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
0ce1a794 1419 (void *)rbd_dev, &rbd_dev->watch_event);
59c2be1e
YS
1420 if (ret < 0)
1421 goto fail;
1422
0e6f322d 1423 ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
0ce1a794 1424 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
59c2be1e
YS
1425 ops[0].watch.flag = 1;
1426
0ce1a794 1427 ret = rbd_req_sync_op(rbd_dev, NULL,
59c2be1e 1428 CEPH_NOSNAP,
59c2be1e
YS
1429 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1430 ops,
0e6f322d
AE
1431 rbd_dev->header_name,
1432 0, 0, NULL,
0ce1a794 1433 &rbd_dev->watch_request, NULL);
59c2be1e
YS
1434
1435 if (ret < 0)
1436 goto fail_event;
1437
1438 rbd_destroy_ops(ops);
1439 return 0;
1440
1441fail_event:
0ce1a794
AE
1442 ceph_osdc_cancel_event(rbd_dev->watch_event);
1443 rbd_dev->watch_event = NULL;
59c2be1e
YS
1444fail:
1445 rbd_destroy_ops(ops);
1446 return ret;
1447}
1448
79e3057c
YS
1449/*
1450 * Request sync osd unwatch
1451 */
070c633f 1452static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
79e3057c
YS
1453{
1454 struct ceph_osd_req_op *ops;
57cfc106 1455 int ret;
79e3057c 1456
57cfc106
AE
1457 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1458 if (!ops)
1459 return -ENOMEM;
79e3057c
YS
1460
1461 ops[0].watch.ver = 0;
0ce1a794 1462 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
79e3057c
YS
1463 ops[0].watch.flag = 0;
1464
0ce1a794 1465 ret = rbd_req_sync_op(rbd_dev, NULL,
79e3057c 1466 CEPH_NOSNAP,
79e3057c
YS
1467 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1468 ops,
070c633f
AE
1469 rbd_dev->header_name,
1470 0, 0, NULL, NULL, NULL);
1471
79e3057c
YS
1472
1473 rbd_destroy_ops(ops);
0ce1a794
AE
1474 ceph_osdc_cancel_event(rbd_dev->watch_event);
1475 rbd_dev->watch_event = NULL;
79e3057c
YS
1476 return ret;
1477}
1478
602adf40 1479/*
3cb4a687 1480 * Synchronous osd object method call
602adf40 1481 */
0ce1a794 1482static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
aded07ea
AE
1483 const char *object_name,
1484 const char *class_name,
1485 const char *method_name,
3cb4a687
AE
1486 const char *outbound,
1487 size_t outbound_size,
f8d4de6e
AE
1488 char *inbound,
1489 size_t inbound_size,
3cb4a687 1490 int flags,
59c2be1e 1491 u64 *ver)
602adf40
YS
1492{
1493 struct ceph_osd_req_op *ops;
aded07ea
AE
1494 int class_name_len = strlen(class_name);
1495 int method_name_len = strlen(method_name);
3cb4a687 1496 int payload_size;
57cfc106
AE
1497 int ret;
1498
3cb4a687
AE
1499 /*
1500 * Any input parameters required by the method we're calling
1501 * will be sent along with the class and method names as
1502 * part of the message payload. That data and its size are
1503 * supplied via the indata and indata_len fields (named from
1504 * the perspective of the server side) in the OSD request
1505 * operation.
1506 */
1507 payload_size = class_name_len + method_name_len + outbound_size;
1508 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
57cfc106
AE
1509 if (!ops)
1510 return -ENOMEM;
602adf40 1511
aded07ea
AE
1512 ops[0].cls.class_name = class_name;
1513 ops[0].cls.class_len = (__u8) class_name_len;
1514 ops[0].cls.method_name = method_name;
1515 ops[0].cls.method_len = (__u8) method_name_len;
602adf40 1516 ops[0].cls.argc = 0;
3cb4a687
AE
1517 ops[0].cls.indata = outbound;
1518 ops[0].cls.indata_len = outbound_size;
602adf40 1519
0ce1a794 1520 ret = rbd_req_sync_op(rbd_dev, NULL,
602adf40 1521 CEPH_NOSNAP,
3cb4a687 1522 flags, ops,
f8d4de6e
AE
1523 object_name, 0, inbound_size, inbound,
1524 NULL, ver);
602adf40
YS
1525
1526 rbd_destroy_ops(ops);
1527
1528 dout("cls_exec returned %d\n", ret);
1529 return ret;
1530}
1531
1fec7093
YS
1532static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1533{
1534 struct rbd_req_coll *coll =
1535 kzalloc(sizeof(struct rbd_req_coll) +
1536 sizeof(struct rbd_req_status) * num_reqs,
1537 GFP_ATOMIC);
1538
1539 if (!coll)
1540 return NULL;
1541 coll->total = num_reqs;
1542 kref_init(&coll->kref);
1543 return coll;
1544}
1545
602adf40
YS
1546/*
1547 * block device queue callback
1548 */
1549static void rbd_rq_fn(struct request_queue *q)
1550{
1551 struct rbd_device *rbd_dev = q->queuedata;
1552 struct request *rq;
602adf40 1553
00f1f36f 1554 while ((rq = blk_fetch_request(q))) {
602adf40 1555 struct bio *bio;
602adf40 1556 bool do_write;
bd919d45 1557 unsigned int size;
602adf40 1558 u64 ofs;
1fec7093
YS
1559 int num_segs, cur_seg = 0;
1560 struct rbd_req_coll *coll;
d1d25646 1561 struct ceph_snap_context *snapc;
f7760dad 1562 unsigned int bio_offset;
602adf40 1563
602adf40
YS
1564 dout("fetched request\n");
1565
1566 /* filter out block requests we don't understand */
1567 if ((rq->cmd_type != REQ_TYPE_FS)) {
1568 __blk_end_request_all(rq, 0);
00f1f36f 1569 continue;
602adf40
YS
1570 }
1571
1572 /* deduce our operation (read, write) */
1573 do_write = (rq_data_dir(rq) == WRITE);
f84344f3 1574 if (do_write && rbd_dev->mapping.read_only) {
602adf40 1575 __blk_end_request_all(rq, -EROFS);
00f1f36f 1576 continue;
602adf40
YS
1577 }
1578
1579 spin_unlock_irq(q->queue_lock);
1580
d1d25646 1581 down_read(&rbd_dev->header_rwsem);
e88a36ec 1582
daba5fdb 1583 if (!rbd_dev->exists) {
0d7dbfce 1584 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
e88a36ec 1585 up_read(&rbd_dev->header_rwsem);
d1d25646
JD
1586 dout("request for non-existent snapshot");
1587 spin_lock_irq(q->queue_lock);
1588 __blk_end_request_all(rq, -ENXIO);
1589 continue;
e88a36ec
JD
1590 }
1591
d1d25646
JD
1592 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1593
1594 up_read(&rbd_dev->header_rwsem);
1595
f7760dad
AE
1596 size = blk_rq_bytes(rq);
1597 ofs = blk_rq_pos(rq) * SECTOR_SIZE;
1598 bio = rq->bio;
1599
602adf40
YS
1600 dout("%s 0x%x bytes at 0x%llx\n",
1601 do_write ? "write" : "read",
bd919d45 1602 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
602adf40 1603
1fec7093 1604 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
df111be6
AE
1605 if (num_segs <= 0) {
1606 spin_lock_irq(q->queue_lock);
1607 __blk_end_request_all(rq, num_segs);
1608 ceph_put_snap_context(snapc);
1609 continue;
1610 }
1fec7093
YS
1611 coll = rbd_alloc_coll(num_segs);
1612 if (!coll) {
1613 spin_lock_irq(q->queue_lock);
1614 __blk_end_request_all(rq, -ENOMEM);
d1d25646 1615 ceph_put_snap_context(snapc);
00f1f36f 1616 continue;
1fec7093
YS
1617 }
1618
f7760dad 1619 bio_offset = 0;
602adf40 1620 do {
f7760dad
AE
1621 u64 limit = rbd_segment_length(rbd_dev, ofs, size);
1622 unsigned int chain_size;
1623 struct bio *bio_chain;
1624
1625 BUG_ON(limit > (u64) UINT_MAX);
1626 chain_size = (unsigned int) limit;
bd919d45 1627 dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
f7760dad 1628
1fec7093 1629 kref_get(&coll->kref);
f7760dad
AE
1630
1631 /* Pass a cloned bio chain via an osd request */
1632
1633 bio_chain = bio_chain_clone_range(&bio,
1634 &bio_offset, chain_size,
1635 GFP_ATOMIC);
1636 if (bio_chain)
4634246d 1637 (void) rbd_do_op(rq, rbd_dev, snapc,
f7760dad
AE
1638 ofs, chain_size,
1639 bio_chain, coll, cur_seg);
4634246d 1640 else
1fec7093 1641 rbd_coll_end_req_index(rq, coll, cur_seg,
f7760dad
AE
1642 -ENOMEM, chain_size);
1643 size -= chain_size;
1644 ofs += chain_size;
602adf40 1645
1fec7093 1646 cur_seg++;
602adf40 1647 } while (size > 0);
1fec7093 1648 kref_put(&coll->kref, rbd_coll_release);
602adf40 1649
602adf40 1650 spin_lock_irq(q->queue_lock);
d1d25646
JD
1651
1652 ceph_put_snap_context(snapc);
602adf40
YS
1653 }
1654}
1655
1656/*
1657 * a queue callback. Makes sure that we don't create a bio that spans across
1658 * multiple osd objects. One exception would be with a single page bios,
f7760dad 1659 * which we handle later at bio_chain_clone_range()
602adf40
YS
1660 */
1661static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1662 struct bio_vec *bvec)
1663{
1664 struct rbd_device *rbd_dev = q->queuedata;
e5cfeed2
AE
1665 sector_t sector_offset;
1666 sector_t sectors_per_obj;
1667 sector_t obj_sector_offset;
1668 int ret;
1669
1670 /*
1671 * Find how far into its rbd object the partition-relative
1672 * bio start sector is to offset relative to the enclosing
1673 * device.
1674 */
1675 sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
1676 sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1677 obj_sector_offset = sector_offset & (sectors_per_obj - 1);
1678
1679 /*
1680 * Compute the number of bytes from that offset to the end
1681 * of the object. Account for what's already used by the bio.
1682 */
1683 ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
1684 if (ret > bmd->bi_size)
1685 ret -= bmd->bi_size;
1686 else
1687 ret = 0;
1688
1689 /*
1690 * Don't send back more than was asked for. And if the bio
1691 * was empty, let the whole thing through because: "Note
1692 * that a block device *must* allow a single page to be
1693 * added to an empty bio."
1694 */
1695 rbd_assert(bvec->bv_len <= PAGE_SIZE);
1696 if (ret > (int) bvec->bv_len || !bmd->bi_size)
1697 ret = (int) bvec->bv_len;
1698
1699 return ret;
602adf40
YS
1700}
1701
1702static void rbd_free_disk(struct rbd_device *rbd_dev)
1703{
1704 struct gendisk *disk = rbd_dev->disk;
1705
1706 if (!disk)
1707 return;
1708
602adf40
YS
1709 if (disk->flags & GENHD_FL_UP)
1710 del_gendisk(disk);
1711 if (disk->queue)
1712 blk_cleanup_queue(disk->queue);
1713 put_disk(disk);
1714}
1715
1716/*
4156d998
AE
1717 * Read the complete header for the given rbd device.
1718 *
1719 * Returns a pointer to a dynamically-allocated buffer containing
1720 * the complete and validated header. Caller can pass the address
1721 * of a variable that will be filled in with the version of the
1722 * header object at the time it was read.
1723 *
1724 * Returns a pointer-coded errno if a failure occurs.
602adf40 1725 */
4156d998
AE
1726static struct rbd_image_header_ondisk *
1727rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
602adf40 1728{
4156d998 1729 struct rbd_image_header_ondisk *ondisk = NULL;
50f7c4c9 1730 u32 snap_count = 0;
4156d998
AE
1731 u64 names_size = 0;
1732 u32 want_count;
1733 int ret;
602adf40 1734
00f1f36f 1735 /*
4156d998
AE
1736 * The complete header will include an array of its 64-bit
1737 * snapshot ids, followed by the names of those snapshots as
1738 * a contiguous block of NUL-terminated strings. Note that
1739 * the number of snapshots could change by the time we read
1740 * it in, in which case we re-read it.
00f1f36f 1741 */
4156d998
AE
1742 do {
1743 size_t size;
1744
1745 kfree(ondisk);
1746
1747 size = sizeof (*ondisk);
1748 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1749 size += names_size;
1750 ondisk = kmalloc(size, GFP_KERNEL);
1751 if (!ondisk)
1752 return ERR_PTR(-ENOMEM);
1753
1754 ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
0bed54dc 1755 rbd_dev->header_name,
4156d998
AE
1756 0, size,
1757 (char *) ondisk, version);
1758
1759 if (ret < 0)
1760 goto out_err;
1761 if (WARN_ON((size_t) ret < size)) {
1762 ret = -ENXIO;
1763 pr_warning("short header read for image %s"
1764 " (want %zd got %d)\n",
0d7dbfce 1765 rbd_dev->spec->image_name, size, ret);
4156d998
AE
1766 goto out_err;
1767 }
1768 if (!rbd_dev_ondisk_valid(ondisk)) {
1769 ret = -ENXIO;
1770 pr_warning("invalid header for image %s\n",
0d7dbfce 1771 rbd_dev->spec->image_name);
4156d998 1772 goto out_err;
81e759fb 1773 }
602adf40 1774
4156d998
AE
1775 names_size = le64_to_cpu(ondisk->snap_names_len);
1776 want_count = snap_count;
1777 snap_count = le32_to_cpu(ondisk->snap_count);
1778 } while (snap_count != want_count);
00f1f36f 1779
4156d998 1780 return ondisk;
00f1f36f 1781
4156d998
AE
1782out_err:
1783 kfree(ondisk);
1784
1785 return ERR_PTR(ret);
1786}
1787
1788/*
1789 * reload the ondisk the header
1790 */
1791static int rbd_read_header(struct rbd_device *rbd_dev,
1792 struct rbd_image_header *header)
1793{
1794 struct rbd_image_header_ondisk *ondisk;
1795 u64 ver = 0;
1796 int ret;
602adf40 1797
4156d998
AE
1798 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1799 if (IS_ERR(ondisk))
1800 return PTR_ERR(ondisk);
1801 ret = rbd_header_from_disk(header, ondisk);
1802 if (ret >= 0)
1803 header->obj_version = ver;
1804 kfree(ondisk);
1805
1806 return ret;
602adf40
YS
1807}
1808
41f38c2b 1809static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
dfc5606d
YS
1810{
1811 struct rbd_snap *snap;
a0593290 1812 struct rbd_snap *next;
dfc5606d 1813
a0593290 1814 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
41f38c2b 1815 rbd_remove_snap_dev(snap);
dfc5606d
YS
1816}
1817
9478554a
AE
1818static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
1819{
1820 sector_t size;
1821
0d7dbfce 1822 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
9478554a
AE
1823 return;
1824
1825 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
1826 dout("setting size to %llu sectors", (unsigned long long) size);
1827 rbd_dev->mapping.size = (u64) size;
1828 set_capacity(rbd_dev->disk, size);
1829}
1830
602adf40
YS
1831/*
1832 * only read the first part of the ondisk header, without the snaps info
1833 */
117973fb 1834static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
602adf40
YS
1835{
1836 int ret;
1837 struct rbd_image_header h;
602adf40
YS
1838
1839 ret = rbd_read_header(rbd_dev, &h);
1840 if (ret < 0)
1841 return ret;
1842
a51aa0c0
JD
1843 down_write(&rbd_dev->header_rwsem);
1844
9478554a
AE
1845 /* Update image size, and check for resize of mapped image */
1846 rbd_dev->header.image_size = h.image_size;
1847 rbd_update_mapping_size(rbd_dev);
9db4b3e3 1848
849b4260 1849 /* rbd_dev->header.object_prefix shouldn't change */
602adf40 1850 kfree(rbd_dev->header.snap_sizes);
849b4260 1851 kfree(rbd_dev->header.snap_names);
d1d25646
JD
1852 /* osd requests may still refer to snapc */
1853 ceph_put_snap_context(rbd_dev->header.snapc);
602adf40 1854
b813623a
AE
1855 if (hver)
1856 *hver = h.obj_version;
a71b891b 1857 rbd_dev->header.obj_version = h.obj_version;
93a24e08 1858 rbd_dev->header.image_size = h.image_size;
602adf40
YS
1859 rbd_dev->header.snapc = h.snapc;
1860 rbd_dev->header.snap_names = h.snap_names;
1861 rbd_dev->header.snap_sizes = h.snap_sizes;
849b4260
AE
1862 /* Free the extra copy of the object prefix */
1863 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1864 kfree(h.object_prefix);
1865
304f6808
AE
1866 ret = rbd_dev_snaps_update(rbd_dev);
1867 if (!ret)
1868 ret = rbd_dev_snaps_register(rbd_dev);
dfc5606d 1869
c666601a 1870 up_write(&rbd_dev->header_rwsem);
602adf40 1871
dfc5606d 1872 return ret;
602adf40
YS
1873}
1874
117973fb 1875static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
1fe5e993
AE
1876{
1877 int ret;
1878
117973fb 1879 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1fe5e993 1880 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
117973fb
AE
1881 if (rbd_dev->image_format == 1)
1882 ret = rbd_dev_v1_refresh(rbd_dev, hver);
1883 else
1884 ret = rbd_dev_v2_refresh(rbd_dev, hver);
1fe5e993
AE
1885 mutex_unlock(&ctl_mutex);
1886
1887 return ret;
1888}
1889
602adf40
YS
1890static int rbd_init_disk(struct rbd_device *rbd_dev)
1891{
1892 struct gendisk *disk;
1893 struct request_queue *q;
593a9e7b 1894 u64 segment_size;
602adf40 1895
602adf40 1896 /* create gendisk info */
602adf40
YS
1897 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1898 if (!disk)
1fcdb8aa 1899 return -ENOMEM;
602adf40 1900
f0f8cef5 1901 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
de71a297 1902 rbd_dev->dev_id);
602adf40
YS
1903 disk->major = rbd_dev->major;
1904 disk->first_minor = 0;
1905 disk->fops = &rbd_bd_ops;
1906 disk->private_data = rbd_dev;
1907
1908 /* init rq */
602adf40
YS
1909 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1910 if (!q)
1911 goto out_disk;
029bcbd8 1912
593a9e7b
AE
1913 /* We use the default size, but let's be explicit about it. */
1914 blk_queue_physical_block_size(q, SECTOR_SIZE);
1915
029bcbd8 1916 /* set io sizes to object size */
593a9e7b
AE
1917 segment_size = rbd_obj_bytes(&rbd_dev->header);
1918 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1919 blk_queue_max_segment_size(q, segment_size);
1920 blk_queue_io_min(q, segment_size);
1921 blk_queue_io_opt(q, segment_size);
029bcbd8 1922
602adf40
YS
1923 blk_queue_merge_bvec(q, rbd_merge_bvec);
1924 disk->queue = q;
1925
1926 q->queuedata = rbd_dev;
1927
1928 rbd_dev->disk = disk;
602adf40 1929
12f02944
AE
1930 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
1931
602adf40 1932 return 0;
602adf40
YS
1933out_disk:
1934 put_disk(disk);
1fcdb8aa
AE
1935
1936 return -ENOMEM;
602adf40
YS
1937}
1938
dfc5606d
YS
1939/*
1940 sysfs
1941*/
1942
593a9e7b
AE
1943static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1944{
1945 return container_of(dev, struct rbd_device, dev);
1946}
1947
dfc5606d
YS
1948static ssize_t rbd_size_show(struct device *dev,
1949 struct device_attribute *attr, char *buf)
1950{
593a9e7b 1951 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
a51aa0c0
JD
1952 sector_t size;
1953
1954 down_read(&rbd_dev->header_rwsem);
1955 size = get_capacity(rbd_dev->disk);
1956 up_read(&rbd_dev->header_rwsem);
dfc5606d 1957
a51aa0c0 1958 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
dfc5606d
YS
1959}
1960
34b13184
AE
1961/*
1962 * Note this shows the features for whatever's mapped, which is not
1963 * necessarily the base image.
1964 */
1965static ssize_t rbd_features_show(struct device *dev,
1966 struct device_attribute *attr, char *buf)
1967{
1968 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1969
1970 return sprintf(buf, "0x%016llx\n",
1971 (unsigned long long) rbd_dev->mapping.features);
1972}
1973
dfc5606d
YS
1974static ssize_t rbd_major_show(struct device *dev,
1975 struct device_attribute *attr, char *buf)
1976{
593a9e7b 1977 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
602adf40 1978
dfc5606d
YS
1979 return sprintf(buf, "%d\n", rbd_dev->major);
1980}
1981
1982static ssize_t rbd_client_id_show(struct device *dev,
1983 struct device_attribute *attr, char *buf)
602adf40 1984{
593a9e7b 1985 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 1986
1dbb4399
AE
1987 return sprintf(buf, "client%lld\n",
1988 ceph_client_id(rbd_dev->rbd_client->client));
602adf40
YS
1989}
1990
dfc5606d
YS
1991static ssize_t rbd_pool_show(struct device *dev,
1992 struct device_attribute *attr, char *buf)
602adf40 1993{
593a9e7b 1994 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 1995
0d7dbfce 1996 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
dfc5606d
YS
1997}
1998
9bb2f334
AE
1999static ssize_t rbd_pool_id_show(struct device *dev,
2000 struct device_attribute *attr, char *buf)
2001{
2002 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2003
0d7dbfce
AE
2004 return sprintf(buf, "%llu\n",
2005 (unsigned long long) rbd_dev->spec->pool_id);
9bb2f334
AE
2006}
2007
dfc5606d
YS
2008static ssize_t rbd_name_show(struct device *dev,
2009 struct device_attribute *attr, char *buf)
2010{
593a9e7b 2011 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2012
a92ffdf8
AE
2013 if (rbd_dev->spec->image_name)
2014 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2015
2016 return sprintf(buf, "(unknown)\n");
dfc5606d
YS
2017}
2018
589d30e0
AE
2019static ssize_t rbd_image_id_show(struct device *dev,
2020 struct device_attribute *attr, char *buf)
2021{
2022 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2023
0d7dbfce 2024 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
589d30e0
AE
2025}
2026
34b13184
AE
2027/*
2028 * Shows the name of the currently-mapped snapshot (or
2029 * RBD_SNAP_HEAD_NAME for the base image).
2030 */
dfc5606d
YS
2031static ssize_t rbd_snap_show(struct device *dev,
2032 struct device_attribute *attr,
2033 char *buf)
2034{
593a9e7b 2035 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2036
0d7dbfce 2037 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
dfc5606d
YS
2038}
2039
86b00e0d
AE
2040/*
2041 * For an rbd v2 image, shows the pool id, image id, and snapshot id
2042 * for the parent image. If there is no parent, simply shows
2043 * "(no parent image)".
2044 */
2045static ssize_t rbd_parent_show(struct device *dev,
2046 struct device_attribute *attr,
2047 char *buf)
2048{
2049 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2050 struct rbd_spec *spec = rbd_dev->parent_spec;
2051 int count;
2052 char *bufp = buf;
2053
2054 if (!spec)
2055 return sprintf(buf, "(no parent image)\n");
2056
2057 count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
2058 (unsigned long long) spec->pool_id, spec->pool_name);
2059 if (count < 0)
2060 return count;
2061 bufp += count;
2062
2063 count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
2064 spec->image_name ? spec->image_name : "(unknown)");
2065 if (count < 0)
2066 return count;
2067 bufp += count;
2068
2069 count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
2070 (unsigned long long) spec->snap_id, spec->snap_name);
2071 if (count < 0)
2072 return count;
2073 bufp += count;
2074
2075 count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
2076 if (count < 0)
2077 return count;
2078 bufp += count;
2079
2080 return (ssize_t) (bufp - buf);
2081}
2082
dfc5606d
YS
2083static ssize_t rbd_image_refresh(struct device *dev,
2084 struct device_attribute *attr,
2085 const char *buf,
2086 size_t size)
2087{
593a9e7b 2088 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
b813623a 2089 int ret;
602adf40 2090
117973fb 2091 ret = rbd_dev_refresh(rbd_dev, NULL);
b813623a
AE
2092
2093 return ret < 0 ? ret : size;
dfc5606d 2094}
602adf40 2095
dfc5606d 2096static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
34b13184 2097static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
dfc5606d
YS
2098static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
2099static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
2100static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
9bb2f334 2101static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
dfc5606d 2102static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
589d30e0 2103static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
dfc5606d
YS
2104static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
2105static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
86b00e0d 2106static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
dfc5606d
YS
2107
2108static struct attribute *rbd_attrs[] = {
2109 &dev_attr_size.attr,
34b13184 2110 &dev_attr_features.attr,
dfc5606d
YS
2111 &dev_attr_major.attr,
2112 &dev_attr_client_id.attr,
2113 &dev_attr_pool.attr,
9bb2f334 2114 &dev_attr_pool_id.attr,
dfc5606d 2115 &dev_attr_name.attr,
589d30e0 2116 &dev_attr_image_id.attr,
dfc5606d 2117 &dev_attr_current_snap.attr,
86b00e0d 2118 &dev_attr_parent.attr,
dfc5606d 2119 &dev_attr_refresh.attr,
dfc5606d
YS
2120 NULL
2121};
2122
2123static struct attribute_group rbd_attr_group = {
2124 .attrs = rbd_attrs,
2125};
2126
2127static const struct attribute_group *rbd_attr_groups[] = {
2128 &rbd_attr_group,
2129 NULL
2130};
2131
2132static void rbd_sysfs_dev_release(struct device *dev)
2133{
2134}
2135
2136static struct device_type rbd_device_type = {
2137 .name = "rbd",
2138 .groups = rbd_attr_groups,
2139 .release = rbd_sysfs_dev_release,
2140};
2141
2142
2143/*
2144 sysfs - snapshots
2145*/
2146
2147static ssize_t rbd_snap_size_show(struct device *dev,
2148 struct device_attribute *attr,
2149 char *buf)
2150{
2151 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2152
3591538f 2153 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
dfc5606d
YS
2154}
2155
2156static ssize_t rbd_snap_id_show(struct device *dev,
2157 struct device_attribute *attr,
2158 char *buf)
2159{
2160 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2161
3591538f 2162 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
dfc5606d
YS
2163}
2164
34b13184
AE
2165static ssize_t rbd_snap_features_show(struct device *dev,
2166 struct device_attribute *attr,
2167 char *buf)
2168{
2169 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2170
2171 return sprintf(buf, "0x%016llx\n",
2172 (unsigned long long) snap->features);
2173}
2174
dfc5606d
YS
2175static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2176static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
34b13184 2177static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
dfc5606d
YS
2178
2179static struct attribute *rbd_snap_attrs[] = {
2180 &dev_attr_snap_size.attr,
2181 &dev_attr_snap_id.attr,
34b13184 2182 &dev_attr_snap_features.attr,
dfc5606d
YS
2183 NULL,
2184};
2185
2186static struct attribute_group rbd_snap_attr_group = {
2187 .attrs = rbd_snap_attrs,
2188};
2189
2190static void rbd_snap_dev_release(struct device *dev)
2191{
2192 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2193 kfree(snap->name);
2194 kfree(snap);
2195}
2196
2197static const struct attribute_group *rbd_snap_attr_groups[] = {
2198 &rbd_snap_attr_group,
2199 NULL
2200};
2201
2202static struct device_type rbd_snap_device_type = {
2203 .groups = rbd_snap_attr_groups,
2204 .release = rbd_snap_dev_release,
2205};
2206
8b8fb99c
AE
2207static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
2208{
2209 kref_get(&spec->kref);
2210
2211 return spec;
2212}
2213
2214static void rbd_spec_free(struct kref *kref);
2215static void rbd_spec_put(struct rbd_spec *spec)
2216{
2217 if (spec)
2218 kref_put(&spec->kref, rbd_spec_free);
2219}
2220
2221static struct rbd_spec *rbd_spec_alloc(void)
2222{
2223 struct rbd_spec *spec;
2224
2225 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
2226 if (!spec)
2227 return NULL;
2228 kref_init(&spec->kref);
2229
2230 rbd_spec_put(rbd_spec_get(spec)); /* TEMPORARY */
2231
2232 return spec;
2233}
2234
2235static void rbd_spec_free(struct kref *kref)
2236{
2237 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
2238
2239 kfree(spec->pool_name);
2240 kfree(spec->image_id);
2241 kfree(spec->image_name);
2242 kfree(spec->snap_name);
2243 kfree(spec);
2244}
2245
c53d5893
AE
2246struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2247 struct rbd_spec *spec)
2248{
2249 struct rbd_device *rbd_dev;
2250
2251 rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2252 if (!rbd_dev)
2253 return NULL;
2254
2255 spin_lock_init(&rbd_dev->lock);
2256 INIT_LIST_HEAD(&rbd_dev->node);
2257 INIT_LIST_HEAD(&rbd_dev->snaps);
2258 init_rwsem(&rbd_dev->header_rwsem);
2259
2260 rbd_dev->spec = spec;
2261 rbd_dev->rbd_client = rbdc;
2262
2263 return rbd_dev;
2264}
2265
2266static void rbd_dev_destroy(struct rbd_device *rbd_dev)
2267{
86b00e0d 2268 rbd_spec_put(rbd_dev->parent_spec);
c53d5893
AE
2269 kfree(rbd_dev->header_name);
2270 rbd_put_client(rbd_dev->rbd_client);
2271 rbd_spec_put(rbd_dev->spec);
2272 kfree(rbd_dev);
2273}
2274
304f6808
AE
2275static bool rbd_snap_registered(struct rbd_snap *snap)
2276{
2277 bool ret = snap->dev.type == &rbd_snap_device_type;
2278 bool reg = device_is_registered(&snap->dev);
2279
2280 rbd_assert(!ret ^ reg);
2281
2282 return ret;
2283}
2284
41f38c2b 2285static void rbd_remove_snap_dev(struct rbd_snap *snap)
dfc5606d
YS
2286{
2287 list_del(&snap->node);
304f6808
AE
2288 if (device_is_registered(&snap->dev))
2289 device_unregister(&snap->dev);
dfc5606d
YS
2290}
2291
14e7085d 2292static int rbd_register_snap_dev(struct rbd_snap *snap,
dfc5606d
YS
2293 struct device *parent)
2294{
2295 struct device *dev = &snap->dev;
2296 int ret;
2297
2298 dev->type = &rbd_snap_device_type;
2299 dev->parent = parent;
2300 dev->release = rbd_snap_dev_release;
d4b125e9 2301 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
304f6808
AE
2302 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2303
dfc5606d
YS
2304 ret = device_register(dev);
2305
2306 return ret;
2307}
2308
4e891e0a 2309static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
c8d18425 2310 const char *snap_name,
34b13184
AE
2311 u64 snap_id, u64 snap_size,
2312 u64 snap_features)
dfc5606d 2313{
4e891e0a 2314 struct rbd_snap *snap;
dfc5606d 2315 int ret;
4e891e0a
AE
2316
2317 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
dfc5606d 2318 if (!snap)
4e891e0a
AE
2319 return ERR_PTR(-ENOMEM);
2320
2321 ret = -ENOMEM;
c8d18425 2322 snap->name = kstrdup(snap_name, GFP_KERNEL);
4e891e0a
AE
2323 if (!snap->name)
2324 goto err;
2325
c8d18425
AE
2326 snap->id = snap_id;
2327 snap->size = snap_size;
34b13184 2328 snap->features = snap_features;
4e891e0a
AE
2329
2330 return snap;
2331
dfc5606d
YS
2332err:
2333 kfree(snap->name);
2334 kfree(snap);
4e891e0a
AE
2335
2336 return ERR_PTR(ret);
dfc5606d
YS
2337}
2338
cd892126
AE
2339static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2340 u64 *snap_size, u64 *snap_features)
2341{
2342 char *snap_name;
2343
2344 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2345
2346 *snap_size = rbd_dev->header.snap_sizes[which];
2347 *snap_features = 0; /* No features for v1 */
2348
2349 /* Skip over names until we find the one we are looking for */
2350
2351 snap_name = rbd_dev->header.snap_names;
2352 while (which--)
2353 snap_name += strlen(snap_name) + 1;
2354
2355 return snap_name;
2356}
2357
9d475de5
AE
2358/*
2359 * Get the size and object order for an image snapshot, or if
2360 * snap_id is CEPH_NOSNAP, gets this information for the base
2361 * image.
2362 */
2363static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
2364 u8 *order, u64 *snap_size)
2365{
2366 __le64 snapid = cpu_to_le64(snap_id);
2367 int ret;
2368 struct {
2369 u8 order;
2370 __le64 size;
2371 } __attribute__ ((packed)) size_buf = { 0 };
2372
2373 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2374 "rbd", "get_size",
2375 (char *) &snapid, sizeof (snapid),
2376 (char *) &size_buf, sizeof (size_buf),
2377 CEPH_OSD_FLAG_READ, NULL);
2378 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2379 if (ret < 0)
2380 return ret;
2381
2382 *order = size_buf.order;
2383 *snap_size = le64_to_cpu(size_buf.size);
2384
2385 dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
2386 (unsigned long long) snap_id, (unsigned int) *order,
2387 (unsigned long long) *snap_size);
2388
2389 return 0;
2390}
2391
2392static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
2393{
2394 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
2395 &rbd_dev->header.obj_order,
2396 &rbd_dev->header.image_size);
2397}
2398
1e130199
AE
2399static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
2400{
2401 void *reply_buf;
2402 int ret;
2403 void *p;
2404
2405 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
2406 if (!reply_buf)
2407 return -ENOMEM;
2408
2409 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2410 "rbd", "get_object_prefix",
2411 NULL, 0,
2412 reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
2413 CEPH_OSD_FLAG_READ, NULL);
2414 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2415 if (ret < 0)
2416 goto out;
a0ea3a40 2417 ret = 0; /* rbd_req_sync_exec() can return positive */
1e130199
AE
2418
2419 p = reply_buf;
2420 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
2421 p + RBD_OBJ_PREFIX_LEN_MAX,
2422 NULL, GFP_NOIO);
2423
2424 if (IS_ERR(rbd_dev->header.object_prefix)) {
2425 ret = PTR_ERR(rbd_dev->header.object_prefix);
2426 rbd_dev->header.object_prefix = NULL;
2427 } else {
2428 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
2429 }
2430
2431out:
2432 kfree(reply_buf);
2433
2434 return ret;
2435}
2436
b1b5402a
AE
2437static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2438 u64 *snap_features)
2439{
2440 __le64 snapid = cpu_to_le64(snap_id);
2441 struct {
2442 __le64 features;
2443 __le64 incompat;
2444 } features_buf = { 0 };
d889140c 2445 u64 incompat;
b1b5402a
AE
2446 int ret;
2447
2448 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2449 "rbd", "get_features",
2450 (char *) &snapid, sizeof (snapid),
2451 (char *) &features_buf, sizeof (features_buf),
2452 CEPH_OSD_FLAG_READ, NULL);
2453 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2454 if (ret < 0)
2455 return ret;
d889140c
AE
2456
2457 incompat = le64_to_cpu(features_buf.incompat);
2458 if (incompat & ~RBD_FEATURES_ALL)
2459 return -ENOTSUPP;
2460
b1b5402a
AE
2461 *snap_features = le64_to_cpu(features_buf.features);
2462
2463 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
2464 (unsigned long long) snap_id,
2465 (unsigned long long) *snap_features,
2466 (unsigned long long) le64_to_cpu(features_buf.incompat));
2467
2468 return 0;
2469}
2470
2471static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2472{
2473 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2474 &rbd_dev->header.features);
2475}
2476
86b00e0d
AE
2477static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
2478{
2479 struct rbd_spec *parent_spec;
2480 size_t size;
2481 void *reply_buf = NULL;
2482 __le64 snapid;
2483 void *p;
2484 void *end;
2485 char *image_id;
2486 u64 overlap;
2487 size_t len = 0;
2488 int ret;
2489
2490 parent_spec = rbd_spec_alloc();
2491 if (!parent_spec)
2492 return -ENOMEM;
2493
2494 size = sizeof (__le64) + /* pool_id */
2495 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
2496 sizeof (__le64) + /* snap_id */
2497 sizeof (__le64); /* overlap */
2498 reply_buf = kmalloc(size, GFP_KERNEL);
2499 if (!reply_buf) {
2500 ret = -ENOMEM;
2501 goto out_err;
2502 }
2503
2504 snapid = cpu_to_le64(CEPH_NOSNAP);
2505 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2506 "rbd", "get_parent",
2507 (char *) &snapid, sizeof (snapid),
2508 (char *) reply_buf, size,
2509 CEPH_OSD_FLAG_READ, NULL);
2510 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2511 if (ret < 0)
2512 goto out_err;
2513
2514 ret = -ERANGE;
2515 p = reply_buf;
2516 end = (char *) reply_buf + size;
2517 ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
2518 if (parent_spec->pool_id == CEPH_NOPOOL)
2519 goto out; /* No parent? No problem. */
2520
2521 image_id = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
2522 if (IS_ERR(image_id)) {
2523 ret = PTR_ERR(image_id);
2524 goto out_err;
2525 }
2526 parent_spec->image_id = image_id;
9e15b77d 2527 parent_spec->image_id_len = len;
86b00e0d
AE
2528 ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
2529 ceph_decode_64_safe(&p, end, overlap, out_err);
2530
2531 rbd_dev->parent_overlap = overlap;
2532 rbd_dev->parent_spec = parent_spec;
2533 parent_spec = NULL; /* rbd_dev now owns this */
2534out:
2535 ret = 0;
2536out_err:
2537 kfree(reply_buf);
2538 rbd_spec_put(parent_spec);
2539
2540 return ret;
2541}
2542
9e15b77d
AE
2543static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
2544{
2545 size_t image_id_size;
2546 char *image_id;
2547 void *p;
2548 void *end;
2549 size_t size;
2550 void *reply_buf = NULL;
2551 size_t len = 0;
2552 char *image_name = NULL;
2553 int ret;
2554
2555 rbd_assert(!rbd_dev->spec->image_name);
2556
2557 image_id_size = sizeof (__le32) + rbd_dev->spec->image_id_len;
2558 image_id = kmalloc(image_id_size, GFP_KERNEL);
2559 if (!image_id)
2560 return NULL;
2561
2562 p = image_id;
2563 end = (char *) image_id + image_id_size;
2564 ceph_encode_string(&p, end, rbd_dev->spec->image_id,
2565 (u32) rbd_dev->spec->image_id_len);
2566
2567 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
2568 reply_buf = kmalloc(size, GFP_KERNEL);
2569 if (!reply_buf)
2570 goto out;
2571
2572 ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY,
2573 "rbd", "dir_get_name",
2574 image_id, image_id_size,
2575 (char *) reply_buf, size,
2576 CEPH_OSD_FLAG_READ, NULL);
2577 if (ret < 0)
2578 goto out;
2579 p = reply_buf;
2580 end = (char *) reply_buf + size;
2581 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
2582 if (IS_ERR(image_name))
2583 image_name = NULL;
2584 else
2585 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
2586out:
2587 kfree(reply_buf);
2588 kfree(image_id);
2589
2590 return image_name;
2591}
2592
2593/*
2594 * When a parent image gets probed, we only have the pool, image,
2595 * and snapshot ids but not the names of any of them. This call
2596 * is made later to fill in those names. It has to be done after
2597 * rbd_dev_snaps_update() has completed because some of the
2598 * information (in particular, snapshot name) is not available
2599 * until then.
2600 */
2601static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
2602{
2603 struct ceph_osd_client *osdc;
2604 const char *name;
2605 void *reply_buf = NULL;
2606 int ret;
2607
2608 if (rbd_dev->spec->pool_name)
2609 return 0; /* Already have the names */
2610
2611 /* Look up the pool name */
2612
2613 osdc = &rbd_dev->rbd_client->client->osdc;
2614 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
2615 if (!name)
2616 return -EIO; /* pool id too large (>= 2^31) */
2617
2618 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
2619 if (!rbd_dev->spec->pool_name)
2620 return -ENOMEM;
2621
2622 /* Fetch the image name; tolerate failure here */
2623
2624 name = rbd_dev_image_name(rbd_dev);
2625 if (name) {
2626 rbd_dev->spec->image_name_len = strlen(name);
2627 rbd_dev->spec->image_name = (char *) name;
2628 } else {
2629 pr_warning(RBD_DRV_NAME "%d "
2630 "unable to get image name for image id %s\n",
2631 rbd_dev->major, rbd_dev->spec->image_id);
2632 }
2633
2634 /* Look up the snapshot name. */
2635
2636 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
2637 if (!name) {
2638 ret = -EIO;
2639 goto out_err;
2640 }
2641 rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
2642 if(!rbd_dev->spec->snap_name)
2643 goto out_err;
2644
2645 return 0;
2646out_err:
2647 kfree(reply_buf);
2648 kfree(rbd_dev->spec->pool_name);
2649 rbd_dev->spec->pool_name = NULL;
2650
2651 return ret;
2652}
2653
6e14b1a6 2654static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
35d489f9
AE
2655{
2656 size_t size;
2657 int ret;
2658 void *reply_buf;
2659 void *p;
2660 void *end;
2661 u64 seq;
2662 u32 snap_count;
2663 struct ceph_snap_context *snapc;
2664 u32 i;
2665
2666 /*
2667 * We'll need room for the seq value (maximum snapshot id),
2668 * snapshot count, and array of that many snapshot ids.
2669 * For now we have a fixed upper limit on the number we're
2670 * prepared to receive.
2671 */
2672 size = sizeof (__le64) + sizeof (__le32) +
2673 RBD_MAX_SNAP_COUNT * sizeof (__le64);
2674 reply_buf = kzalloc(size, GFP_KERNEL);
2675 if (!reply_buf)
2676 return -ENOMEM;
2677
2678 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2679 "rbd", "get_snapcontext",
2680 NULL, 0,
2681 reply_buf, size,
6e14b1a6 2682 CEPH_OSD_FLAG_READ, ver);
35d489f9
AE
2683 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2684 if (ret < 0)
2685 goto out;
2686
2687 ret = -ERANGE;
2688 p = reply_buf;
2689 end = (char *) reply_buf + size;
2690 ceph_decode_64_safe(&p, end, seq, out);
2691 ceph_decode_32_safe(&p, end, snap_count, out);
2692
2693 /*
2694 * Make sure the reported number of snapshot ids wouldn't go
2695 * beyond the end of our buffer. But before checking that,
2696 * make sure the computed size of the snapshot context we
2697 * allocate is representable in a size_t.
2698 */
2699 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
2700 / sizeof (u64)) {
2701 ret = -EINVAL;
2702 goto out;
2703 }
2704 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
2705 goto out;
2706
2707 size = sizeof (struct ceph_snap_context) +
2708 snap_count * sizeof (snapc->snaps[0]);
2709 snapc = kmalloc(size, GFP_KERNEL);
2710 if (!snapc) {
2711 ret = -ENOMEM;
2712 goto out;
2713 }
2714
2715 atomic_set(&snapc->nref, 1);
2716 snapc->seq = seq;
2717 snapc->num_snaps = snap_count;
2718 for (i = 0; i < snap_count; i++)
2719 snapc->snaps[i] = ceph_decode_64(&p);
2720
2721 rbd_dev->header.snapc = snapc;
2722
2723 dout(" snap context seq = %llu, snap_count = %u\n",
2724 (unsigned long long) seq, (unsigned int) snap_count);
2725
2726out:
2727 kfree(reply_buf);
2728
2729 return 0;
2730}
2731
b8b1e2db
AE
2732static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
2733{
2734 size_t size;
2735 void *reply_buf;
2736 __le64 snap_id;
2737 int ret;
2738 void *p;
2739 void *end;
b8b1e2db
AE
2740 char *snap_name;
2741
2742 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
2743 reply_buf = kmalloc(size, GFP_KERNEL);
2744 if (!reply_buf)
2745 return ERR_PTR(-ENOMEM);
2746
2747 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
2748 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2749 "rbd", "get_snapshot_name",
2750 (char *) &snap_id, sizeof (snap_id),
2751 reply_buf, size,
2752 CEPH_OSD_FLAG_READ, NULL);
2753 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2754 if (ret < 0)
2755 goto out;
2756
2757 p = reply_buf;
2758 end = (char *) reply_buf + size;
e5c35534 2759 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
b8b1e2db
AE
2760 if (IS_ERR(snap_name)) {
2761 ret = PTR_ERR(snap_name);
2762 goto out;
2763 } else {
2764 dout(" snap_id 0x%016llx snap_name = %s\n",
2765 (unsigned long long) le64_to_cpu(snap_id), snap_name);
2766 }
2767 kfree(reply_buf);
2768
2769 return snap_name;
2770out:
2771 kfree(reply_buf);
2772
2773 return ERR_PTR(ret);
2774}
2775
2776static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2777 u64 *snap_size, u64 *snap_features)
2778{
2779 __le64 snap_id;
2780 u8 order;
2781 int ret;
2782
2783 snap_id = rbd_dev->header.snapc->snaps[which];
2784 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2785 if (ret)
2786 return ERR_PTR(ret);
2787 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2788 if (ret)
2789 return ERR_PTR(ret);
2790
2791 return rbd_dev_v2_snap_name(rbd_dev, which);
2792}
2793
2794static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2795 u64 *snap_size, u64 *snap_features)
2796{
2797 if (rbd_dev->image_format == 1)
2798 return rbd_dev_v1_snap_info(rbd_dev, which,
2799 snap_size, snap_features);
2800 if (rbd_dev->image_format == 2)
2801 return rbd_dev_v2_snap_info(rbd_dev, which,
2802 snap_size, snap_features);
2803 return ERR_PTR(-EINVAL);
2804}
2805
117973fb
AE
2806static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2807{
2808 int ret;
2809 __u8 obj_order;
2810
2811 down_write(&rbd_dev->header_rwsem);
2812
2813 /* Grab old order first, to see if it changes */
2814
2815 obj_order = rbd_dev->header.obj_order,
2816 ret = rbd_dev_v2_image_size(rbd_dev);
2817 if (ret)
2818 goto out;
2819 if (rbd_dev->header.obj_order != obj_order) {
2820 ret = -EIO;
2821 goto out;
2822 }
2823 rbd_update_mapping_size(rbd_dev);
2824
2825 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2826 dout("rbd_dev_v2_snap_context returned %d\n", ret);
2827 if (ret)
2828 goto out;
2829 ret = rbd_dev_snaps_update(rbd_dev);
2830 dout("rbd_dev_snaps_update returned %d\n", ret);
2831 if (ret)
2832 goto out;
2833 ret = rbd_dev_snaps_register(rbd_dev);
2834 dout("rbd_dev_snaps_register returned %d\n", ret);
2835out:
2836 up_write(&rbd_dev->header_rwsem);
2837
2838 return ret;
2839}
2840
dfc5606d 2841/*
35938150
AE
2842 * Scan the rbd device's current snapshot list and compare it to the
2843 * newly-received snapshot context. Remove any existing snapshots
2844 * not present in the new snapshot context. Add a new snapshot for
2845 * any snaphots in the snapshot context not in the current list.
2846 * And verify there are no changes to snapshots we already know
2847 * about.
2848 *
2849 * Assumes the snapshots in the snapshot context are sorted by
2850 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2851 * are also maintained in that order.)
dfc5606d 2852 */
304f6808 2853static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
dfc5606d 2854{
35938150
AE
2855 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2856 const u32 snap_count = snapc->num_snaps;
35938150
AE
2857 struct list_head *head = &rbd_dev->snaps;
2858 struct list_head *links = head->next;
2859 u32 index = 0;
dfc5606d 2860
9fcbb800 2861 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
35938150
AE
2862 while (index < snap_count || links != head) {
2863 u64 snap_id;
2864 struct rbd_snap *snap;
cd892126
AE
2865 char *snap_name;
2866 u64 snap_size = 0;
2867 u64 snap_features = 0;
dfc5606d 2868
35938150
AE
2869 snap_id = index < snap_count ? snapc->snaps[index]
2870 : CEPH_NOSNAP;
2871 snap = links != head ? list_entry(links, struct rbd_snap, node)
2872 : NULL;
aafb230e 2873 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
dfc5606d 2874
35938150
AE
2875 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2876 struct list_head *next = links->next;
dfc5606d 2877
35938150 2878 /* Existing snapshot not in the new snap context */
dfc5606d 2879
0d7dbfce 2880 if (rbd_dev->spec->snap_id == snap->id)
daba5fdb 2881 rbd_dev->exists = false;
41f38c2b 2882 rbd_remove_snap_dev(snap);
9fcbb800 2883 dout("%ssnap id %llu has been removed\n",
0d7dbfce
AE
2884 rbd_dev->spec->snap_id == snap->id ?
2885 "mapped " : "",
9fcbb800 2886 (unsigned long long) snap->id);
35938150
AE
2887
2888 /* Done with this list entry; advance */
2889
2890 links = next;
dfc5606d
YS
2891 continue;
2892 }
35938150 2893
b8b1e2db
AE
2894 snap_name = rbd_dev_snap_info(rbd_dev, index,
2895 &snap_size, &snap_features);
cd892126
AE
2896 if (IS_ERR(snap_name))
2897 return PTR_ERR(snap_name);
2898
9fcbb800
AE
2899 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2900 (unsigned long long) snap_id);
35938150
AE
2901 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2902 struct rbd_snap *new_snap;
2903
2904 /* We haven't seen this snapshot before */
2905
c8d18425 2906 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
cd892126 2907 snap_id, snap_size, snap_features);
9fcbb800
AE
2908 if (IS_ERR(new_snap)) {
2909 int err = PTR_ERR(new_snap);
2910
2911 dout(" failed to add dev, error %d\n", err);
2912
2913 return err;
2914 }
35938150
AE
2915
2916 /* New goes before existing, or at end of list */
2917
9fcbb800 2918 dout(" added dev%s\n", snap ? "" : " at end\n");
35938150
AE
2919 if (snap)
2920 list_add_tail(&new_snap->node, &snap->node);
2921 else
523f3258 2922 list_add_tail(&new_snap->node, head);
35938150
AE
2923 } else {
2924 /* Already have this one */
2925
9fcbb800
AE
2926 dout(" already present\n");
2927
cd892126 2928 rbd_assert(snap->size == snap_size);
aafb230e 2929 rbd_assert(!strcmp(snap->name, snap_name));
cd892126 2930 rbd_assert(snap->features == snap_features);
35938150
AE
2931
2932 /* Done with this list entry; advance */
2933
2934 links = links->next;
dfc5606d 2935 }
35938150
AE
2936
2937 /* Advance to the next entry in the snapshot context */
2938
2939 index++;
dfc5606d 2940 }
9fcbb800 2941 dout("%s: done\n", __func__);
dfc5606d
YS
2942
2943 return 0;
2944}
2945
304f6808
AE
2946/*
2947 * Scan the list of snapshots and register the devices for any that
2948 * have not already been registered.
2949 */
2950static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2951{
2952 struct rbd_snap *snap;
2953 int ret = 0;
2954
2955 dout("%s called\n", __func__);
86ff77bb
AE
2956 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2957 return -EIO;
304f6808
AE
2958
2959 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2960 if (!rbd_snap_registered(snap)) {
2961 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2962 if (ret < 0)
2963 break;
2964 }
2965 }
2966 dout("%s: returning %d\n", __func__, ret);
2967
2968 return ret;
2969}
2970
dfc5606d
YS
2971static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2972{
dfc5606d 2973 struct device *dev;
cd789ab9 2974 int ret;
dfc5606d
YS
2975
2976 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
dfc5606d 2977
cd789ab9 2978 dev = &rbd_dev->dev;
dfc5606d
YS
2979 dev->bus = &rbd_bus_type;
2980 dev->type = &rbd_device_type;
2981 dev->parent = &rbd_root_dev;
2982 dev->release = rbd_dev_release;
de71a297 2983 dev_set_name(dev, "%d", rbd_dev->dev_id);
dfc5606d 2984 ret = device_register(dev);
dfc5606d 2985
dfc5606d 2986 mutex_unlock(&ctl_mutex);
cd789ab9 2987
dfc5606d 2988 return ret;
602adf40
YS
2989}
2990
dfc5606d
YS
2991static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2992{
2993 device_unregister(&rbd_dev->dev);
2994}
2995
59c2be1e
YS
2996static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2997{
2998 int ret, rc;
2999
3000 do {
0e6f322d 3001 ret = rbd_req_sync_watch(rbd_dev);
59c2be1e 3002 if (ret == -ERANGE) {
117973fb 3003 rc = rbd_dev_refresh(rbd_dev, NULL);
59c2be1e
YS
3004 if (rc < 0)
3005 return rc;
3006 }
3007 } while (ret == -ERANGE);
3008
3009 return ret;
3010}
3011
e2839308 3012static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
1ddbe94e
AE
3013
3014/*
499afd5b
AE
3015 * Get a unique rbd identifier for the given new rbd_dev, and add
3016 * the rbd_dev to the global list. The minimum rbd id is 1.
1ddbe94e 3017 */
e2839308 3018static void rbd_dev_id_get(struct rbd_device *rbd_dev)
b7f23c36 3019{
e2839308 3020 rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
499afd5b
AE
3021
3022 spin_lock(&rbd_dev_list_lock);
3023 list_add_tail(&rbd_dev->node, &rbd_dev_list);
3024 spin_unlock(&rbd_dev_list_lock);
e2839308
AE
3025 dout("rbd_dev %p given dev id %llu\n", rbd_dev,
3026 (unsigned long long) rbd_dev->dev_id);
1ddbe94e 3027}
b7f23c36 3028
1ddbe94e 3029/*
499afd5b
AE
3030 * Remove an rbd_dev from the global list, and record that its
3031 * identifier is no longer in use.
1ddbe94e 3032 */
e2839308 3033static void rbd_dev_id_put(struct rbd_device *rbd_dev)
1ddbe94e 3034{
d184f6bf 3035 struct list_head *tmp;
de71a297 3036 int rbd_id = rbd_dev->dev_id;
d184f6bf
AE
3037 int max_id;
3038
aafb230e 3039 rbd_assert(rbd_id > 0);
499afd5b 3040
e2839308
AE
3041 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
3042 (unsigned long long) rbd_dev->dev_id);
499afd5b
AE
3043 spin_lock(&rbd_dev_list_lock);
3044 list_del_init(&rbd_dev->node);
d184f6bf
AE
3045
3046 /*
3047 * If the id being "put" is not the current maximum, there
3048 * is nothing special we need to do.
3049 */
e2839308 3050 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
d184f6bf
AE
3051 spin_unlock(&rbd_dev_list_lock);
3052 return;
3053 }
3054
3055 /*
3056 * We need to update the current maximum id. Search the
3057 * list to find out what it is. We're more likely to find
3058 * the maximum at the end, so search the list backward.
3059 */
3060 max_id = 0;
3061 list_for_each_prev(tmp, &rbd_dev_list) {
3062 struct rbd_device *rbd_dev;
3063
3064 rbd_dev = list_entry(tmp, struct rbd_device, node);
b213e0b1
AE
3065 if (rbd_dev->dev_id > max_id)
3066 max_id = rbd_dev->dev_id;
d184f6bf 3067 }
499afd5b 3068 spin_unlock(&rbd_dev_list_lock);
b7f23c36 3069
1ddbe94e 3070 /*
e2839308 3071 * The max id could have been updated by rbd_dev_id_get(), in
d184f6bf
AE
3072 * which case it now accurately reflects the new maximum.
3073 * Be careful not to overwrite the maximum value in that
3074 * case.
1ddbe94e 3075 */
e2839308
AE
3076 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
3077 dout(" max dev id has been reset\n");
b7f23c36
AE
3078}
3079
e28fff26
AE
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/* Characters for which isspace() is nonzero in the "C" locale. */
	const char *delims = " \f\n\r\t\v";

	*buf += strspn(*buf, delims);	/* advance past leading spaces */

	return strcspn(*buf, delims);	/* length of the token found */
}
3098
/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit.
 *
 * The *buf pointer will be updated to point beyond the end of the
 * found token.  Note that this occurs even if the token buffer is
 * too small to hold it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
3128
ea3352f4
AE
3129/*
3130 * Finds the next token in *buf, dynamically allocates a buffer big
3131 * enough to hold a copy of it, and copies the token into the new
3132 * buffer. The copy is guaranteed to be terminated with '\0'. Note
3133 * that a duplicate buffer is created even for a zero-length token.
3134 *
3135 * Returns a pointer to the newly-allocated duplicate, or a null
3136 * pointer if memory for the duplicate was not available. If
3137 * the lenp argument is a non-null pointer, the length of the token
3138 * (not including the '\0') is returned in *lenp.
3139 *
3140 * If successful, the *buf pointer will be updated to point beyond
3141 * the end of the found token.
3142 *
3143 * Note: uses GFP_KERNEL for allocation.
3144 */
3145static inline char *dup_token(const char **buf, size_t *lenp)
3146{
3147 char *dup;
3148 size_t len;
3149
3150 len = next_token(buf);
3151 dup = kmalloc(len + 1, GFP_KERNEL);
3152 if (!dup)
3153 return NULL;
3154
3155 memcpy(dup, *buf, len);
3156 *(dup + len) = '\0';
3157 *buf += len;
3158
3159 if (lenp)
3160 *lenp = len;
3161
3162 return dup;
3163}
3164
/*
 * Parse the options provided for an "rbd add" (i.e., rbd image
 * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
 * and the data written is passed here via a NUL-terminated buffer.
 * Returns 0 if successful or an error code otherwise.
 *
 * The information extracted from these options is recorded in
 * the other parameters which return dynamically-allocated
 * structures:
 *  ceph_opts
 *      The address of a pointer that will refer to a ceph options
 *      structure.  Caller must release the returned pointer using
 *      ceph_destroy_options() when it is no longer needed.
 *  rbd_opts
 *	Address of an rbd options pointer.  Fully initialized by
 *	this function; caller must release with kfree().
 *  spec
 *	Address of an rbd image specification pointer.  Fully
 *	initialized by this function based on parsed options.
 *	Caller must release with rbd_spec_put().
 *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
 * where:
 *  <mon_addrs>
 *      A comma-separated list of one or more monitor addresses.
 *      A monitor address is an ip address, optionally followed
 *      by a port number (separated by a colon).
 *        I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *      A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *      The name of the rados pool containing the rbd image.
 *  <image_name>
 *      The name of the image in that pool to map.
 *  <snap_id>
 *      An optional snapshot id.  If provided, the mapping will
 *      present data from the image at the time that snapshot was
 *      created.  The image head is used if no snapshot id is
 *      provided.  Snapshot mappings are always read-only.
 */
static int rbd_add_parse_args(const char *buf,
				struct ceph_options **ceph_opts,
				struct rbd_options **opts,
				struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;
	size_t mon_addrs_size;
	struct rbd_spec *spec = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len)
		return -EINVAL;	/* Missing monitor address(es) */
	/*
	 * The monitor address list is not copied; it is passed to
	 * ceph_parse_options() below as a [start, end) range within
	 * the caller's buffer.
	 */
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	ret = -EINVAL;
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options)
		goto out_err;	/* Missing options */

	spec = rbd_spec_alloc();
	if (!spec)
		goto out_mem;

	spec->pool_name = dup_token(&buf, NULL);
	if (!spec->pool_name)
		goto out_mem;
	if (!*spec->pool_name)
		goto out_err;	/* Missing pool name */

	spec->image_name = dup_token(&buf, &spec->image_name_len);
	if (!spec->image_name)
		goto out_mem;
	if (!*spec->image_name)
		goto out_err;	/* Missing image name */

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	spec->snap_name = kmalloc(len + 1, GFP_KERNEL);
	if (!spec->snap_name)
		goto out_mem;
	memcpy(spec->snap_name, buf, len);
	*(spec->snap_name + len) = '\0';

	/* Initialize all rbd options to the defaults */

	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		goto out_mem;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	/*
	 * ceph_parse_options() consumes the ceph options from the
	 * options string; any rbd-specific tokens are handed to
	 * parse_rbd_opts_token() which fills in rbd_opts.
	 */
	copts = ceph_parse_options(options, mon_addrs,
					mon_addrs + mon_addrs_size - 1,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	kfree(options);

	/* Success: hand all three allocated results to the caller */
	*ceph_opts = copts;
	*opts = rbd_opts;
	*rbd_spec = spec;

	return 0;
out_mem:
	ret = -ENOMEM;
out_err:
	kfree(rbd_opts);
	rbd_spec_put(spec);
	kfree(options);

	return ret;
}
3301
/*
 * An rbd format 2 image has a unique identifier, distinct from the
 * name given to it by the user.  Internally, that identifier is
 * what's used to specify the names of objects related to the image.
 *
 * A special "rbd id" object is used to map an rbd image name to its
 * id.  If that object doesn't exist, then there is no v2 rbd image
 * with the supplied name.
 *
 * This function will record the given rbd_dev's image_id field if
 * it can be determined, and in that case will return 0.  If any
 * errors occur a negative errno will be returned and the rbd_dev's
 * image_id field will be unchanged (and should be NULL).
 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.
	 */
	if (rbd_dev->spec->image_id)
		return 0;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	size = sizeof (RBD_ID_PREFIX) + rbd_dev->spec->image_name_len;
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	/* size was computed above to hold exactly prefix + name + NUL */
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* Invoke the class method "rbd.get_id" on the id object */
	ret = rbd_req_sync_exec(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	/* Decode the length-prefixed id string from the response */
	p = response;
	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						&rbd_dev->spec->image_id_len,
						GFP_NOIO);
	if (IS_ERR(rbd_dev->spec->image_id)) {
		ret = PTR_ERR(rbd_dev->spec->image_id);
		rbd_dev->spec->image_id = NULL;
	} else {
		dout("image_id is %s\n", rbd_dev->spec->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
3379
/*
 * Probe the given device as an rbd format 1 (original format)
 * image: record an empty image id, derive the header object name
 * from the image name, and read in the on-disk image header.
 *
 * Returns 0 on success or a negative errno; on error any state
 * recorded here (image_id, header_name) is released again.
 */
static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;

	/* Version 1 images have no id; empty string is used */

	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
	if (!rbd_dev->spec->image_id)
		return -ENOMEM;
	rbd_dev->spec->image_id_len = 0;

	/* Record the header object name for this rbd image. */

	size = rbd_dev->spec->image_name_len + sizeof (RBD_SUFFIX);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name) {
		ret = -ENOMEM;
		goto out_err;
	}
	/* v1 header object is "<image_name><RBD_SUFFIX>" */
	sprintf(rbd_dev->header_name, "%s%s",
		rbd_dev->spec->image_name, RBD_SUFFIX);

	/* Populate rbd image metadata */

	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (ret < 0)
		goto out_err;

	/* Version 1 images have no parent (no layering) */

	rbd_dev->parent_spec = NULL;
	rbd_dev->parent_overlap = 0;

	rbd_dev->image_format = 1;

	dout("discovered version 1 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;

out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;

	return ret;
}
3429
/*
 * Probe the given device as an rbd format 2 image.  The image id
 * must already have been recorded in rbd_dev->spec by the caller.
 * Fetches the image's size, object prefix, features, parent info
 * (if layered), and snapshot context from its header object.
 *
 * Returns 0 on success or a negative errno; on error everything
 * recorded here is released again.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->spec->image_id_len;
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	/* v2 header object is "<RBD_HEADER_PREFIX><image_id>" */
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->spec->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* If the image supports layering, get the parent info */

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	/* Undo everything recorded above */
	rbd_dev->parent_overlap = 0;
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3502
/*
 * Complete device setup once the image metadata has been probed:
 * update the snapshot list and mapping, allocate a device id and
 * block major number, create the gendisk, register with sysfs,
 * register the snapshot devices, set up the header watch, and
 * finally announce the disk to the world.
 *
 * Returns 0 on success or a negative errno.  Before the device is
 * registered on the bus, errors are unwound here; after that point
 * cleanup is delegated to the sysfs release path.
 */
static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
{
	int ret;

	/* no need to lock here, as rbd_dev is not registered yet */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_probe_update_spec(rbd_dev);
	if (ret)
		goto err_out_snaps;

	ret = rbd_dev_set_mapping(rbd_dev);
	if (ret)
		goto err_out_snaps;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	ret = register_blkdev(0, rbd_dev->name);
	if (ret < 0)
		goto err_out_id;
	rbd_dev->major = ret;	/* register_blkdev(0, ...) returns the major */

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_bus_add_dev(rbd_dev);
	if (ret)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */
	down_write(&rbd_dev->header_rwsem);
	ret = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (ret)
		goto err_out_bus;

	ret = rbd_init_watch_dev(rbd_dev);
	if (ret)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return ret;
err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);

	return ret;
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);

	return ret;
}
3584
a30b71b9
AE
3585/*
3586 * Probe for the existence of the header object for the given rbd
3587 * device. For format 2 images this includes determining the image
3588 * id.
3589 */
3590static int rbd_dev_probe(struct rbd_device *rbd_dev)
3591{
3592 int ret;
3593
3594 /*
3595 * Get the id from the image id object. If it's not a
3596 * format 2 image, we'll get ENOENT back, and we'll assume
3597 * it's a format 1 image.
3598 */
3599 ret = rbd_dev_image_id(rbd_dev);
3600 if (ret)
3601 ret = rbd_dev_v1_probe(rbd_dev);
3602 else
3603 ret = rbd_dev_v2_probe(rbd_dev);
83a06263 3604 if (ret) {
a30b71b9
AE
3605 dout("probe failed, returning %d\n", ret);
3606
83a06263
AE
3607 return ret;
3608 }
3609
3610 ret = rbd_dev_probe_finish(rbd_dev);
3611 if (ret)
3612 rbd_header_free(&rbd_dev->header);
3613
a30b71b9
AE
3614 return ret;
3615}
3616
59c2be1e
YS
3617static ssize_t rbd_add(struct bus_type *bus,
3618 const char *buf,
3619 size_t count)
602adf40 3620{
cb8627c7 3621 struct rbd_device *rbd_dev = NULL;
dc79b113 3622 struct ceph_options *ceph_opts = NULL;
4e9afeba 3623 struct rbd_options *rbd_opts = NULL;
859c31df 3624 struct rbd_spec *spec = NULL;
9d3997fd 3625 struct rbd_client *rbdc;
27cc2594
AE
3626 struct ceph_osd_client *osdc;
3627 int rc = -ENOMEM;
602adf40
YS
3628
3629 if (!try_module_get(THIS_MODULE))
3630 return -ENODEV;
3631
602adf40 3632 /* parse add command */
859c31df 3633 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
dc79b113 3634 if (rc < 0)
bd4ba655 3635 goto err_out_module;
78cea76e 3636
9d3997fd
AE
3637 rbdc = rbd_get_client(ceph_opts);
3638 if (IS_ERR(rbdc)) {
3639 rc = PTR_ERR(rbdc);
0ddebc0c 3640 goto err_out_args;
9d3997fd 3641 }
c53d5893 3642 ceph_opts = NULL; /* rbd_dev client now owns this */
602adf40 3643
602adf40 3644 /* pick the pool */
9d3997fd 3645 osdc = &rbdc->client->osdc;
859c31df 3646 rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
602adf40
YS
3647 if (rc < 0)
3648 goto err_out_client;
859c31df
AE
3649 spec->pool_id = (u64) rc;
3650
c53d5893 3651 rbd_dev = rbd_dev_create(rbdc, spec);
bd4ba655
AE
3652 if (!rbd_dev)
3653 goto err_out_client;
c53d5893
AE
3654 rbdc = NULL; /* rbd_dev now owns this */
3655 spec = NULL; /* rbd_dev now owns this */
602adf40 3656
bd4ba655 3657 rbd_dev->mapping.read_only = rbd_opts->read_only;
c53d5893
AE
3658 kfree(rbd_opts);
3659 rbd_opts = NULL; /* done with this */
bd4ba655 3660
a30b71b9
AE
3661 rc = rbd_dev_probe(rbd_dev);
3662 if (rc < 0)
c53d5893 3663 goto err_out_rbd_dev;
05fd6f6f 3664
602adf40 3665 return count;
c53d5893
AE
3666err_out_rbd_dev:
3667 rbd_dev_destroy(rbd_dev);
bd4ba655 3668err_out_client:
9d3997fd 3669 rbd_put_client(rbdc);
0ddebc0c 3670err_out_args:
78cea76e
AE
3671 if (ceph_opts)
3672 ceph_destroy_options(ceph_opts);
4e9afeba 3673 kfree(rbd_opts);
859c31df 3674 rbd_spec_put(spec);
bd4ba655
AE
3675err_out_module:
3676 module_put(THIS_MODULE);
27cc2594 3677
602adf40 3678 dout("Error adding device %s\n", buf);
27cc2594
AE
3679
3680 return (ssize_t) rc;
602adf40
YS
3681}
3682
de71a297 3683static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
602adf40
YS
3684{
3685 struct list_head *tmp;
3686 struct rbd_device *rbd_dev;
3687
e124a82f 3688 spin_lock(&rbd_dev_list_lock);
602adf40
YS
3689 list_for_each(tmp, &rbd_dev_list) {
3690 rbd_dev = list_entry(tmp, struct rbd_device, node);
de71a297 3691 if (rbd_dev->dev_id == dev_id) {
e124a82f 3692 spin_unlock(&rbd_dev_list_lock);
602adf40 3693 return rbd_dev;
e124a82f 3694 }
602adf40 3695 }
e124a82f 3696 spin_unlock(&rbd_dev_list_lock);
602adf40
YS
3697 return NULL;
3698}
3699
/*
 * Device-model release callback for an rbd device: tear down the
 * header watch, free the disk and block major, release the header
 * metadata, and finally drop the device id and the rbd_dev itself
 * (which also releases its rbd_client).  Drops the module
 * reference taken in rbd_add().
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Stop watching the header object before tearing down */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	rbd_dev_id_put(rbd_dev);
	rbd_assert(rbd_dev->rbd_client != NULL);
	rbd_dev_destroy(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
3729
/*
 * Handle a write to /sys/bus/rbd/remove: tear down the rbd device
 * whose id is given in buf.  Removal is refused with -EBUSY while
 * the block device is open (e.g. mounted), so a mapped image
 * cannot be yanked out from under a filesystem.  Returns count on
 * success or a negative errno.
 */
static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	int target_id, rc;
	unsigned long ul;
	int ret = count;

	rc = strict_strtoul(buf, 10, &ul);
	if (rc)
		return rc;

	/* convert to int; abort if we lost anything in the conversion */
	target_id = (int) ul;
	if (target_id != ul)
		return -EINVAL;

	/* ctl_mutex serializes this against add/remove and releases */
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbd_dev = __rbd_get_dev(target_id);
	if (!rbd_dev) {
		ret = -ENOENT;
		goto done;
	}

	/* Do not allow removal of a mounted-on (open) image */
	if (rbd_dev->open_count) {
		ret = -EBUSY;
		goto done;
	}

	rbd_remove_all_snaps(rbd_dev);
	rbd_bus_del_dev(rbd_dev);

done:
	mutex_unlock(&ctl_mutex);

	return ret;
}
3769
602adf40
YS
3770/*
3771 * create control files in sysfs
dfc5606d 3772 * /sys/bus/rbd/...
602adf40
YS
3773 */
3774static int rbd_sysfs_init(void)
3775{
dfc5606d 3776 int ret;
602adf40 3777
fed4c143 3778 ret = device_register(&rbd_root_dev);
21079786 3779 if (ret < 0)
dfc5606d 3780 return ret;
602adf40 3781
fed4c143
AE
3782 ret = bus_register(&rbd_bus_type);
3783 if (ret < 0)
3784 device_unregister(&rbd_root_dev);
602adf40 3785
602adf40
YS
3786 return ret;
3787}
3788
/* Tear down the sysfs entries created by rbd_sysfs_init(). */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
3794
3795int __init rbd_init(void)
3796{
3797 int rc;
3798
3799 rc = rbd_sysfs_init();
3800 if (rc)
3801 return rc;
f0f8cef5 3802 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
602adf40
YS
3803 return 0;
3804}
3805
/* Module exit: remove the rbd sysfs bus and root device. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
3810
/* Module entry points and metadata */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");
This page took 0.744978 seconds and 5 git commands to generate.