libceph: fix messenger CONFIG_BLOCK dependencies
[deliverable/linux.git] / drivers / block / rbd.c
CommitLineData
602adf40
YS
1/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
dfc5606d 24 For usage instructions, please refer to:
602adf40 25
dfc5606d 26 Documentation/ABI/testing/sysfs-bus-rbd
602adf40
YS
27
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
59c2be1e 34#include <linux/parser.h>
602adf40
YS
35
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
aafb230e
AE
44#define RBD_DEBUG /* Activate rbd_assert() calls */
45
593a9e7b
AE
46/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
2647ba38 55/* It might be useful to have these defined elsewhere */
df111be6 56
2647ba38
AE
57#define U8_MAX ((u8) (~0U))
58#define U16_MAX ((u16) (~0U))
59#define U32_MAX ((u32) (~0U))
60#define U64_MAX ((u64) (~0ULL))
df111be6 61
f0f8cef5
AE
62#define RBD_DRV_NAME "rbd"
63#define RBD_DRV_NAME_LONG "rbd (rados block device)"
602adf40
YS
64
65#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
66
d4b125e9
AE
67#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
68#define RBD_MAX_SNAP_NAME_LEN \
69 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
70
35d489f9 71#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
602adf40
YS
72
73#define RBD_SNAP_HEAD_NAME "-"
74
9e15b77d
AE
75/* This allows a single page to hold an image name sent by OSD */
76#define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1)
1e130199 77#define RBD_IMAGE_ID_LEN_MAX 64
9e15b77d 78
1e130199 79#define RBD_OBJ_PREFIX_LEN_MAX 64
589d30e0 80
d889140c
AE
81/* Feature bits */
82
83#define RBD_FEATURE_LAYERING 1
84
85/* Features supported by this (client software) implementation. */
86
87#define RBD_FEATURES_ALL (0)
88
81a89793
AE
89/*
90 * An RBD device name will be "rbd#", where the "rbd" comes from
91 * RBD_DRV_NAME above, and # is a unique integer identifier.
92 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
93 * enough to hold all possible device names.
94 */
602adf40 95#define DEV_NAME_LEN 32
81a89793 96#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
602adf40
YS
97
98/*
99 * block device image metadata (in-memory version)
100 */
101struct rbd_image_header {
f84344f3 102 /* These four fields never change for a given rbd image */
849b4260 103 char *object_prefix;
34b13184 104 u64 features;
602adf40
YS
105 __u8 obj_order;
106 __u8 crypt_type;
107 __u8 comp_type;
602adf40 108
f84344f3
AE
109 /* The remaining fields need to be updated occasionally */
110 u64 image_size;
111 struct ceph_snap_context *snapc;
602adf40
YS
112 char *snap_names;
113 u64 *snap_sizes;
59c2be1e
YS
114
115 u64 obj_version;
116};
117
0d7dbfce
AE
118/*
119 * An rbd image specification.
120 *
121 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
c66c6e0c
AE
122 * identify an image. Each rbd_dev structure includes a pointer to
123 * an rbd_spec structure that encapsulates this identity.
124 *
125 * Each of the id's in an rbd_spec has an associated name. For a
126 * user-mapped image, the names are supplied and the id's associated
127 * with them are looked up. For a layered image, a parent image is
128 * defined by the tuple, and the names are looked up.
129 *
130 * An rbd_dev structure contains a parent_spec pointer which is
131 * non-null if the image it represents is a child in a layered
132 * image. This pointer will refer to the rbd_spec structure used
133 * by the parent rbd_dev for its own identity (i.e., the structure
134 * is shared between the parent and child).
135 *
136 * Since these structures are populated once, during the discovery
137 * phase of image construction, they are effectively immutable so
138 * we make no effort to synchronize access to them.
139 *
140 * Note that code herein does not assume the image name is known (it
141 * could be a null pointer).
0d7dbfce
AE
142 */
143struct rbd_spec {
144 u64 pool_id;
145 char *pool_name;
146
147 char *image_id;
0d7dbfce 148 char *image_name;
0d7dbfce
AE
149
150 u64 snap_id;
151 char *snap_name;
152
153 struct kref kref;
154};
155
602adf40 156/*
f0f8cef5 157 * an instance of the client. multiple devices may share an rbd client.
602adf40
YS
158 */
159struct rbd_client {
160 struct ceph_client *client;
161 struct kref kref;
162 struct list_head node;
163};
164
165/*
f0f8cef5 166 * a request completion status
602adf40 167 */
1fec7093
YS
168struct rbd_req_status {
169 int done;
8986cb37 170 s32 rc;
1fec7093
YS
171 u64 bytes;
172};
173
174/*
175 * a collection of requests
176 */
177struct rbd_req_coll {
178 int total;
179 int num_done;
180 struct kref kref;
181 struct rbd_req_status status[0];
602adf40
YS
182};
183
f0f8cef5
AE
184/*
185 * a single io request
186 */
187struct rbd_request {
188 struct request *rq; /* blk layer request */
189 struct bio *bio; /* cloned bio */
190 struct page **pages; /* list of used pages */
191 u64 len;
192 int coll_index;
193 struct rbd_req_coll *coll;
194};
195
dfc5606d
YS
196struct rbd_snap {
197 struct device dev;
198 const char *name;
3591538f 199 u64 size;
dfc5606d
YS
200 struct list_head node;
201 u64 id;
34b13184 202 u64 features;
dfc5606d
YS
203};
204
f84344f3 205struct rbd_mapping {
99c1f08f 206 u64 size;
34b13184 207 u64 features;
f84344f3
AE
208 bool read_only;
209};
210
602adf40
YS
211/*
212 * a single device
213 */
214struct rbd_device {
de71a297 215 int dev_id; /* blkdev unique id */
602adf40
YS
216
217 int major; /* blkdev assigned major */
218 struct gendisk *disk; /* blkdev's gendisk and rq */
602adf40 219
a30b71b9 220 u32 image_format; /* Either 1 or 2 */
602adf40
YS
221 struct rbd_client *rbd_client;
222
223 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
224
225 spinlock_t lock; /* queue lock */
226
227 struct rbd_image_header header;
d78b650a 228 atomic_t exists;
0d7dbfce 229 struct rbd_spec *spec;
602adf40 230
0d7dbfce 231 char *header_name;
971f839a 232
0903e875
AE
233 struct ceph_file_layout layout;
234
59c2be1e
YS
235 struct ceph_osd_event *watch_event;
236 struct ceph_osd_request *watch_request;
237
86b00e0d
AE
238 struct rbd_spec *parent_spec;
239 u64 parent_overlap;
240
c666601a
JD
241 /* protects updating the header */
242 struct rw_semaphore header_rwsem;
f84344f3
AE
243
244 struct rbd_mapping mapping;
602adf40
YS
245
246 struct list_head node;
dfc5606d
YS
247
248 /* list of snapshots */
249 struct list_head snaps;
250
251 /* sysfs related */
252 struct device dev;
42382b70 253 unsigned long open_count;
dfc5606d
YS
254};
255
602adf40 256static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
e124a82f 257
602adf40 258static LIST_HEAD(rbd_dev_list); /* devices */
e124a82f
AE
259static DEFINE_SPINLOCK(rbd_dev_list_lock);
260
432b8587
AE
261static LIST_HEAD(rbd_client_list); /* clients */
262static DEFINE_SPINLOCK(rbd_client_list_lock);
602adf40 263
304f6808
AE
264static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
265static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
266
dfc5606d 267static void rbd_dev_release(struct device *dev);
41f38c2b 268static void rbd_remove_snap_dev(struct rbd_snap *snap);
dfc5606d 269
f0f8cef5
AE
270static ssize_t rbd_add(struct bus_type *bus, const char *buf,
271 size_t count);
272static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
273 size_t count);
274
275static struct bus_attribute rbd_bus_attrs[] = {
276 __ATTR(add, S_IWUSR, NULL, rbd_add),
277 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
278 __ATTR_NULL
279};
280
281static struct bus_type rbd_bus_type = {
282 .name = "rbd",
283 .bus_attrs = rbd_bus_attrs,
284};
285
286static void rbd_root_dev_release(struct device *dev)
287{
288}
289
290static struct device rbd_root_dev = {
291 .init_name = "rbd",
292 .release = rbd_root_dev_release,
293};
294
06ecc6cb
AE
/*
 * Emit a KERN_WARNING message tagged with the most specific identity
 * available for @rbd_dev: disk name, then image name, then image id,
 * then the raw pointer.  @rbd_dev may be NULL, in which case only the
 * driver name prefixes the message.  Printf-style @fmt/args.
 */
static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;

        va_start(args, fmt);
        vaf.fmt = fmt;
        vaf.va = &args;

        if (!rbd_dev)
                printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
        else if (rbd_dev->disk)
                printk(KERN_WARNING "%s: %s: %pV\n",
                        RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
        else if (rbd_dev->spec && rbd_dev->spec->image_name)
                printk(KERN_WARNING "%s: image %s: %pV\n",
                        RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
        else if (rbd_dev->spec && rbd_dev->spec->image_id)
                printk(KERN_WARNING "%s: id %s: %pV\n",
                        RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
        else    /* punt */
                printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
                        RBD_DRV_NAME, rbd_dev, &vaf);
        va_end(args);
}
321
aafb230e
AE
#ifdef RBD_DEBUG
/*
 * Debug-only assertion: log the failed expression and BUG().
 * Wrapped in do { } while (0) so the macro behaves as a single
 * statement; the bare "if { }" form mis-binds a following "else"
 * (dangling-else) when used as e.g. "if (x) rbd_assert(y); else ...".
 */
#define rbd_assert(expr)                                                \
                do {                                                    \
                        if (unlikely(!(expr))) {                        \
                                printk(KERN_ERR "\nAssertion failure in %s() " \
                                                        "at line %d:\n\n" \
                                                "\trbd_assert(%s);\n\n", \
                                                __func__, __LINE__, #expr); \
                                BUG();                                  \
                        }                                               \
                } while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)      ((void) 0)
#endif /* !RBD_DEBUG */
dfc5606d 334
117973fb
AE
335static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
336static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
59c2be1e 337
602adf40
YS
338static int rbd_open(struct block_device *bdev, fmode_t mode)
339{
f0f8cef5 340 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
602adf40 341
f84344f3 342 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
602adf40
YS
343 return -EROFS;
344
42382b70 345 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
c3e946ce 346 (void) get_device(&rbd_dev->dev);
f84344f3 347 set_device_ro(bdev, rbd_dev->mapping.read_only);
42382b70
AE
348 rbd_dev->open_count++;
349 mutex_unlock(&ctl_mutex);
340c7a2b 350
602adf40
YS
351 return 0;
352}
353
dfc5606d
YS
354static int rbd_release(struct gendisk *disk, fmode_t mode)
355{
356 struct rbd_device *rbd_dev = disk->private_data;
357
42382b70
AE
358 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
359 rbd_assert(rbd_dev->open_count > 0);
360 rbd_dev->open_count--;
c3e946ce 361 put_device(&rbd_dev->dev);
42382b70 362 mutex_unlock(&ctl_mutex);
dfc5606d
YS
363
364 return 0;
365}
366
602adf40
YS
367static const struct block_device_operations rbd_bd_ops = {
368 .owner = THIS_MODULE,
369 .open = rbd_open,
dfc5606d 370 .release = rbd_release,
602adf40
YS
371};
372
373/*
374 * Initialize an rbd client instance.
43ae4701 375 * We own *ceph_opts.
602adf40 376 */
f8c38929 377static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
602adf40
YS
378{
379 struct rbd_client *rbdc;
380 int ret = -ENOMEM;
381
382 dout("rbd_client_create\n");
383 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
384 if (!rbdc)
385 goto out_opt;
386
387 kref_init(&rbdc->kref);
388 INIT_LIST_HEAD(&rbdc->node);
389
bc534d86
AE
390 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
391
43ae4701 392 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
602adf40 393 if (IS_ERR(rbdc->client))
bc534d86 394 goto out_mutex;
43ae4701 395 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
602adf40
YS
396
397 ret = ceph_open_session(rbdc->client);
398 if (ret < 0)
399 goto out_err;
400
432b8587 401 spin_lock(&rbd_client_list_lock);
602adf40 402 list_add_tail(&rbdc->node, &rbd_client_list);
432b8587 403 spin_unlock(&rbd_client_list_lock);
602adf40 404
bc534d86
AE
405 mutex_unlock(&ctl_mutex);
406
602adf40
YS
407 dout("rbd_client_create created %p\n", rbdc);
408 return rbdc;
409
410out_err:
411 ceph_destroy_client(rbdc->client);
bc534d86
AE
412out_mutex:
413 mutex_unlock(&ctl_mutex);
602adf40
YS
414 kfree(rbdc);
415out_opt:
43ae4701
AE
416 if (ceph_opts)
417 ceph_destroy_options(ceph_opts);
28f259b7 418 return ERR_PTR(ret);
602adf40
YS
419}
420
421/*
1f7ba331
AE
422 * Find a ceph client with specific addr and configuration. If
423 * found, bump its reference count.
602adf40 424 */
1f7ba331 425static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
602adf40
YS
426{
427 struct rbd_client *client_node;
1f7ba331 428 bool found = false;
602adf40 429
43ae4701 430 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
602adf40
YS
431 return NULL;
432
1f7ba331
AE
433 spin_lock(&rbd_client_list_lock);
434 list_for_each_entry(client_node, &rbd_client_list, node) {
435 if (!ceph_compare_options(ceph_opts, client_node->client)) {
436 kref_get(&client_node->kref);
437 found = true;
438 break;
439 }
440 }
441 spin_unlock(&rbd_client_list_lock);
442
443 return found ? client_node : NULL;
602adf40
YS
444}
445
59c2be1e
YS
446/*
447 * mount options
448 */
449enum {
59c2be1e
YS
450 Opt_last_int,
451 /* int args above */
452 Opt_last_string,
453 /* string args above */
cc0538b6
AE
454 Opt_read_only,
455 Opt_read_write,
456 /* Boolean args above */
457 Opt_last_bool,
59c2be1e
YS
458};
459
43ae4701 460static match_table_t rbd_opts_tokens = {
59c2be1e
YS
461 /* int args above */
462 /* string args above */
be466c1c 463 {Opt_read_only, "read_only"},
cc0538b6
AE
464 {Opt_read_only, "ro"}, /* Alternate spelling */
465 {Opt_read_write, "read_write"},
466 {Opt_read_write, "rw"}, /* Alternate spelling */
467 /* Boolean args above */
59c2be1e
YS
468 {-1, NULL}
469};
470
98571b5a
AE
471struct rbd_options {
472 bool read_only;
473};
474
475#define RBD_READ_ONLY_DEFAULT false
476
59c2be1e
YS
477static int parse_rbd_opts_token(char *c, void *private)
478{
43ae4701 479 struct rbd_options *rbd_opts = private;
59c2be1e
YS
480 substring_t argstr[MAX_OPT_ARGS];
481 int token, intval, ret;
482
43ae4701 483 token = match_token(c, rbd_opts_tokens, argstr);
59c2be1e
YS
484 if (token < 0)
485 return -EINVAL;
486
487 if (token < Opt_last_int) {
488 ret = match_int(&argstr[0], &intval);
489 if (ret < 0) {
490 pr_err("bad mount option arg (not int) "
491 "at '%s'\n", c);
492 return ret;
493 }
494 dout("got int token %d val %d\n", token, intval);
495 } else if (token > Opt_last_int && token < Opt_last_string) {
496 dout("got string token %d val %s\n", token,
497 argstr[0].from);
cc0538b6
AE
498 } else if (token > Opt_last_string && token < Opt_last_bool) {
499 dout("got Boolean token %d\n", token);
59c2be1e
YS
500 } else {
501 dout("got token %d\n", token);
502 }
503
504 switch (token) {
cc0538b6
AE
505 case Opt_read_only:
506 rbd_opts->read_only = true;
507 break;
508 case Opt_read_write:
509 rbd_opts->read_only = false;
510 break;
59c2be1e 511 default:
aafb230e
AE
512 rbd_assert(false);
513 break;
59c2be1e
YS
514 }
515 return 0;
516}
517
602adf40
YS
518/*
519 * Get a ceph client with specific addr and configuration, if one does
520 * not exist create it.
521 */
/*
 * Get a ceph client matching @ceph_opts, creating one if no shareable
 * client exists.  We own @ceph_opts; when an existing client is reused
 * the options are destroyed here, otherwise rbd_client_create() takes
 * ownership.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
        struct rbd_client *rbdc = rbd_client_find(ceph_opts);

        if (rbdc) {
                /* Reusing an existing client; drop our copy of the options */
                ceph_destroy_options(ceph_opts);
                return rbdc;
        }

        return rbd_client_create(ceph_opts);
}
534
535/*
536 * Destroy ceph client
d23a4b3f 537 *
432b8587 538 * Caller must hold rbd_client_list_lock.
602adf40
YS
539 */
/*
 * kref release callback: tear down a ceph client once its last
 * reference is dropped.  Unlinks it from rbd_client_list under the
 * list lock, then destroys the ceph client and frees the wrapper.
 */
static void rbd_client_release(struct kref *kref)
{
        struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

        dout("rbd_release_client %p\n", rbdc);
        spin_lock(&rbd_client_list_lock);
        list_del(&rbdc->node);
        spin_unlock(&rbd_client_list_lock);

        ceph_destroy_client(rbdc->client);
        kfree(rbdc);
}
552
553/*
554 * Drop reference to ceph client node. If it's not referenced anymore, release
555 * it.
556 */
9d3997fd 557static void rbd_put_client(struct rbd_client *rbdc)
602adf40 558{
c53d5893
AE
559 if (rbdc)
560 kref_put(&rbdc->kref, rbd_client_release);
602adf40
YS
561}
562
1fec7093
YS
563/*
564 * Destroy requests collection
565 */
566static void rbd_coll_release(struct kref *kref)
567{
568 struct rbd_req_coll *coll =
569 container_of(kref, struct rbd_req_coll, kref);
570
571 dout("rbd_coll_release %p\n", coll);
572 kfree(coll);
573}
602adf40 574
a30b71b9
AE
575static bool rbd_image_format_valid(u32 image_format)
576{
577 return image_format == 1 || image_format == 2;
578}
579
8e94af8e
AE
/*
 * Sanity-check a format-1 on-disk image header before trusting any of
 * its fields: magic text, usable object order, and snapshot counts
 * whose derived allocation sizes cannot overflow a size_t.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
        size_t size;
        u32 snap_count;

        /* The header has to start with the magic rbd header text */
        if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
                return false;

        /* The bio layer requires at least sector-sized I/O */

        if (ondisk->options.order < SECTOR_SHIFT)
                return false;

        /* If we use u64 in a few spots we may be able to loosen this */

        if (ondisk->options.order > 8 * sizeof (int) - 1)
                return false;

        /*
         * The size of a snapshot header has to fit in a size_t, and
         * that limits the number of snapshots.
         */
        snap_count = le32_to_cpu(ondisk->snap_count);
        size = SIZE_MAX - sizeof (struct ceph_snap_context);
        if (snap_count > size / sizeof (__le64))
                return false;

        /*
         * Not only that, but the size of the entire the snapshot
         * header must also be representable in a size_t.
         */
        size -= snap_count * sizeof (__le64);
        if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
                return false;

        return true;
}
618
602adf40
YS
619/*
620 * Create a new header structure, translate header format from the on-disk
621 * header.
622 */
623static int rbd_header_from_disk(struct rbd_image_header *header,
4156d998 624 struct rbd_image_header_ondisk *ondisk)
602adf40 625{
ccece235 626 u32 snap_count;
58c17b0e 627 size_t len;
d2bb24e5 628 size_t size;
621901d6 629 u32 i;
602adf40 630
6a52325f
AE
631 memset(header, 0, sizeof (*header));
632
103a150f
AE
633 snap_count = le32_to_cpu(ondisk->snap_count);
634
58c17b0e
AE
635 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
636 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
6a52325f 637 if (!header->object_prefix)
602adf40 638 return -ENOMEM;
58c17b0e
AE
639 memcpy(header->object_prefix, ondisk->object_prefix, len);
640 header->object_prefix[len] = '\0';
00f1f36f 641
602adf40 642 if (snap_count) {
f785cc1d
AE
643 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
644
621901d6
AE
645 /* Save a copy of the snapshot names */
646
f785cc1d
AE
647 if (snap_names_len > (u64) SIZE_MAX)
648 return -EIO;
649 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
602adf40 650 if (!header->snap_names)
6a52325f 651 goto out_err;
f785cc1d
AE
652 /*
653 * Note that rbd_dev_v1_header_read() guarantees
654 * the ondisk buffer we're working with has
655 * snap_names_len bytes beyond the end of the
656 * snapshot id array, this memcpy() is safe.
657 */
658 memcpy(header->snap_names, &ondisk->snaps[snap_count],
659 snap_names_len);
6a52325f 660
621901d6
AE
661 /* Record each snapshot's size */
662
d2bb24e5
AE
663 size = snap_count * sizeof (*header->snap_sizes);
664 header->snap_sizes = kmalloc(size, GFP_KERNEL);
602adf40 665 if (!header->snap_sizes)
6a52325f 666 goto out_err;
621901d6
AE
667 for (i = 0; i < snap_count; i++)
668 header->snap_sizes[i] =
669 le64_to_cpu(ondisk->snaps[i].image_size);
602adf40 670 } else {
ccece235 671 WARN_ON(ondisk->snap_names_len);
602adf40
YS
672 header->snap_names = NULL;
673 header->snap_sizes = NULL;
674 }
849b4260 675
34b13184 676 header->features = 0; /* No features support in v1 images */
602adf40
YS
677 header->obj_order = ondisk->options.order;
678 header->crypt_type = ondisk->options.crypt_type;
679 header->comp_type = ondisk->options.comp_type;
6a52325f 680
621901d6
AE
681 /* Allocate and fill in the snapshot context */
682
f84344f3 683 header->image_size = le64_to_cpu(ondisk->image_size);
6a52325f
AE
684 size = sizeof (struct ceph_snap_context);
685 size += snap_count * sizeof (header->snapc->snaps[0]);
686 header->snapc = kzalloc(size, GFP_KERNEL);
687 if (!header->snapc)
688 goto out_err;
602adf40
YS
689
690 atomic_set(&header->snapc->nref, 1);
505cbb9b 691 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
602adf40 692 header->snapc->num_snaps = snap_count;
621901d6
AE
693 for (i = 0; i < snap_count; i++)
694 header->snapc->snaps[i] =
695 le64_to_cpu(ondisk->snaps[i].id);
602adf40
YS
696
697 return 0;
698
6a52325f 699out_err:
849b4260 700 kfree(header->snap_sizes);
ccece235 701 header->snap_sizes = NULL;
602adf40 702 kfree(header->snap_names);
ccece235 703 header->snap_names = NULL;
6a52325f
AE
704 kfree(header->object_prefix);
705 header->object_prefix = NULL;
ccece235 706
00f1f36f 707 return -ENOMEM;
602adf40
YS
708}
709
9e15b77d
AE
710static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
711{
712 struct rbd_snap *snap;
713
714 if (snap_id == CEPH_NOSNAP)
715 return RBD_SNAP_HEAD_NAME;
716
717 list_for_each_entry(snap, &rbd_dev->snaps, node)
718 if (snap_id == snap->id)
719 return snap->name;
720
721 return NULL;
722}
723
8836b995 724static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
602adf40 725{
602adf40 726
e86924a8 727 struct rbd_snap *snap;
602adf40 728
e86924a8
AE
729 list_for_each_entry(snap, &rbd_dev->snaps, node) {
730 if (!strcmp(snap_name, snap->name)) {
0d7dbfce 731 rbd_dev->spec->snap_id = snap->id;
e86924a8 732 rbd_dev->mapping.size = snap->size;
34b13184 733 rbd_dev->mapping.features = snap->features;
602adf40 734
e86924a8 735 return 0;
00f1f36f 736 }
00f1f36f 737 }
e86924a8 738
00f1f36f 739 return -ENOENT;
602adf40
YS
740}
741
819d52bf 742static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
602adf40 743{
78dc447d 744 int ret;
602adf40 745
0d7dbfce 746 if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
cc9d734c 747 sizeof (RBD_SNAP_HEAD_NAME))) {
0d7dbfce 748 rbd_dev->spec->snap_id = CEPH_NOSNAP;
99c1f08f 749 rbd_dev->mapping.size = rbd_dev->header.image_size;
34b13184 750 rbd_dev->mapping.features = rbd_dev->header.features;
e86924a8 751 ret = 0;
602adf40 752 } else {
0d7dbfce 753 ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
602adf40
YS
754 if (ret < 0)
755 goto done;
f84344f3 756 rbd_dev->mapping.read_only = true;
602adf40 757 }
d78b650a 758 atomic_set(&rbd_dev->exists, 1);
602adf40 759done:
602adf40
YS
760 return ret;
761}
762
/*
 * Release everything owned by an in-memory image header and NULL the
 * pointers so a double free is harmless.  The snap context is
 * reference counted rather than freed directly.
 */
static void rbd_header_free(struct rbd_image_header *header)
{
        kfree(header->object_prefix);
        header->object_prefix = NULL;
        kfree(header->snap_sizes);
        header->snap_sizes = NULL;
        kfree(header->snap_names);
        header->snap_names = NULL;
        ceph_put_snap_context(header->snapc);
        header->snapc = NULL;
}
774
98571b5a 775static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
602adf40 776{
65ccfe21
AE
777 char *name;
778 u64 segment;
779 int ret;
602adf40 780
2fd82b9e 781 name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
65ccfe21
AE
782 if (!name)
783 return NULL;
784 segment = offset >> rbd_dev->header.obj_order;
2fd82b9e 785 ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
65ccfe21 786 rbd_dev->header.object_prefix, segment);
2fd82b9e 787 if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
65ccfe21
AE
788 pr_err("error formatting segment name for #%llu (%d)\n",
789 segment, ret);
790 kfree(name);
791 name = NULL;
792 }
602adf40 793
65ccfe21
AE
794 return name;
795}
602adf40 796
65ccfe21
AE
797static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
798{
799 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
602adf40 800
65ccfe21
AE
801 return offset & (segment_size - 1);
802}
803
804static u64 rbd_segment_length(struct rbd_device *rbd_dev,
805 u64 offset, u64 length)
806{
807 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
808
809 offset &= segment_size - 1;
810
aafb230e 811 rbd_assert(length <= U64_MAX - offset);
65ccfe21
AE
812 if (offset + length > segment_size)
813 length = segment_size - offset;
814
815 return length;
602adf40
YS
816}
817
1fec7093
YS
818static int rbd_get_num_segments(struct rbd_image_header *header,
819 u64 ofs, u64 len)
820{
df111be6
AE
821 u64 start_seg;
822 u64 end_seg;
38901e0f 823 u64 result;
df111be6
AE
824
825 if (!len)
826 return 0;
827 if (len - 1 > U64_MAX - ofs)
828 return -ERANGE;
829
830 start_seg = ofs >> header->obj_order;
831 end_seg = (ofs + len - 1) >> header->obj_order;
832
38901e0f
AE
833 result = end_seg - start_seg + 1;
834 if (result > (u64) INT_MAX)
835 return -ERANGE;
836
837 return (int) result;
1fec7093
YS
838}
839
029bcbd8
JD
840/*
841 * returns the size of an object in the image
842 */
843static u64 rbd_obj_bytes(struct rbd_image_header *header)
844{
845 return 1 << header->obj_order;
846}
847
602adf40
YS
848/*
849 * bio helpers
850 */
851
852static void bio_chain_put(struct bio *chain)
853{
854 struct bio *tmp;
855
856 while (chain) {
857 tmp = chain;
858 chain = chain->bi_next;
859 bio_put(tmp);
860 }
861}
862
863/*
864 * zeros a bio chain, starting at specific offset
865 */
866static void zero_bio_chain(struct bio *chain, int start_ofs)
867{
868 struct bio_vec *bv;
869 unsigned long flags;
870 void *buf;
871 int i;
872 int pos = 0;
873
874 while (chain) {
875 bio_for_each_segment(bv, chain, i) {
876 if (pos + bv->bv_len > start_ofs) {
877 int remainder = max(start_ofs - pos, 0);
878 buf = bvec_kmap_irq(bv, &flags);
879 memset(buf + remainder, 0,
880 bv->bv_len - remainder);
85b5aaa6 881 bvec_kunmap_irq(buf, &flags);
602adf40
YS
882 }
883 pos += bv->bv_len;
884 }
885
886 chain = chain->bi_next;
887 }
888}
889
/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.
 *
 * The range [offset, offset + len) must lie entirely within bio_src's
 * data.  Returns the clone, or NULL on bad arguments or allocation
 * failure.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
                                        unsigned int offset,
                                        unsigned int len,
                                        gfp_t gfpmask)
{
        struct bio_vec *bv;
        unsigned int resid;
        unsigned short idx;
        unsigned int voff;
        unsigned short end_idx;
        unsigned short vcnt;
        struct bio *bio;

        /* Handle the easy case for the caller */

        if (!offset && len == bio_src->bi_size)
                return bio_clone(bio_src, gfpmask);

        if (WARN_ON_ONCE(!len))
                return NULL;
        if (WARN_ON_ONCE(len > bio_src->bi_size))
                return NULL;
        if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
                return NULL;

        /* Find first affected segment... */

        resid = offset;
        __bio_for_each_segment(bv, bio_src, idx, 0) {
                if (resid < bv->bv_len)
                        break;
                resid -= bv->bv_len;
        }
        voff = resid;   /* byte offset into the first affected segment */

        /* ...and the last affected segment */

        resid += len;
        __bio_for_each_segment(bv, bio_src, end_idx, idx) {
                if (resid <= bv->bv_len)
                        break;
                resid -= bv->bv_len;
        }
        vcnt = end_idx - idx + 1;

        /* Build the clone */

        bio = bio_alloc(gfpmask, (unsigned int) vcnt);
        if (!bio)
                return NULL;    /* ENOMEM */

        bio->bi_bdev = bio_src->bi_bdev;
        bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
        bio->bi_rw = bio_src->bi_rw;
        bio->bi_flags |= 1 << BIO_CLONED;

        /*
         * Copy over our part of the bio_vec, then update the first
         * and last (or only) entries.
         */
        memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
                vcnt * sizeof (struct bio_vec));
        bio->bi_io_vec[0].bv_offset += voff;
        if (vcnt > 1) {
                bio->bi_io_vec[0].bv_len -= voff;
                /* resid is the in-use length of the last segment here */
                bio->bi_io_vec[vcnt - 1].bv_len = resid;
        } else {
                bio->bi_io_vec[0].bv_len = len;
        }

        bio->bi_vcnt = vcnt;
        bio->bi_size = len;
        bio->bi_idx = 0;

        return bio;
}
970
/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
                                        unsigned int *offset,
                                        unsigned int len,
                                        gfp_t gfpmask)
{
        struct bio *bi = *bio_src;
        unsigned int off = *offset;
        struct bio *chain = NULL;
        struct bio **end;

        /* Build up a chain of clone bios up to the limit */

        if (!bi || off >= bi->bi_size || !len)
                return NULL;            /* Nothing to clone */

        end = &chain;
        while (len) {
                unsigned int bi_size;
                struct bio *bio;

                if (!bi) {
                        rbd_warn(NULL, "bio_chain exhausted with %u left", len);
                        goto out_err;   /* EINVAL; ran out of bio's */
                }
                /* Clone as much of this bio as remains, up to len */
                bi_size = min_t(unsigned int, bi->bi_size - off, len);
                bio = bio_clone_range(bi, off, bi_size, gfpmask);
                if (!bio)
                        goto out_err;   /* ENOMEM */

                *end = bio;
                end = &bio->bi_next;

                off += bi_size;
                /* This source bio is exhausted; advance to the next one */
                if (off == bi->bi_size) {
                        bi = bi->bi_next;
                        off = 0;
                }
                len -= bi_size;
        }
        *bio_src = bi;
        *offset = off;

        return chain;
out_err:
        bio_chain_put(chain);

        return NULL;
}
1033
8d23bf29
AE
/*
 * Allocate and initialize an OSD request op for the given opcode.
 * The variadic arguments depend on the opcode; the per-case comments
 * below document each calling convention.  Returns NULL on allocation
 * failure or an unsupported opcode.  Free with rbd_osd_req_op_destroy().
 */
struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
{
        struct ceph_osd_req_op *op;
        va_list args;
        size_t size;

        op = kzalloc(sizeof (*op), GFP_NOIO);
        if (!op)
                return NULL;
        op->op = opcode;
        va_start(args, opcode);
        switch (opcode) {
        case CEPH_OSD_OP_READ:
        case CEPH_OSD_OP_WRITE:
                /* rbd_osd_req_op_create(READ, offset, length) */
                /* rbd_osd_req_op_create(WRITE, offset, length) */
                op->extent.offset = va_arg(args, u64);
                op->extent.length = va_arg(args, u64);
                if (opcode == CEPH_OSD_OP_WRITE)
                        op->payload_len = op->extent.length;
                break;
        case CEPH_OSD_OP_CALL:
                /* rbd_osd_req_op_create(CALL, class, method, data, datalen) */
                op->cls.class_name = va_arg(args, char *);
                size = strlen(op->cls.class_name);
                rbd_assert(size <= (size_t) U8_MAX);
                op->cls.class_len = size;
                op->payload_len = size;

                op->cls.method_name = va_arg(args, char *);
                size = strlen(op->cls.method_name);
                rbd_assert(size <= (size_t) U8_MAX);
                op->cls.method_len = size;
                op->payload_len += size;

                op->cls.argc = 0;
                op->cls.indata = va_arg(args, void *);
                size = va_arg(args, size_t);
                rbd_assert(size <= (size_t) U32_MAX);
                op->cls.indata_len = (u32) size;
                /* payload covers class name, method name and input data */
                op->payload_len += size;
                break;
        case CEPH_OSD_OP_NOTIFY_ACK:
        case CEPH_OSD_OP_WATCH:
                /* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */
                /* rbd_osd_req_op_create(WATCH, cookie, version, flag) */
                op->watch.cookie = va_arg(args, u64);
                op->watch.ver = va_arg(args, u64);
                op->watch.ver = cpu_to_le64(op->watch.ver);
                if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int))
                        op->watch.flag = (u8) 1;
                break;
        default:
                rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
                kfree(op);
                op = NULL;
                break;
        }
        va_end(args);

        return op;
}
1096
/*
 * Free an op allocated by rbd_osd_req_op_create().  Safe to call
 * with NULL (kfree(NULL) is a no-op).
 */
static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
{
	kfree(op);
}
1101
1fec7093
YS
/*
 * Record completion status for entry "index" of a request collection,
 * then complete (in order) every leading run of finished entries.
 *
 * With no collection the whole request span is completed directly.
 * Each completed entry drops one reference on the collection.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   s32 ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, (int)ret, (unsigned long long)len);

	if (!rq)
		return;

	if (!coll) {
		/* Not part of a collection; complete the span directly */
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	/* status[] and num_done are protected by the queue lock */
	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	max = min = coll->num_done;
	/* Find how far the contiguous run of finished entries extends */
	while (max < coll->total && coll->status[max].done)
		max++;

	for (i = min; i<max; i++) {
		__blk_end_request(rq, (int)coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
1139
/*
 * Complete the collection entry associated with the given rbd
 * request, using the request's stored collection and index.
 */
static void rbd_coll_end_req(struct rbd_request *rbd_req,
			     s32 ret, u64 len)
{
	rbd_coll_end_req_index(rbd_req->rq,
				rbd_req->coll, rbd_req->coll_index,
				ret, len);
}
1147
602adf40
YS
/*
 * Send a ceph osd request.
 *
 * If rbd_cb is NULL the call is synchronous: we wait for completion,
 * report the reassert version via *ver (if non-NULL), and drop our
 * request reference before returning.  With a callback supplied, the
 * callback is responsible for dropping the request reference and
 * freeing the rbd_request (see rbd_req_cb()).
 *
 * A tracking struct rbd_request is only allocated when a completion
 * collection (coll) is provided.
 *
 * Returns 0 (or the wait result) on success, negative errno on error.
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *op,
			  struct rbd_req_coll *coll,
			  int coll_index,
			  void (*rbd_cb)(struct ceph_osd_request *,
					 struct ceph_msg *),
			  u64 *ver)
{
	struct ceph_osd_client *osdc;
	struct ceph_osd_request *osd_req;
	struct rbd_request *rbd_req = NULL;
	struct timespec mtime = CURRENT_TIME;
	int ret;

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
		object_name, (unsigned long long) ofs,
		(unsigned long long) len, coll, coll_index);

	osdc = &rbd_dev->rbd_client->client->osdc;
	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_NOIO);
	if (!osd_req)
		return -ENOMEM;

	osd_req->r_flags = flags;
	osd_req->r_pages = pages;
	if (bio) {
		/* Extra reference; released on error or by the callback */
		osd_req->r_bio = bio;
		bio_get(osd_req->r_bio);
	}

	if (coll) {
		/* Track this request so completion can update the coll */
		ret = -ENOMEM;
		rbd_req = kmalloc(sizeof(*rbd_req), GFP_NOIO);
		if (!rbd_req)
			goto done_osd_req;

		rbd_req->rq = rq;
		rbd_req->bio = bio;
		rbd_req->pages = pages;
		rbd_req->len = len;
		rbd_req->coll = coll;
		rbd_req->coll_index = coll_index;
	}

	osd_req->r_callback = rbd_cb;
	osd_req->r_priv = rbd_req;

	/*
	 * NOTE(review): strncpy() does not NUL-terminate if object_name
	 * fills r_oid exactly, which would make the strlen() below read
	 * past the buffer — presumably object names are always shorter
	 * than r_oid; verify against callers.
	 */
	strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid));
	osd_req->r_oid_len = strlen(osd_req->r_oid);

	osd_req->r_file_layout = rbd_dev->layout; /* struct */
	osd_req->r_num_pages = calc_pages_for(ofs, len);
	osd_req->r_page_alignment = ofs & ~PAGE_MASK;

	ceph_osdc_build_request(osd_req, ofs, len, 1, op,
				snapc, snapid, &mtime);

	if (op->op == CEPH_OSD_OP_WATCH && op->watch.flag) {
		/* Keep the watch registration alive across OSD restarts */
		ceph_osdc_set_request_linger(osdc, osd_req);
		rbd_dev->watch_request = osd_req;
	}

	ret = ceph_osdc_start_request(osdc, osd_req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* Synchronous: wait, report version, drop our reference */
		u64 version;

		ret = ceph_osdc_wait_request(osdc, osd_req);
		version = le64_to_cpu(osd_req->r_reassert_version.version);
		if (ver)
			*ver = version;
		dout("reassert_ver=%llu\n", (unsigned long long) version);
		ceph_osdc_put_request(osd_req);
	}
	return ret;

done_err:
	if (bio)
		bio_chain_put(osd_req->r_bio);
	kfree(rbd_req);
done_osd_req:
	ceph_osdc_put_request(osd_req);

	return ret;
}
1246
/*
 * Ceph osd op callback.
 *
 * Decodes the reply, normalizes read results (a read of a
 * non-existent object, or a short read, is reported as success with
 * the missing bytes zero-filled in the bio), completes the request's
 * collection entry, and releases the bio chain, the osd request, and
 * the tracking rbd_request.
 */
static void rbd_req_cb(struct ceph_osd_request *osd_req, struct ceph_msg *msg)
{
	struct rbd_request *rbd_req = osd_req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);
	rc = (s32)le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
		(unsigned long long) bytes, read_op, (int) rc);

	if (rc == (s32)-ENOENT && read_op) {
		/* Reading an object that doesn't exist: return zeroes */
		zero_bio_chain(rbd_req->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < rbd_req->len) {
		/* Short read: zero-fill the remainder */
		zero_bio_chain(rbd_req->bio, bytes);
		bytes = rbd_req->len;
	}

	rbd_coll_end_req(rbd_req, rc, bytes);

	/* Drop the reference taken in rbd_do_request() */
	if (rbd_req->bio)
		bio_chain_put(rbd_req->bio);

	ceph_osdc_put_request(osd_req);
	kfree(rbd_req);
}
1286
5f29ddd4
AE
/*
 * Minimal osd request callback: just drop the request reference.
 * Used for requests that carry no tracking state (r_priv is NULL).
 */
static void rbd_simple_req_cb(struct ceph_osd_request *osd_req,
				struct ceph_msg *msg)
{
	ceph_osdc_put_request(osd_req);
}
1292
602adf40
YS
/*
 * Do a synchronous ceph osd operation.
 *
 * Allocates a page vector large enough for the inbound data, issues
 * the request synchronously via rbd_do_request(), and — for read
 * operations — copies the returned bytes into "inbound".
 *
 * Returns the number of bytes transferred (rbd_do_request()'s wait
 * result) or a negative errno.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   int flags,
			   struct ceph_osd_req_op *op,
			   const char *object_name,
			   u64 ofs, u64 inbound_size,
			   char *inbound,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;

	rbd_assert(op != NULL);

	num_pages = calc_pages_for(ofs, inbound_size);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	/* NULL callback makes rbd_do_request() synchronous */
	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  object_name, ofs, inbound_size, NULL,
			  pages, num_pages,
			  flags,
			  op,
			  NULL, 0,
			  NULL,
			  ver);
	if (ret < 0)
		goto done;

	if ((flags & CEPH_OSD_FLAG_READ) && inbound)
		ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);

done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1333
/*
 * Do an asynchronous ceph osd operation.
 *
 * Maps the device-relative (ofs, len) span onto its containing rbd
 * object segment, builds a read or write op depending on the block
 * request's direction, and submits it with rbd_req_cb() as the
 * completion callback.  On submission failure the collection entry
 * is completed immediately with the error.
 *
 * Returns 0 on successful submission or a negative errno.
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	const char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *op;
	int opcode;
	int flags;
	u64 snapid;

	seg_name = rbd_segment_name(rbd_dev, ofs);
	if (!seg_name)
		return -ENOMEM;
	seg_len = rbd_segment_length(rbd_dev, ofs, len);
	seg_ofs = rbd_segment_offset(rbd_dev, ofs);

	if (rq_data_dir(rq) == WRITE) {
		/* Writes always target the head and carry the snap context */
		opcode = CEPH_OSD_OP_WRITE;
		flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
		snapid = CEPH_NOSNAP;
	} else {
		/* Reads target the mapped snapshot, no snap context */
		opcode = CEPH_OSD_OP_READ;
		flags = CEPH_OSD_FLAG_READ;
		rbd_assert(!snapc);
		snapid = rbd_dev->spec->snap_id;
	}

	ret = -ENOMEM;
	op = rbd_osd_req_op_create(opcode, seg_ofs, seg_len);
	if (!op)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	rbd_assert(seg_len == len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     op,
			     coll, coll_index,
			     rbd_req_cb, NULL);
	if (ret < 0)
		rbd_coll_end_req_index(rq, coll, coll_index,
					(s32)ret, seg_len);
	rbd_osd_req_op_destroy(op);
done:
	kfree(seg_name);
	return ret;
}
1397
602adf40
YS
/*
 * Request sync osd read.
 *
 * Synchronously read "len" bytes at offset "ofs" of the named object
 * into "buf"; the object version is returned via *ver if non-NULL.
 * Returns bytes read or a negative errno.
 */
static int rbd_req_sync_read(struct rbd_device *rbd_dev,
			  const char *object_name,
			  u64 ofs, u64 len,
			  char *buf,
			  u64 *ver)
{
	struct ceph_osd_req_op *op;
	int ret;

	op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, ofs, len);
	if (!op)
		return -ENOMEM;

	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ,
			       op, object_name, ofs, len, buf, ver);
	rbd_osd_req_op_destroy(op);

	return ret;
}
1420
/*
 * Acknowledge a watch notification on the header object so the osd
 * stops re-sending it.  Fire-and-forget: completion is handled by
 * rbd_simple_req_cb(), which just drops the request reference.
 */
static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver,
				   u64 notify_id)
{
	struct ceph_osd_req_op *op;
	int ret;

	op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver);
	if (!op)
		return -ENOMEM;

	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  rbd_dev->header_name, 0, 0, NULL,
			  NULL, 0,
			  CEPH_OSD_FLAG_READ,
			  op,
			  NULL, 0,
			  rbd_simple_req_cb, NULL);

	rbd_osd_req_op_destroy(op);

	return ret;
}
1447
1448static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1449{
0ce1a794 1450 struct rbd_device *rbd_dev = (struct rbd_device *)data;
a71b891b 1451 u64 hver;
13143d2d
SW
1452 int rc;
1453
0ce1a794 1454 if (!rbd_dev)
59c2be1e
YS
1455 return;
1456
bd919d45
AE
1457 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1458 rbd_dev->header_name, (unsigned long long) notify_id,
1459 (unsigned int) opcode);
117973fb 1460 rc = rbd_dev_refresh(rbd_dev, &hver);
13143d2d 1461 if (rc)
06ecc6cb
AE
1462 rbd_warn(rbd_dev, "got notification but failed to "
1463 " update snaps: %d\n", rc);
59c2be1e 1464
7f0a24d8 1465 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
59c2be1e
YS
1466}
1467
1468/*
907703d0
AE
1469 * Request sync osd watch/unwatch. The value of "start" determines
1470 * whether a watch request is being initiated or torn down.
59c2be1e 1471 */
907703d0 1472static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start)
59c2be1e 1473{
5efea49a
AE
1474 struct ceph_osd_req_op *op;
1475 int ret = 0;
59c2be1e 1476
c0430647
AE
1477 rbd_assert(start ^ !!rbd_dev->watch_event);
1478 rbd_assert(start ^ !!rbd_dev->watch_request);
1479
907703d0
AE
1480 if (start) {
1481 struct ceph_osd_client *osdc;
79e3057c 1482
907703d0
AE
1483 osdc = &rbd_dev->rbd_client->client->osdc;
1484 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev,
1485 &rbd_dev->watch_event);
1486 if (ret < 0)
5efea49a 1487 return ret;
907703d0 1488 }
79e3057c 1489
5efea49a
AE
1490 op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
1491 rbd_dev->watch_event->cookie,
1492 rbd_dev->header.obj_version, start);
1493 if (op)
1494 ret = rbd_req_sync_op(rbd_dev,
79e3057c 1495 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
907703d0 1496 op, rbd_dev->header_name,
8b84de79 1497 0, 0, NULL, NULL);
79e3057c 1498
5efea49a
AE
1499 /* Cancel the event if we're tearing down, or on error */
1500
1501 if (!start || !op || ret < 0) {
907703d0
AE
1502 ceph_osdc_cancel_event(rbd_dev->watch_event);
1503 rbd_dev->watch_event = NULL;
1504 }
5efea49a 1505 rbd_osd_req_op_destroy(op);
907703d0 1506
79e3057c
YS
1507 return ret;
1508}
1509
/*
 * Synchronous osd object method call.
 *
 * Invokes class_name.method_name on the named object, sending
 * outbound/outbound_size as method input and copying up to
 * inbound_size bytes of the reply into inbound.  The object version
 * is returned via *ver if non-NULL.  Returns bytes received or a
 * negative errno.
 */
static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *outbound,
			     size_t outbound_size,
			     char *inbound,
			     size_t inbound_size,
			     u64 *ver)
{
	struct ceph_osd_req_op *op;
	int ret;

	/*
	 * Any input parameters required by the method we're calling
	 * will be sent along with the class and method names as
	 * part of the message payload.  That data and its size are
	 * supplied via the indata and indata_len fields (named from
	 * the perspective of the server side) in the OSD request
	 * operation.
	 */
	op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name,
					method_name, outbound, outbound_size);
	if (!op)
		return -ENOMEM;

	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, op,
			       object_name, 0, inbound_size, inbound,
			       ver);

	rbd_osd_req_op_destroy(op);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1548
1fec7093
YS
1549static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1550{
1551 struct rbd_req_coll *coll =
1552 kzalloc(sizeof(struct rbd_req_coll) +
1553 sizeof(struct rbd_req_status) * num_reqs,
1554 GFP_ATOMIC);
1555
1556 if (!coll)
1557 return NULL;
1558 coll->total = num_reqs;
1559 kref_init(&coll->kref);
1560 return coll;
1561}
1562
8295cda7
AE
/*
 * Break a block request into per-object segments and submit one
 * osd operation per segment, each with its own cloned bio chain.
 * Segment completions are gathered in a shared collection so the
 * block request is ended in order.
 *
 * Returns 0 once all segments have been submitted (individual
 * segment errors are reported through the collection), or a
 * negative errno if setup fails.
 */
static int rbd_dev_do_request(struct request *rq,
				struct rbd_device *rbd_dev,
				struct ceph_snap_context *snapc,
				u64 ofs, unsigned int size,
				struct bio *bio_chain)
{
	int num_segs;
	struct rbd_req_coll *coll;
	unsigned int bio_offset;
	int cur_seg = 0;

	dout("%s 0x%x bytes at 0x%llx\n",
		rq_data_dir(rq) == WRITE ? "write" : "read",
		size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);

	num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
	if (num_segs <= 0)
		return num_segs;

	coll = rbd_alloc_coll(num_segs);
	if (!coll)
		return -ENOMEM;

	bio_offset = 0;
	do {
		/* Bytes from ofs to the end of the containing object */
		u64 limit = rbd_segment_length(rbd_dev, ofs, size);
		unsigned int clone_size;
		struct bio *bio_clone;

		BUG_ON(limit > (u64)UINT_MAX);
		clone_size = (unsigned int)limit;
		dout("bio_chain->bi_vcnt=%hu\n", bio_chain->bi_vcnt);

		/* One collection reference per in-flight segment */
		kref_get(&coll->kref);

		/* Pass a cloned bio chain via an osd request */

		bio_clone = bio_chain_clone_range(&bio_chain,
					&bio_offset, clone_size,
					GFP_ATOMIC);
		if (bio_clone)
			(void)rbd_do_op(rq, rbd_dev, snapc,
					ofs, clone_size,
					bio_clone, coll, cur_seg);
		else
			/* Clone failed; fail just this segment */
			rbd_coll_end_req_index(rq, coll, cur_seg,
						(s32)-ENOMEM,
						clone_size);
		size -= clone_size;
		ofs += clone_size;

		cur_seg++;
	} while (size > 0);
	/* Drop the initial reference taken by rbd_alloc_coll() */
	kref_put(&coll->kref, rbd_coll_release);

	return 0;
}
1620
602adf40
YS
/*
 * block device queue callback
 *
 * Called with the queue lock held.  The lock is dropped while each
 * request is prepared and submitted, and reacquired before fetching
 * the next one.  Requests that never got any segment submitted
 * (size == 0) or that failed setup are ended here; otherwise the
 * per-segment completions end the request incrementally.
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	bool read_only = rbd_dev->mapping.read_only;
	struct request *rq;

	while ((rq = blk_fetch_request(q))) {
		struct ceph_snap_context *snapc = NULL;
		unsigned int size = 0;
		int result;

		dout("fetched request\n");

		/* Filter out block requests we don't understand */

		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			continue;
		}
		spin_unlock_irq(q->queue_lock);

		/* Write requests need a reference to the snapshot context */

		if (rq_data_dir(rq) == WRITE) {
			result = -EROFS;
			if (read_only) /* Can't write to a read-only device */
				goto out_end_request;

			/*
			 * Note that each osd request will take its
			 * own reference to the snapshot context
			 * supplied.  The reference we take here
			 * just guarantees the one we provide stays
			 * valid.
			 */
			down_read(&rbd_dev->header_rwsem);
			snapc = ceph_get_snap_context(rbd_dev->header.snapc);
			up_read(&rbd_dev->header_rwsem);
			rbd_assert(snapc != NULL);
		} else if (!atomic_read(&rbd_dev->exists)) {
			/* Mapped snapshot was deleted out from under us */
			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
			dout("request for non-existent snapshot");
			result = -ENXIO;
			goto out_end_request;
		}

		size = blk_rq_bytes(rq);
		result = rbd_dev_do_request(rq, rbd_dev, snapc,
				blk_rq_pos(rq) * SECTOR_SIZE,
				size, rq->bio);
out_end_request:
		if (snapc)
			ceph_put_snap_context(snapc);
		spin_lock_irq(q->queue_lock);
		if (!size || result < 0)
			__blk_end_request_all(rq, result);
	}
}
1682
/*
 * A queue callback.  Makes sure that we don't create a bio that
 * spans across multiple osd objects.  One exception would be with
 * single-page bios, which we handle later at
 * bio_chain_clone_range().
 *
 * Returns the number of bytes of bvec that may be merged into the
 * bio described by bmd without crossing an object boundary.
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	sector_t sector_offset;
	sector_t sectors_per_obj;
	sector_t obj_sector_offset;
	int ret;

	/*
	 * Find how far into its rbd object the partition-relative
	 * bio start sector is to offset relative to the enclosing
	 * device.
	 */
	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	obj_sector_offset = sector_offset & (sectors_per_obj - 1);

	/*
	 * Compute the number of bytes from that offset to the end
	 * of the object.  Account for what's already used by the bio.
	 */
	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
	if (ret > bmd->bi_size)
		ret -= bmd->bi_size;
	else
		ret = 0;

	/*
	 * Don't send back more than was asked for.  And if the bio
	 * was empty, let the whole thing through because:  "Note
	 * that a block device *must* allow a single page to be
	 * added to an empty bio."
	 */
	rbd_assert(bvec->bv_len <= PAGE_SIZE);
	if (ret > (int) bvec->bv_len || !bmd->bi_size)
		ret = (int) bvec->bv_len;

	return ret;
}
1728
1729static void rbd_free_disk(struct rbd_device *rbd_dev)
1730{
1731 struct gendisk *disk = rbd_dev->disk;
1732
1733 if (!disk)
1734 return;
1735
602adf40
YS
1736 if (disk->flags & GENHD_FL_UP)
1737 del_gendisk(disk);
1738 if (disk->queue)
1739 blk_cleanup_queue(disk->queue);
1740 put_disk(disk);
1741}
1742
1743/*
4156d998
AE
1744 * Read the complete header for the given rbd device.
1745 *
1746 * Returns a pointer to a dynamically-allocated buffer containing
1747 * the complete and validated header. Caller can pass the address
1748 * of a variable that will be filled in with the version of the
1749 * header object at the time it was read.
1750 *
1751 * Returns a pointer-coded errno if a failure occurs.
602adf40 1752 */
4156d998
AE
1753static struct rbd_image_header_ondisk *
1754rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
602adf40 1755{
4156d998 1756 struct rbd_image_header_ondisk *ondisk = NULL;
50f7c4c9 1757 u32 snap_count = 0;
4156d998
AE
1758 u64 names_size = 0;
1759 u32 want_count;
1760 int ret;
602adf40 1761
00f1f36f 1762 /*
4156d998
AE
1763 * The complete header will include an array of its 64-bit
1764 * snapshot ids, followed by the names of those snapshots as
1765 * a contiguous block of NUL-terminated strings. Note that
1766 * the number of snapshots could change by the time we read
1767 * it in, in which case we re-read it.
00f1f36f 1768 */
4156d998
AE
1769 do {
1770 size_t size;
1771
1772 kfree(ondisk);
1773
1774 size = sizeof (*ondisk);
1775 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1776 size += names_size;
1777 ondisk = kmalloc(size, GFP_KERNEL);
1778 if (!ondisk)
1779 return ERR_PTR(-ENOMEM);
1780
4775618d 1781 ret = rbd_req_sync_read(rbd_dev, rbd_dev->header_name,
4156d998
AE
1782 0, size,
1783 (char *) ondisk, version);
1784
1785 if (ret < 0)
1786 goto out_err;
1787 if (WARN_ON((size_t) ret < size)) {
1788 ret = -ENXIO;
06ecc6cb
AE
1789 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
1790 size, ret);
4156d998
AE
1791 goto out_err;
1792 }
1793 if (!rbd_dev_ondisk_valid(ondisk)) {
1794 ret = -ENXIO;
06ecc6cb 1795 rbd_warn(rbd_dev, "invalid header");
4156d998 1796 goto out_err;
81e759fb 1797 }
602adf40 1798
4156d998
AE
1799 names_size = le64_to_cpu(ondisk->snap_names_len);
1800 want_count = snap_count;
1801 snap_count = le32_to_cpu(ondisk->snap_count);
1802 } while (snap_count != want_count);
00f1f36f 1803
4156d998 1804 return ondisk;
00f1f36f 1805
4156d998
AE
1806out_err:
1807 kfree(ondisk);
1808
1809 return ERR_PTR(ret);
1810}
1811
/*
 * Re-read the on-disk header and convert it into the in-core
 * rbd_image_header representation.  On success, header->obj_version
 * records the version of the header object that was read.
 */
static int rbd_read_header(struct rbd_device *rbd_dev,
			   struct rbd_image_header *header)
{
	struct rbd_image_header_ondisk *ondisk;
	u64 ver = 0;
	int ret;

	ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
	if (IS_ERR(ondisk))
		return PTR_ERR(ondisk);
	ret = rbd_header_from_disk(header, ondisk);
	if (ret >= 0)
		header->obj_version = ver;
	kfree(ondisk);

	return ret;
}
1832
/*
 * Remove every snapshot device attached to this rbd device.  Uses
 * the _safe iterator because rbd_remove_snap_dev() unlinks entries.
 */
static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
{
	struct rbd_snap *snap;
	struct rbd_snap *next;

	list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
		rbd_remove_snap_dev(snap);
}
1841
9478554a
AE
1842static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
1843{
1844 sector_t size;
1845
0d7dbfce 1846 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
9478554a
AE
1847 return;
1848
1849 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
1850 dout("setting size to %llu sectors", (unsigned long long) size);
1851 rbd_dev->mapping.size = (u64) size;
1852 set_capacity(rbd_dev->disk, size);
1853}
1854
602adf40
YS
1855/*
1856 * only read the first part of the ondisk header, without the snaps info
1857 */
117973fb 1858static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
602adf40
YS
1859{
1860 int ret;
1861 struct rbd_image_header h;
602adf40
YS
1862
1863 ret = rbd_read_header(rbd_dev, &h);
1864 if (ret < 0)
1865 return ret;
1866
a51aa0c0
JD
1867 down_write(&rbd_dev->header_rwsem);
1868
9478554a
AE
1869 /* Update image size, and check for resize of mapped image */
1870 rbd_dev->header.image_size = h.image_size;
1871 rbd_update_mapping_size(rbd_dev);
9db4b3e3 1872
849b4260 1873 /* rbd_dev->header.object_prefix shouldn't change */
602adf40 1874 kfree(rbd_dev->header.snap_sizes);
849b4260 1875 kfree(rbd_dev->header.snap_names);
d1d25646
JD
1876 /* osd requests may still refer to snapc */
1877 ceph_put_snap_context(rbd_dev->header.snapc);
602adf40 1878
b813623a
AE
1879 if (hver)
1880 *hver = h.obj_version;
a71b891b 1881 rbd_dev->header.obj_version = h.obj_version;
93a24e08 1882 rbd_dev->header.image_size = h.image_size;
602adf40
YS
1883 rbd_dev->header.snapc = h.snapc;
1884 rbd_dev->header.snap_names = h.snap_names;
1885 rbd_dev->header.snap_sizes = h.snap_sizes;
849b4260
AE
1886 /* Free the extra copy of the object prefix */
1887 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1888 kfree(h.object_prefix);
1889
304f6808
AE
1890 ret = rbd_dev_snaps_update(rbd_dev);
1891 if (!ret)
1892 ret = rbd_dev_snaps_register(rbd_dev);
dfc5606d 1893
c666601a 1894 up_write(&rbd_dev->header_rwsem);
602adf40 1895
dfc5606d 1896 return ret;
602adf40
YS
1897}
1898
117973fb 1899static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
1fe5e993
AE
1900{
1901 int ret;
1902
117973fb 1903 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1fe5e993 1904 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
117973fb
AE
1905 if (rbd_dev->image_format == 1)
1906 ret = rbd_dev_v1_refresh(rbd_dev, hver);
1907 else
1908 ret = rbd_dev_v2_refresh(rbd_dev, hver);
1fe5e993
AE
1909 mutex_unlock(&ctl_mutex);
1910
1911 return ret;
1912}
1913
602adf40
YS
/*
 * Allocate and configure the gendisk and request queue for an rbd
 * device.  I/O limits are set to the rbd object size so requests
 * line up with object boundaries.  The disk is not yet added to the
 * system here.  Returns 0 or -ENOMEM.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;

	/* create gendisk info */
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* Keep bios from spanning object boundaries */
	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);

	return 0;
out_disk:
	put_disk(disk);

	return -ENOMEM;
}
1962
dfc5606d
YS
1963/*
1964 sysfs
1965*/
1966
593a9e7b
AE
/* Map a sysfs struct device back to its containing rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1971
dfc5606d
YS
1972static ssize_t rbd_size_show(struct device *dev,
1973 struct device_attribute *attr, char *buf)
1974{
593a9e7b 1975 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
a51aa0c0
JD
1976 sector_t size;
1977
1978 down_read(&rbd_dev->header_rwsem);
1979 size = get_capacity(rbd_dev->disk);
1980 up_read(&rbd_dev->header_rwsem);
dfc5606d 1981
a51aa0c0 1982 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
dfc5606d
YS
1983}
1984
34b13184
AE
1985/*
1986 * Note this shows the features for whatever's mapped, which is not
1987 * necessarily the base image.
1988 */
1989static ssize_t rbd_features_show(struct device *dev,
1990 struct device_attribute *attr, char *buf)
1991{
1992 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1993
1994 return sprintf(buf, "0x%016llx\n",
1995 (unsigned long long) rbd_dev->mapping.features);
1996}
1997
dfc5606d
YS
1998static ssize_t rbd_major_show(struct device *dev,
1999 struct device_attribute *attr, char *buf)
2000{
593a9e7b 2001 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
602adf40 2002
dfc5606d
YS
2003 return sprintf(buf, "%d\n", rbd_dev->major);
2004}
2005
2006static ssize_t rbd_client_id_show(struct device *dev,
2007 struct device_attribute *attr, char *buf)
602adf40 2008{
593a9e7b 2009 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2010
1dbb4399
AE
2011 return sprintf(buf, "client%lld\n",
2012 ceph_client_id(rbd_dev->rbd_client->client));
602adf40
YS
2013}
2014
dfc5606d
YS
2015static ssize_t rbd_pool_show(struct device *dev,
2016 struct device_attribute *attr, char *buf)
602adf40 2017{
593a9e7b 2018 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2019
0d7dbfce 2020 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
dfc5606d
YS
2021}
2022
9bb2f334
AE
2023static ssize_t rbd_pool_id_show(struct device *dev,
2024 struct device_attribute *attr, char *buf)
2025{
2026 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2027
0d7dbfce
AE
2028 return sprintf(buf, "%llu\n",
2029 (unsigned long long) rbd_dev->spec->pool_id);
9bb2f334
AE
2030}
2031
dfc5606d
YS
2032static ssize_t rbd_name_show(struct device *dev,
2033 struct device_attribute *attr, char *buf)
2034{
593a9e7b 2035 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2036
a92ffdf8
AE
2037 if (rbd_dev->spec->image_name)
2038 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2039
2040 return sprintf(buf, "(unknown)\n");
dfc5606d
YS
2041}
2042
589d30e0
AE
2043static ssize_t rbd_image_id_show(struct device *dev,
2044 struct device_attribute *attr, char *buf)
2045{
2046 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2047
0d7dbfce 2048 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
589d30e0
AE
2049}
2050
34b13184
AE
2051/*
2052 * Shows the name of the currently-mapped snapshot (or
2053 * RBD_SNAP_HEAD_NAME for the base image).
2054 */
dfc5606d
YS
2055static ssize_t rbd_snap_show(struct device *dev,
2056 struct device_attribute *attr,
2057 char *buf)
2058{
593a9e7b 2059 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2060
0d7dbfce 2061 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
dfc5606d
YS
2062}
2063
86b00e0d
AE
/*
 * For an rbd v2 image, shows the pool id, image id, and snapshot id
 * for the parent image.  If there is no parent, simply shows
 * "(no parent image)".
 *
 * Emits one key/value pair per line; each sprintf() result is
 * checked and the accumulated length is returned.
 */
static ssize_t rbd_parent_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	struct rbd_spec *spec = rbd_dev->parent_spec;
	int count;
	char *bufp = buf;

	if (!spec)
		return sprintf(buf, "(no parent image)\n");

	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
			(unsigned long long) spec->pool_id, spec->pool_name);
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
			spec->image_name ? spec->image_name : "(unknown)");
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
			(unsigned long long) spec->snap_id, spec->snap_name);
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
	if (count < 0)
		return count;
	bufp += count;

	return (ssize_t) (bufp - buf);
}
2106
dfc5606d
YS
2107static ssize_t rbd_image_refresh(struct device *dev,
2108 struct device_attribute *attr,
2109 const char *buf,
2110 size_t size)
2111{
593a9e7b 2112 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
b813623a 2113 int ret;
602adf40 2114
117973fb 2115 ret = rbd_dev_refresh(rbd_dev, NULL);
b813623a
AE
2116
2117 return ret < 0 ? ret : size;
dfc5606d 2118}
602adf40 2119
dfc5606d 2120static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
34b13184 2121static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
dfc5606d
YS
2122static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
2123static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
2124static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
9bb2f334 2125static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
dfc5606d 2126static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
589d30e0 2127static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
dfc5606d
YS
2128static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
2129static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
86b00e0d 2130static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
dfc5606d
YS
2131
2132static struct attribute *rbd_attrs[] = {
2133 &dev_attr_size.attr,
34b13184 2134 &dev_attr_features.attr,
dfc5606d
YS
2135 &dev_attr_major.attr,
2136 &dev_attr_client_id.attr,
2137 &dev_attr_pool.attr,
9bb2f334 2138 &dev_attr_pool_id.attr,
dfc5606d 2139 &dev_attr_name.attr,
589d30e0 2140 &dev_attr_image_id.attr,
dfc5606d 2141 &dev_attr_current_snap.attr,
86b00e0d 2142 &dev_attr_parent.attr,
dfc5606d 2143 &dev_attr_refresh.attr,
dfc5606d
YS
2144 NULL
2145};
2146
2147static struct attribute_group rbd_attr_group = {
2148 .attrs = rbd_attrs,
2149};
2150
2151static const struct attribute_group *rbd_attr_groups[] = {
2152 &rbd_attr_group,
2153 NULL
2154};
2155
2156static void rbd_sysfs_dev_release(struct device *dev)
2157{
2158}
2159
2160static struct device_type rbd_device_type = {
2161 .name = "rbd",
2162 .groups = rbd_attr_groups,
2163 .release = rbd_sysfs_dev_release,
2164};
2165
2166
2167/*
2168 sysfs - snapshots
2169*/
2170
2171static ssize_t rbd_snap_size_show(struct device *dev,
2172 struct device_attribute *attr,
2173 char *buf)
2174{
2175 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2176
3591538f 2177 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
dfc5606d
YS
2178}
2179
2180static ssize_t rbd_snap_id_show(struct device *dev,
2181 struct device_attribute *attr,
2182 char *buf)
2183{
2184 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2185
3591538f 2186 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
dfc5606d
YS
2187}
2188
34b13184
AE
2189static ssize_t rbd_snap_features_show(struct device *dev,
2190 struct device_attribute *attr,
2191 char *buf)
2192{
2193 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2194
2195 return sprintf(buf, "0x%016llx\n",
2196 (unsigned long long) snap->features);
2197}
2198
dfc5606d
YS
/* Per-snapshot sysfs attributes (all read-only). */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};
2213
2214static void rbd_snap_dev_release(struct device *dev)
2215{
2216 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2217 kfree(snap->name);
2218 kfree(snap);
2219}
2220
static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

/* device_type for snapshot devices; release frees the rbd_snap. */
static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2230
8b8fb99c
AE
2231static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
2232{
2233 kref_get(&spec->kref);
2234
2235 return spec;
2236}
2237
2238static void rbd_spec_free(struct kref *kref);
2239static void rbd_spec_put(struct rbd_spec *spec)
2240{
2241 if (spec)
2242 kref_put(&spec->kref, rbd_spec_free);
2243}
2244
/*
 * Allocate a zeroed image spec with a reference count of 1.
 * Returns NULL on allocation failure.
 */
static struct rbd_spec *rbd_spec_alloc(void)
{
	struct rbd_spec *spec;

	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
	if (!spec)
		return NULL;
	kref_init(&spec->kref);

	/*
	 * Net no-op on the refcount; presumably keeps rbd_spec_get()
	 * referenced until real callers exist (avoids a "defined but
	 * not used" warning) — TODO confirm before removing.
	 */
	rbd_spec_put(rbd_spec_get(spec));	/* TEMPORARY */

	return spec;
}
2258
/*
 * kref release callback: frees an image spec and all of its
 * (individually-allocated) name strings.  kfree(NULL) is a no-op,
 * so partially-populated specs are handled too.
 */
static void rbd_spec_free(struct kref *kref)
{
	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);

	kfree(spec->pool_name);
	kfree(spec->image_id);
	kfree(spec->image_name);
	kfree(spec->snap_name);
	kfree(spec);
}
2269
c53d5893
AE
/*
 * Allocate and initialize an rbd_device.  Takes ownership of the
 * caller's references on both rbdc and spec (they are dropped in
 * rbd_dev_destroy()).  Returns NULL on allocation failure.
 */
struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
				struct rbd_spec *spec)
{
	struct rbd_device *rbd_dev;

	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		return NULL;

	spin_lock_init(&rbd_dev->lock);
	atomic_set(&rbd_dev->exists, 0);
	INIT_LIST_HEAD(&rbd_dev->node);
	INIT_LIST_HEAD(&rbd_dev->snaps);
	init_rwsem(&rbd_dev->header_rwsem);

	rbd_dev->spec = spec;
	rbd_dev->rbd_client = rbdc;

	/* Initialize the layout used for all rbd requests */

	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);

	return rbd_dev;
}
2297
/*
 * Tear down an rbd_device created by rbd_dev_create(): drops the
 * parent/own spec references and the client reference, then frees
 * the device itself.
 */
static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
	rbd_spec_put(rbd_dev->parent_spec);
	kfree(rbd_dev->header_name);
	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev);
}
2306
304f6808
AE
/*
 * Report whether a snapshot's device has been registered.  The device
 * type is only assigned at registration time, so the two conditions
 * must agree; the assertion catches any mismatch.
 */
static bool rbd_snap_registered(struct rbd_snap *snap)
{
	bool ret = snap->dev.type == &rbd_snap_device_type;
	bool reg = device_is_registered(&snap->dev);

	rbd_assert(!ret ^ reg);	/* ret and reg must be equal */

	return ret;
}
2316
41f38c2b 2317static void rbd_remove_snap_dev(struct rbd_snap *snap)
dfc5606d
YS
2318{
2319 list_del(&snap->node);
304f6808
AE
2320 if (device_is_registered(&snap->dev))
2321 device_unregister(&snap->dev);
dfc5606d
YS
2322}
2323
14e7085d 2324static int rbd_register_snap_dev(struct rbd_snap *snap,
dfc5606d
YS
2325 struct device *parent)
2326{
2327 struct device *dev = &snap->dev;
2328 int ret;
2329
2330 dev->type = &rbd_snap_device_type;
2331 dev->parent = parent;
2332 dev->release = rbd_snap_dev_release;
d4b125e9 2333 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
304f6808
AE
2334 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2335
dfc5606d
YS
2336 ret = device_register(dev);
2337
2338 return ret;
2339}
2340
4e891e0a 2341static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
c8d18425 2342 const char *snap_name,
34b13184
AE
2343 u64 snap_id, u64 snap_size,
2344 u64 snap_features)
dfc5606d 2345{
4e891e0a 2346 struct rbd_snap *snap;
dfc5606d 2347 int ret;
4e891e0a
AE
2348
2349 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
dfc5606d 2350 if (!snap)
4e891e0a
AE
2351 return ERR_PTR(-ENOMEM);
2352
2353 ret = -ENOMEM;
c8d18425 2354 snap->name = kstrdup(snap_name, GFP_KERNEL);
4e891e0a
AE
2355 if (!snap->name)
2356 goto err;
2357
c8d18425
AE
2358 snap->id = snap_id;
2359 snap->size = snap_size;
34b13184 2360 snap->features = snap_features;
4e891e0a
AE
2361
2362 return snap;
2363
dfc5606d
YS
2364err:
2365 kfree(snap->name);
2366 kfree(snap);
4e891e0a
AE
2367
2368 return ERR_PTR(ret);
dfc5606d
YS
2369}
2370
cd892126
AE
2371static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2372 u64 *snap_size, u64 *snap_features)
2373{
2374 char *snap_name;
2375
2376 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2377
2378 *snap_size = rbd_dev->header.snap_sizes[which];
2379 *snap_features = 0; /* No features for v1 */
2380
2381 /* Skip over names until we find the one we are looking for */
2382
2383 snap_name = rbd_dev->header.snap_names;
2384 while (which--)
2385 snap_name += strlen(snap_name) + 1;
2386
2387 return snap_name;
2388}
2389
9d475de5
AE
2390/*
2391 * Get the size and object order for an image snapshot, or if
2392 * snap_id is CEPH_NOSNAP, gets this information for the base
2393 * image.
2394 */
2395static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
2396 u8 *order, u64 *snap_size)
2397{
2398 __le64 snapid = cpu_to_le64(snap_id);
2399 int ret;
2400 struct {
2401 u8 order;
2402 __le64 size;
2403 } __attribute__ ((packed)) size_buf = { 0 };
2404
2405 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2406 "rbd", "get_size",
2407 (char *) &snapid, sizeof (snapid),
07b2391f 2408 (char *) &size_buf, sizeof (size_buf), NULL);
9d475de5
AE
2409 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2410 if (ret < 0)
2411 return ret;
2412
2413 *order = size_buf.order;
2414 *snap_size = le64_to_cpu(size_buf.size);
2415
2416 dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
2417 (unsigned long long) snap_id, (unsigned int) *order,
2418 (unsigned long long) *snap_size);
2419
2420 return 0;
2421}
2422
/* Fetch the base image's size and object order into the header. */
static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}
2429
1e130199
AE
2430static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
2431{
2432 void *reply_buf;
2433 int ret;
2434 void *p;
2435
2436 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
2437 if (!reply_buf)
2438 return -ENOMEM;
2439
2440 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2441 "rbd", "get_object_prefix",
2442 NULL, 0,
07b2391f 2443 reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
1e130199
AE
2444 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2445 if (ret < 0)
2446 goto out;
a0ea3a40 2447 ret = 0; /* rbd_req_sync_exec() can return positive */
1e130199
AE
2448
2449 p = reply_buf;
2450 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
2451 p + RBD_OBJ_PREFIX_LEN_MAX,
2452 NULL, GFP_NOIO);
2453
2454 if (IS_ERR(rbd_dev->header.object_prefix)) {
2455 ret = PTR_ERR(rbd_dev->header.object_prefix);
2456 rbd_dev->header.object_prefix = NULL;
2457 } else {
2458 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
2459 }
2460
2461out:
2462 kfree(reply_buf);
2463
2464 return ret;
2465}
2466
b1b5402a
AE
2467static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2468 u64 *snap_features)
2469{
2470 __le64 snapid = cpu_to_le64(snap_id);
2471 struct {
2472 __le64 features;
2473 __le64 incompat;
2474 } features_buf = { 0 };
d889140c 2475 u64 incompat;
b1b5402a
AE
2476 int ret;
2477
2478 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2479 "rbd", "get_features",
2480 (char *) &snapid, sizeof (snapid),
2481 (char *) &features_buf, sizeof (features_buf),
07b2391f 2482 NULL);
b1b5402a
AE
2483 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2484 if (ret < 0)
2485 return ret;
d889140c
AE
2486
2487 incompat = le64_to_cpu(features_buf.incompat);
2488 if (incompat & ~RBD_FEATURES_ALL)
b8f5c6ed 2489 return -ENXIO;
d889140c 2490
b1b5402a
AE
2491 *snap_features = le64_to_cpu(features_buf.features);
2492
2493 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
2494 (unsigned long long) snap_id,
2495 (unsigned long long) *snap_features,
2496 (unsigned long long) le64_to_cpu(features_buf.incompat));
2497
2498 return 0;
2499}
2500
/* Fetch the base image's feature bits into the header. */
static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
						&rbd_dev->header.features);
}
2506
86b00e0d
AE
2507static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
2508{
2509 struct rbd_spec *parent_spec;
2510 size_t size;
2511 void *reply_buf = NULL;
2512 __le64 snapid;
2513 void *p;
2514 void *end;
2515 char *image_id;
2516 u64 overlap;
86b00e0d
AE
2517 int ret;
2518
2519 parent_spec = rbd_spec_alloc();
2520 if (!parent_spec)
2521 return -ENOMEM;
2522
2523 size = sizeof (__le64) + /* pool_id */
2524 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
2525 sizeof (__le64) + /* snap_id */
2526 sizeof (__le64); /* overlap */
2527 reply_buf = kmalloc(size, GFP_KERNEL);
2528 if (!reply_buf) {
2529 ret = -ENOMEM;
2530 goto out_err;
2531 }
2532
2533 snapid = cpu_to_le64(CEPH_NOSNAP);
2534 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2535 "rbd", "get_parent",
2536 (char *) &snapid, sizeof (snapid),
07b2391f 2537 (char *) reply_buf, size, NULL);
86b00e0d
AE
2538 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2539 if (ret < 0)
2540 goto out_err;
2541
2542 ret = -ERANGE;
2543 p = reply_buf;
2544 end = (char *) reply_buf + size;
2545 ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
2546 if (parent_spec->pool_id == CEPH_NOPOOL)
2547 goto out; /* No parent? No problem. */
2548
0903e875
AE
2549 /* The ceph file layout needs to fit pool id in 32 bits */
2550
2551 ret = -EIO;
2552 if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
2553 goto out;
2554
979ed480 2555 image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
86b00e0d
AE
2556 if (IS_ERR(image_id)) {
2557 ret = PTR_ERR(image_id);
2558 goto out_err;
2559 }
2560 parent_spec->image_id = image_id;
2561 ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
2562 ceph_decode_64_safe(&p, end, overlap, out_err);
2563
2564 rbd_dev->parent_overlap = overlap;
2565 rbd_dev->parent_spec = parent_spec;
2566 parent_spec = NULL; /* rbd_dev now owns this */
2567out:
2568 ret = 0;
2569out_err:
2570 kfree(reply_buf);
2571 rbd_spec_put(parent_spec);
2572
2573 return ret;
2574}
2575
9e15b77d
AE
2576static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
2577{
2578 size_t image_id_size;
2579 char *image_id;
2580 void *p;
2581 void *end;
2582 size_t size;
2583 void *reply_buf = NULL;
2584 size_t len = 0;
2585 char *image_name = NULL;
2586 int ret;
2587
2588 rbd_assert(!rbd_dev->spec->image_name);
2589
69e7a02f
AE
2590 len = strlen(rbd_dev->spec->image_id);
2591 image_id_size = sizeof (__le32) + len;
9e15b77d
AE
2592 image_id = kmalloc(image_id_size, GFP_KERNEL);
2593 if (!image_id)
2594 return NULL;
2595
2596 p = image_id;
2597 end = (char *) image_id + image_id_size;
69e7a02f 2598 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);
9e15b77d
AE
2599
2600 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
2601 reply_buf = kmalloc(size, GFP_KERNEL);
2602 if (!reply_buf)
2603 goto out;
2604
2605 ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY,
2606 "rbd", "dir_get_name",
2607 image_id, image_id_size,
07b2391f 2608 (char *) reply_buf, size, NULL);
9e15b77d
AE
2609 if (ret < 0)
2610 goto out;
2611 p = reply_buf;
2612 end = (char *) reply_buf + size;
2613 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
2614 if (IS_ERR(image_name))
2615 image_name = NULL;
2616 else
2617 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
2618out:
2619 kfree(reply_buf);
2620 kfree(image_id);
2621
2622 return image_name;
2623}
2624
2625/*
2626 * When a parent image gets probed, we only have the pool, image,
2627 * and snapshot ids but not the names of any of them. This call
2628 * is made later to fill in those names. It has to be done after
2629 * rbd_dev_snaps_update() has completed because some of the
2630 * information (in particular, snapshot name) is not available
2631 * until then.
2632 */
2633static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
2634{
2635 struct ceph_osd_client *osdc;
2636 const char *name;
2637 void *reply_buf = NULL;
2638 int ret;
2639
2640 if (rbd_dev->spec->pool_name)
2641 return 0; /* Already have the names */
2642
2643 /* Look up the pool name */
2644
2645 osdc = &rbd_dev->rbd_client->client->osdc;
2646 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
935dc89f
AE
2647 if (!name) {
2648 rbd_warn(rbd_dev, "there is no pool with id %llu",
2649 rbd_dev->spec->pool_id); /* Really a BUG() */
2650 return -EIO;
2651 }
9e15b77d
AE
2652
2653 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
2654 if (!rbd_dev->spec->pool_name)
2655 return -ENOMEM;
2656
2657 /* Fetch the image name; tolerate failure here */
2658
2659 name = rbd_dev_image_name(rbd_dev);
69e7a02f 2660 if (name)
9e15b77d 2661 rbd_dev->spec->image_name = (char *) name;
69e7a02f 2662 else
06ecc6cb 2663 rbd_warn(rbd_dev, "unable to get image name");
9e15b77d
AE
2664
2665 /* Look up the snapshot name. */
2666
2667 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
2668 if (!name) {
935dc89f
AE
2669 rbd_warn(rbd_dev, "no snapshot with id %llu",
2670 rbd_dev->spec->snap_id); /* Really a BUG() */
9e15b77d
AE
2671 ret = -EIO;
2672 goto out_err;
2673 }
2674 rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
2675 if(!rbd_dev->spec->snap_name)
2676 goto out_err;
2677
2678 return 0;
2679out_err:
2680 kfree(reply_buf);
2681 kfree(rbd_dev->spec->pool_name);
2682 rbd_dev->spec->pool_name = NULL;
2683
2684 return ret;
2685}
2686
6e14b1a6 2687static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
35d489f9
AE
2688{
2689 size_t size;
2690 int ret;
2691 void *reply_buf;
2692 void *p;
2693 void *end;
2694 u64 seq;
2695 u32 snap_count;
2696 struct ceph_snap_context *snapc;
2697 u32 i;
2698
2699 /*
2700 * We'll need room for the seq value (maximum snapshot id),
2701 * snapshot count, and array of that many snapshot ids.
2702 * For now we have a fixed upper limit on the number we're
2703 * prepared to receive.
2704 */
2705 size = sizeof (__le64) + sizeof (__le32) +
2706 RBD_MAX_SNAP_COUNT * sizeof (__le64);
2707 reply_buf = kzalloc(size, GFP_KERNEL);
2708 if (!reply_buf)
2709 return -ENOMEM;
2710
2711 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2712 "rbd", "get_snapcontext",
2713 NULL, 0,
07b2391f 2714 reply_buf, size, ver);
35d489f9
AE
2715 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2716 if (ret < 0)
2717 goto out;
2718
2719 ret = -ERANGE;
2720 p = reply_buf;
2721 end = (char *) reply_buf + size;
2722 ceph_decode_64_safe(&p, end, seq, out);
2723 ceph_decode_32_safe(&p, end, snap_count, out);
2724
2725 /*
2726 * Make sure the reported number of snapshot ids wouldn't go
2727 * beyond the end of our buffer. But before checking that,
2728 * make sure the computed size of the snapshot context we
2729 * allocate is representable in a size_t.
2730 */
2731 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
2732 / sizeof (u64)) {
2733 ret = -EINVAL;
2734 goto out;
2735 }
2736 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
2737 goto out;
2738
2739 size = sizeof (struct ceph_snap_context) +
2740 snap_count * sizeof (snapc->snaps[0]);
2741 snapc = kmalloc(size, GFP_KERNEL);
2742 if (!snapc) {
2743 ret = -ENOMEM;
2744 goto out;
2745 }
2746
2747 atomic_set(&snapc->nref, 1);
2748 snapc->seq = seq;
2749 snapc->num_snaps = snap_count;
2750 for (i = 0; i < snap_count; i++)
2751 snapc->snaps[i] = ceph_decode_64(&p);
2752
2753 rbd_dev->header.snapc = snapc;
2754
2755 dout(" snap context seq = %llu, snap_count = %u\n",
2756 (unsigned long long) seq, (unsigned int) snap_count);
2757
2758out:
2759 kfree(reply_buf);
2760
2761 return 0;
2762}
2763
b8b1e2db
AE
2764static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
2765{
2766 size_t size;
2767 void *reply_buf;
2768 __le64 snap_id;
2769 int ret;
2770 void *p;
2771 void *end;
b8b1e2db
AE
2772 char *snap_name;
2773
2774 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
2775 reply_buf = kmalloc(size, GFP_KERNEL);
2776 if (!reply_buf)
2777 return ERR_PTR(-ENOMEM);
2778
2779 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
2780 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2781 "rbd", "get_snapshot_name",
2782 (char *) &snap_id, sizeof (snap_id),
07b2391f 2783 reply_buf, size, NULL);
b8b1e2db
AE
2784 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2785 if (ret < 0)
2786 goto out;
2787
2788 p = reply_buf;
2789 end = (char *) reply_buf + size;
e5c35534 2790 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
b8b1e2db
AE
2791 if (IS_ERR(snap_name)) {
2792 ret = PTR_ERR(snap_name);
2793 goto out;
2794 } else {
2795 dout(" snap_id 0x%016llx snap_name = %s\n",
2796 (unsigned long long) le64_to_cpu(snap_id), snap_name);
2797 }
2798 kfree(reply_buf);
2799
2800 return snap_name;
2801out:
2802 kfree(reply_buf);
2803
2804 return ERR_PTR(ret);
2805}
2806
2807static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2808 u64 *snap_size, u64 *snap_features)
2809{
e0b49868 2810 u64 snap_id;
b8b1e2db
AE
2811 u8 order;
2812 int ret;
2813
2814 snap_id = rbd_dev->header.snapc->snaps[which];
2815 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2816 if (ret)
2817 return ERR_PTR(ret);
2818 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2819 if (ret)
2820 return ERR_PTR(ret);
2821
2822 return rbd_dev_v2_snap_name(rbd_dev, which);
2823}
2824
2825static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2826 u64 *snap_size, u64 *snap_features)
2827{
2828 if (rbd_dev->image_format == 1)
2829 return rbd_dev_v1_snap_info(rbd_dev, which,
2830 snap_size, snap_features);
2831 if (rbd_dev->image_format == 2)
2832 return rbd_dev_v2_snap_info(rbd_dev, which,
2833 snap_size, snap_features);
2834 return ERR_PTR(-EINVAL);
2835}
2836
117973fb
AE
2837static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2838{
2839 int ret;
2840 __u8 obj_order;
2841
2842 down_write(&rbd_dev->header_rwsem);
2843
2844 /* Grab old order first, to see if it changes */
2845
2846 obj_order = rbd_dev->header.obj_order,
2847 ret = rbd_dev_v2_image_size(rbd_dev);
2848 if (ret)
2849 goto out;
2850 if (rbd_dev->header.obj_order != obj_order) {
2851 ret = -EIO;
2852 goto out;
2853 }
2854 rbd_update_mapping_size(rbd_dev);
2855
2856 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2857 dout("rbd_dev_v2_snap_context returned %d\n", ret);
2858 if (ret)
2859 goto out;
2860 ret = rbd_dev_snaps_update(rbd_dev);
2861 dout("rbd_dev_snaps_update returned %d\n", ret);
2862 if (ret)
2863 goto out;
2864 ret = rbd_dev_snaps_register(rbd_dev);
2865 dout("rbd_dev_snaps_register returned %d\n", ret);
2866out:
2867 up_write(&rbd_dev->header_rwsem);
2868
2869 return ret;
2870}
2871
dfc5606d 2872/*
35938150
AE
2873 * Scan the rbd device's current snapshot list and compare it to the
2874 * newly-received snapshot context. Remove any existing snapshots
2875 * not present in the new snapshot context. Add a new snapshot for
2876 * any snaphots in the snapshot context not in the current list.
2877 * And verify there are no changes to snapshots we already know
2878 * about.
2879 *
2880 * Assumes the snapshots in the snapshot context are sorted by
2881 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2882 * are also maintained in that order.)
dfc5606d 2883 */
304f6808 2884static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
dfc5606d 2885{
35938150
AE
2886 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2887 const u32 snap_count = snapc->num_snaps;
35938150
AE
2888 struct list_head *head = &rbd_dev->snaps;
2889 struct list_head *links = head->next;
2890 u32 index = 0;
dfc5606d 2891
9fcbb800 2892 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
35938150
AE
2893 while (index < snap_count || links != head) {
2894 u64 snap_id;
2895 struct rbd_snap *snap;
cd892126
AE
2896 char *snap_name;
2897 u64 snap_size = 0;
2898 u64 snap_features = 0;
dfc5606d 2899
35938150
AE
2900 snap_id = index < snap_count ? snapc->snaps[index]
2901 : CEPH_NOSNAP;
2902 snap = links != head ? list_entry(links, struct rbd_snap, node)
2903 : NULL;
aafb230e 2904 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
dfc5606d 2905
35938150
AE
2906 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2907 struct list_head *next = links->next;
dfc5606d 2908
35938150 2909 /* Existing snapshot not in the new snap context */
dfc5606d 2910
0d7dbfce 2911 if (rbd_dev->spec->snap_id == snap->id)
d78b650a 2912 atomic_set(&rbd_dev->exists, 0);
41f38c2b 2913 rbd_remove_snap_dev(snap);
9fcbb800 2914 dout("%ssnap id %llu has been removed\n",
0d7dbfce
AE
2915 rbd_dev->spec->snap_id == snap->id ?
2916 "mapped " : "",
9fcbb800 2917 (unsigned long long) snap->id);
35938150
AE
2918
2919 /* Done with this list entry; advance */
2920
2921 links = next;
dfc5606d
YS
2922 continue;
2923 }
35938150 2924
b8b1e2db
AE
2925 snap_name = rbd_dev_snap_info(rbd_dev, index,
2926 &snap_size, &snap_features);
cd892126
AE
2927 if (IS_ERR(snap_name))
2928 return PTR_ERR(snap_name);
2929
9fcbb800
AE
2930 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2931 (unsigned long long) snap_id);
35938150
AE
2932 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2933 struct rbd_snap *new_snap;
2934
2935 /* We haven't seen this snapshot before */
2936
c8d18425 2937 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
cd892126 2938 snap_id, snap_size, snap_features);
9fcbb800
AE
2939 if (IS_ERR(new_snap)) {
2940 int err = PTR_ERR(new_snap);
2941
2942 dout(" failed to add dev, error %d\n", err);
2943
2944 return err;
2945 }
35938150
AE
2946
2947 /* New goes before existing, or at end of list */
2948
9fcbb800 2949 dout(" added dev%s\n", snap ? "" : " at end\n");
35938150
AE
2950 if (snap)
2951 list_add_tail(&new_snap->node, &snap->node);
2952 else
523f3258 2953 list_add_tail(&new_snap->node, head);
35938150
AE
2954 } else {
2955 /* Already have this one */
2956
9fcbb800
AE
2957 dout(" already present\n");
2958
cd892126 2959 rbd_assert(snap->size == snap_size);
aafb230e 2960 rbd_assert(!strcmp(snap->name, snap_name));
cd892126 2961 rbd_assert(snap->features == snap_features);
35938150
AE
2962
2963 /* Done with this list entry; advance */
2964
2965 links = links->next;
dfc5606d 2966 }
35938150
AE
2967
2968 /* Advance to the next entry in the snapshot context */
2969
2970 index++;
dfc5606d 2971 }
9fcbb800 2972 dout("%s: done\n", __func__);
dfc5606d
YS
2973
2974 return 0;
2975}
2976
304f6808
AE
2977/*
2978 * Scan the list of snapshots and register the devices for any that
2979 * have not already been registered.
2980 */
2981static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2982{
2983 struct rbd_snap *snap;
2984 int ret = 0;
2985
2986 dout("%s called\n", __func__);
86ff77bb
AE
2987 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2988 return -EIO;
304f6808
AE
2989
2990 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2991 if (!rbd_snap_registered(snap)) {
2992 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2993 if (ret < 0)
2994 break;
2995 }
2996 }
2997 dout("%s: returning %d\n", __func__, ret);
2998
2999 return ret;
3000}
3001
dfc5606d
YS
3002static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
3003{
dfc5606d 3004 struct device *dev;
cd789ab9 3005 int ret;
dfc5606d
YS
3006
3007 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
dfc5606d 3008
cd789ab9 3009 dev = &rbd_dev->dev;
dfc5606d
YS
3010 dev->bus = &rbd_bus_type;
3011 dev->type = &rbd_device_type;
3012 dev->parent = &rbd_root_dev;
3013 dev->release = rbd_dev_release;
de71a297 3014 dev_set_name(dev, "%d", rbd_dev->dev_id);
dfc5606d 3015 ret = device_register(dev);
dfc5606d 3016
dfc5606d 3017 mutex_unlock(&ctl_mutex);
cd789ab9 3018
dfc5606d 3019 return ret;
602adf40
YS
3020}
3021
dfc5606d
YS
3022static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
3023{
3024 device_unregister(&rbd_dev->dev);
3025}
3026
e2839308 3027static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
1ddbe94e
AE
3028
3029/*
499afd5b
AE
3030 * Get a unique rbd identifier for the given new rbd_dev, and add
3031 * the rbd_dev to the global list. The minimum rbd id is 1.
1ddbe94e 3032 */
e2839308 3033static void rbd_dev_id_get(struct rbd_device *rbd_dev)
b7f23c36 3034{
e2839308 3035 rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
499afd5b
AE
3036
3037 spin_lock(&rbd_dev_list_lock);
3038 list_add_tail(&rbd_dev->node, &rbd_dev_list);
3039 spin_unlock(&rbd_dev_list_lock);
e2839308
AE
3040 dout("rbd_dev %p given dev id %llu\n", rbd_dev,
3041 (unsigned long long) rbd_dev->dev_id);
1ddbe94e 3042}
b7f23c36 3043
1ddbe94e 3044/*
499afd5b
AE
3045 * Remove an rbd_dev from the global list, and record that its
3046 * identifier is no longer in use.
1ddbe94e 3047 */
e2839308 3048static void rbd_dev_id_put(struct rbd_device *rbd_dev)
1ddbe94e 3049{
d184f6bf 3050 struct list_head *tmp;
de71a297 3051 int rbd_id = rbd_dev->dev_id;
d184f6bf
AE
3052 int max_id;
3053
aafb230e 3054 rbd_assert(rbd_id > 0);
499afd5b 3055
e2839308
AE
3056 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
3057 (unsigned long long) rbd_dev->dev_id);
499afd5b
AE
3058 spin_lock(&rbd_dev_list_lock);
3059 list_del_init(&rbd_dev->node);
d184f6bf
AE
3060
3061 /*
3062 * If the id being "put" is not the current maximum, there
3063 * is nothing special we need to do.
3064 */
e2839308 3065 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
d184f6bf
AE
3066 spin_unlock(&rbd_dev_list_lock);
3067 return;
3068 }
3069
3070 /*
3071 * We need to update the current maximum id. Search the
3072 * list to find out what it is. We're more likely to find
3073 * the maximum at the end, so search the list backward.
3074 */
3075 max_id = 0;
3076 list_for_each_prev(tmp, &rbd_dev_list) {
3077 struct rbd_device *rbd_dev;
3078
3079 rbd_dev = list_entry(tmp, struct rbd_device, node);
b213e0b1
AE
3080 if (rbd_dev->dev_id > max_id)
3081 max_id = rbd_dev->dev_id;
d184f6bf 3082 }
499afd5b 3083 spin_unlock(&rbd_dev_list_lock);
b7f23c36 3084
1ddbe94e 3085 /*
e2839308 3086 * The max id could have been updated by rbd_dev_id_get(), in
d184f6bf
AE
3087 * which case it now accurately reflects the new maximum.
3088 * Be careful not to overwrite the maximum value in that
3089 * case.
1ddbe94e 3090 */
e2839308
AE
3091 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
3092 dout(" max dev id has been reset\n");
b7f23c36
AE
3093}
3094
e28fff26
AE
3095/*
3096 * Skips over white space at *buf, and updates *buf to point to the
3097 * first found non-space character (if any). Returns the length of
593a9e7b
AE
3098 * the token (string of non-white space characters) found. Note
3099 * that *buf must be terminated with '\0'.
e28fff26
AE
3100 */
3101static inline size_t next_token(const char **buf)
3102{
3103 /*
3104 * These are the characters that produce nonzero for
3105 * isspace() in the "C" and "POSIX" locales.
3106 */
3107 const char *spaces = " \f\n\r\t\v";
3108
3109 *buf += strspn(*buf, spaces); /* Find start of token */
3110
3111 return strcspn(*buf, spaces); /* Return token length */
3112}
3113
3114/*
3115 * Finds the next token in *buf, and if the provided token buffer is
3116 * big enough, copies the found token into it. The result, if
593a9e7b
AE
3117 * copied, is guaranteed to be terminated with '\0'. Note that *buf
3118 * must be terminated with '\0' on entry.
e28fff26
AE
3119 *
3120 * Returns the length of the token found (not including the '\0').
3121 * Return value will be 0 if no token is found, and it will be >=
3122 * token_size if the token would not fit.
3123 *
593a9e7b 3124 * The *buf pointer will be updated to point beyond the end of the
e28fff26
AE
3125 * found token. Note that this occurs even if the token buffer is
3126 * too small to hold it.
3127 */
3128static inline size_t copy_token(const char **buf,
3129 char *token,
3130 size_t token_size)
3131{
3132 size_t len;
3133
3134 len = next_token(buf);
3135 if (len < token_size) {
3136 memcpy(token, *buf, len);
3137 *(token + len) = '\0';
3138 }
3139 *buf += len;
3140
3141 return len;
3142}
3143
ea3352f4
AE
3144/*
3145 * Finds the next token in *buf, dynamically allocates a buffer big
3146 * enough to hold a copy of it, and copies the token into the new
3147 * buffer. The copy is guaranteed to be terminated with '\0'. Note
3148 * that a duplicate buffer is created even for a zero-length token.
3149 *
3150 * Returns a pointer to the newly-allocated duplicate, or a null
3151 * pointer if memory for the duplicate was not available. If
3152 * the lenp argument is a non-null pointer, the length of the token
3153 * (not including the '\0') is returned in *lenp.
3154 *
3155 * If successful, the *buf pointer will be updated to point beyond
3156 * the end of the found token.
3157 *
3158 * Note: uses GFP_KERNEL for allocation.
3159 */
3160static inline char *dup_token(const char **buf, size_t *lenp)
3161{
3162 char *dup;
3163 size_t len;
3164
3165 len = next_token(buf);
4caf35f9 3166 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
ea3352f4
AE
3167 if (!dup)
3168 return NULL;
ea3352f4
AE
3169 *(dup + len) = '\0';
3170 *buf += len;
3171
3172 if (lenp)
3173 *lenp = len;
3174
3175 return dup;
3176}
3177
/*
 * Parse the options provided for an "rbd add" (i.e., rbd image
 * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
 * and the data written is passed here via a NUL-terminated buffer.
 * Returns 0 if successful or an error code otherwise.
 *
 * The information extracted from these options is recorded in
 * the other parameters which return dynamically-allocated
 * structures:
 *  ceph_opts
 *      The address of a pointer that will refer to a ceph options
 *      structure.  Caller must release the returned pointer using
 *      ceph_destroy_options() when it is no longer needed.
 *  rbd_opts
 *	Address of an rbd options pointer.  Fully initialized by
 *	this function; caller must release with kfree().
 *  spec
 *	Address of an rbd image specification pointer.  Fully
 *	initialized by this function based on parsed options.
 *	Caller must release with rbd_spec_put().
 *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
 * where:
 *  <mon_addrs>
 *      A comma-separated list of one or more monitor addresses.
 *      A monitor address is an ip address, optionally followed
 *      by a port number (separated by a colon).
 *        I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *      A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *      The name of the rados pool containing the rbd image.
 *  <image_name>
 *      The name of the image in that pool to map.
 *  <snap_id>
 *      An optional snapshot id.  If provided, the mapping will
 *      present data from the image at the time that snapshot was
 *      created.  The image head is used if no snapshot id is
 *      provided.  Snapshot mappings are always read-only.
 */
static int rbd_add_parse_args(const char *buf,
				struct ceph_options **ceph_opts,
				struct rbd_options **opts,
				struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;
	size_t mon_addrs_size;
	struct rbd_spec *spec = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len) {
		rbd_warn(NULL, "no monitor address(es) provided");
		return -EINVAL;
	}
	/*
	 * Monitor addresses are not copied here; ceph_parse_options()
	 * below is handed the [mon_addrs, mon_addrs + len) range.
	 */
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	ret = -EINVAL;
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options) {
		rbd_warn(NULL, "no options provided");
		goto out_err;
	}

	spec = rbd_spec_alloc();
	if (!spec)
		goto out_mem;

	spec->pool_name = dup_token(&buf, NULL);
	if (!spec->pool_name)
		goto out_mem;
	if (!*spec->pool_name) {
		rbd_warn(NULL, "no pool name provided");
		goto out_err;
	}

	spec->image_name = dup_token(&buf, NULL);
	if (!spec->image_name)
		goto out_mem;
	if (!*spec->image_name) {
		rbd_warn(NULL, "no image name provided");
		goto out_err;
	}

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
	if (!spec->snap_name)
		goto out_mem;
	*(spec->snap_name + len) = '\0';

	/* Initialize all rbd options to the defaults */

	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		goto out_mem;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	copts = ceph_parse_options(options, mon_addrs,
					mon_addrs + mon_addrs_size - 1,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	kfree(options);

	/* Success: hand all three allocations to the caller */
	*ceph_opts = copts;
	*opts = rbd_opts;
	*rbd_spec = spec;

	return 0;
out_mem:
	ret = -ENOMEM;
out_err:
	kfree(rbd_opts);
	rbd_spec_put(spec);
	kfree(options);

	return ret;
}
3321
/*
 * An rbd format 2 image has a unique identifier, distinct from the
 * name given to it by the user.  Internally, that identifier is
 * what's used to specify the names of objects related to the image.
 *
 * A special "rbd id" object is used to map an rbd image name to its
 * id.  If that object doesn't exist, then there is no v2 rbd image
 * with the supplied name.
 *
 * This function will record the given rbd_dev's image_id field if
 * it can be determined, and in that case will return 0.  If any
 * errors occur a negative errno will be returned and the rbd_dev's
 * image_id field will be unchanged (and should be NULL).
 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.
	 */
	if (rbd_dev->spec->image_id)
		return 0;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	ret = rbd_req_sync_exec(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	/* Decode the length-prefixed string into a fresh allocation */
	p = response;
	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						NULL, GFP_NOIO);
	if (IS_ERR(rbd_dev->spec->image_id)) {
		ret = PTR_ERR(rbd_dev->spec->image_id);
		rbd_dev->spec->image_id = NULL;
	} else {
		dout("image_id is %s\n", rbd_dev->spec->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
3397
/*
 * Probe the device as a format 1 (original) rbd image: record an
 * empty image id, build the header object name from the image name,
 * and populate rbd_dev->header from the on-disk header object.
 * Returns 0 on success or a negative errno; on error the image_id
 * and header_name fields are left NULL.
 */
static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;

	/* Version 1 images have no id; empty string is used */

	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
	if (!rbd_dev->spec->image_id)
		return -ENOMEM;

	/* Record the header object name for this rbd image. */

	size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name) {
		ret = -ENOMEM;
		goto out_err;
	}
	sprintf(rbd_dev->header_name, "%s%s",
		rbd_dev->spec->image_name, RBD_SUFFIX);

	/* Populate rbd image metadata */

	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (ret < 0)
		goto out_err;

	/* Version 1 images have no parent (no layering) */

	rbd_dev->parent_spec = NULL;
	rbd_dev->parent_overlap = 0;

	rbd_dev->image_format = 1;

	dout("discovered version 1 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;

out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;

	return ret;
}
3446
/*
 * Probe the device as a format 2 image: build the header object
 * name from the (already-determined) image id, then fetch the image
 * size/order, object prefix, features, optional parent info, and
 * snapshot context.  Returns 0 on success or a negative errno; on
 * error everything set up here is torn down again.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->spec->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* If the image supports layering, get the parent info */

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	/* Tear down, in reverse order, everything set up above */
	rbd_dev->parent_overlap = 0;
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3519
/*
 * Finish activating a freshly-probed image: update its snapshot
 * list and mapping, assign a device id, register the block device
 * and sysfs entries, start the header watch, and finally announce
 * the disk.  Returns 0 on success or a negative errno, undoing in
 * reverse order whatever had been set up before the failure.
 */
static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
{
	int ret;

	/* no need to lock here, as rbd_dev is not registered yet */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_probe_update_spec(rbd_dev);
	if (ret)
		goto err_out_snaps;

	ret = rbd_dev_set_mapping(rbd_dev);
	if (ret)
		goto err_out_snaps;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	ret = register_blkdev(0, rbd_dev->name);
	if (ret < 0)
		goto err_out_id;
	rbd_dev->major = ret;

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_bus_add_dev(rbd_dev);
	if (ret)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */
	down_write(&rbd_dev->header_rwsem);
	ret = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (ret)
		goto err_out_bus;

	ret = rbd_req_sync_watch(rbd_dev, 1);
	if (ret)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return ret;
err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);

	return ret;
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);

	return ret;
}
3601
/*
 * Probe for the existence of the header object for the given rbd
 * device.  For format 2 images this includes determining the image
 * id.
 */
static int rbd_dev_probe(struct rbd_device *rbd_dev)
{
	int ret;

	/*
	 * Get the id from the image id object.  If it's not a
	 * format 2 image, we'll get ENOENT back, and we'll assume
	 * it's a format 1 image.
	 */
	ret = rbd_dev_image_id(rbd_dev);
	if (ret)
		ret = rbd_dev_v1_probe(rbd_dev);
	else
		ret = rbd_dev_v2_probe(rbd_dev);
	if (ret) {
		dout("probe failed, returning %d\n", ret);

		return ret;
	}

	/* Header probed OK; activate the device (undo header on failure) */
	ret = rbd_dev_probe_finish(rbd_dev);
	if (ret)
		rbd_header_free(&rbd_dev->header);

	return ret;
}
3633
/*
 * Handle a write to /sys/bus/rbd/add: parse the argument buffer,
 * get (or reuse) a ceph client, resolve the pool name to an id,
 * create the rbd_device, and probe/activate the image.  Returns
 * count on success or a negative errno.
 *
 * Ownership of ceph_opts, rbdc and spec is handed off step by step
 * (each pointer is NULLed once the rbd_dev owns it), so the error
 * path only releases what is still ours.
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto err_out_module;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}
	ceph_opts = NULL;	/* rbd_dev client now owns this */

	/* pick the pool */
	osdc = &rbdc->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
	if (rc < 0)
		goto err_out_client;
	spec->pool_id = (u64) rc;

	/* The ceph file layout needs to fit pool id in 32 bits */

	if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
		rc = -EIO;
		goto err_out_client;
	}

	rbd_dev = rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		goto err_out_client;
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */

	rbd_dev->mapping.read_only = rbd_opts->read_only;
	kfree(rbd_opts);
	rbd_opts = NULL;	/* done with this */

	rc = rbd_dev_probe(rbd_dev);
	if (rc < 0)
		goto err_out_rbd_dev;

	return count;
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	kfree(rbd_opts);
	rbd_spec_put(spec);
err_out_module:
	module_put(THIS_MODULE);

	dout("Error adding device %s\n", buf);

	return (ssize_t) rc;
}
3706
de71a297 3707static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
602adf40
YS
3708{
3709 struct list_head *tmp;
3710 struct rbd_device *rbd_dev;
3711
e124a82f 3712 spin_lock(&rbd_dev_list_lock);
602adf40
YS
3713 list_for_each(tmp, &rbd_dev_list) {
3714 rbd_dev = list_entry(tmp, struct rbd_device, node);
de71a297 3715 if (rbd_dev->dev_id == dev_id) {
e124a82f 3716 spin_unlock(&rbd_dev_list_lock);
602adf40 3717 return rbd_dev;
e124a82f 3718 }
602adf40 3719 }
e124a82f 3720 spin_unlock(&rbd_dev_list_lock);
602adf40
YS
3721 return NULL;
3722}
3723
/*
 * Device release callback (invoked by the driver core when the last
 * reference to the rbd device is dropped).  Tears down the header
 * watch, the block device, the header data, and the device id, then
 * drops the module reference taken in rbd_add().
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Stop watching the header object for changes */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_watch(rbd_dev, 0);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	rbd_dev_id_put(rbd_dev);
	rbd_assert(rbd_dev->rbd_client != NULL);
	rbd_dev_destroy(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
3752
dfc5606d
YS
3753static ssize_t rbd_remove(struct bus_type *bus,
3754 const char *buf,
3755 size_t count)
602adf40
YS
3756{
3757 struct rbd_device *rbd_dev = NULL;
3758 int target_id, rc;
3759 unsigned long ul;
3760 int ret = count;
3761
3762 rc = strict_strtoul(buf, 10, &ul);
3763 if (rc)
3764 return rc;
3765
3766 /* convert to int; abort if we lost anything in the conversion */
3767 target_id = (int) ul;
3768 if (target_id != ul)
3769 return -EINVAL;
3770
3771 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3772
3773 rbd_dev = __rbd_get_dev(target_id);
3774 if (!rbd_dev) {
3775 ret = -ENOENT;
3776 goto done;
42382b70
AE
3777 }
3778
3779 if (rbd_dev->open_count) {
3780 ret = -EBUSY;
3781 goto done;
602adf40
YS
3782 }
3783
41f38c2b 3784 rbd_remove_all_snaps(rbd_dev);
dfc5606d 3785 rbd_bus_del_dev(rbd_dev);
602adf40
YS
3786
3787done:
3788 mutex_unlock(&ctl_mutex);
aafb230e 3789
602adf40
YS
3790 return ret;
3791}
3792
602adf40
YS
3793/*
3794 * create control files in sysfs
dfc5606d 3795 * /sys/bus/rbd/...
602adf40
YS
3796 */
3797static int rbd_sysfs_init(void)
3798{
dfc5606d 3799 int ret;
602adf40 3800
fed4c143 3801 ret = device_register(&rbd_root_dev);
21079786 3802 if (ret < 0)
dfc5606d 3803 return ret;
602adf40 3804
fed4c143
AE
3805 ret = bus_register(&rbd_bus_type);
3806 if (ret < 0)
3807 device_unregister(&rbd_root_dev);
602adf40 3808
602adf40
YS
3809 return ret;
3810}
3811
/* Undo rbd_sysfs_init(): unregister the bus, then the root device */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
3817
3818int __init rbd_init(void)
3819{
3820 int rc;
3821
3822 rc = rbd_sysfs_init();
3823 if (rc)
3824 return rc;
f0f8cef5 3825 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
602adf40
YS
3826 return 0;
3827}
3828
/* Module exit: remove the sysfs control files */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
3833
3834module_init(rbd_init);
3835module_exit(rbd_exit);
3836
3837MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
3838MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
3839MODULE_DESCRIPTION("rados block device");
3840
3841/* following authorship retained from original osdblk.c */
3842MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
3843
3844MODULE_LICENSE("GPL");
This page took 0.381912 seconds and 5 git commands to generate.