rbd: define image request layered flag
drivers/block/rbd.c

/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

		 Documentation/ABI/testing/sysfs-bus-rbd

 */

#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/decode.h>
#include <linux/parser.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/blkdev.h>

#include "rbd_types.h"

#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define SECTOR_SHIFT	9
#define SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256	/* max minors per blkdev */

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN \
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */
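
/*
 * A quick sanity check on the value above (illustrative arithmetic,
 * not from the original source): each snapshot id in a
 * ceph_snap_context is 8 bytes, so 510 of them occupy
 * 510 * 8 = 4080 bytes, leaving room for the structure's fixed
 * header within a single 4KB allocation.
 */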

#define RBD_SNAP_HEAD_NAME	"-"

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

/* Feature bits */

#define RBD_FEATURE_LAYERING	(1<<0)
#define RBD_FEATURE_STRIPINGV2	(1<<1)
#define RBD_FEATURES_ALL \
	    (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(0)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
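
/*
 * A worked example of the bound above (illustrative): with a 4-byte
 * int, MAX_INT_FORMAT_WIDTH is (5 * 4) / 2 + 1 = 11 characters,
 * enough for the 10 decimal digits of the largest 32-bit value plus
 * a sign.
 */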

/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These fields never change for a given rbd image */
	char *object_prefix;
	u64 features;
	__u8 obj_order;
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;
	u64 *snap_sizes;

	u64 obj_version;
};

/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64		pool_id;
	char		*pool_name;

	char		*image_id;
	char		*image_name;

	u64		snap_id;
	char		*snap_name;

	struct kref	kref;
};
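
/*
 * Sharing a spec between parent and child is then just reference
 * counting.  A sketch (illustrative only; "rbd_spec_release" stands
 * in for whatever destructor actually frees the spec's strings):
 *
 *	kref_get(&spec->kref);
 *	child_rbd_dev->parent_spec = spec;
 *	...
 *	kref_put(&spec->kref, rbd_spec_release);
 */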

/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;
	struct list_head	node;
};

struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

#define BAD_WHICH	U32_MAX		/* Good which or bad which, which? */

struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

enum obj_request_type {
	OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
};

struct rbd_obj_request {
	const char		*object_name;
	u64			offset;		/* object start byte */
	u64			length;		/* bytes from offset */

	struct rbd_img_request	*img_request;
	u64			img_offset;	/* image relative offset */
	struct list_head	links;		/* img_request->obj_requests */
	u32			which;		/* posn in image request list */

	enum obj_request_type	type;
	union {
		struct bio	*bio_list;
		struct {
			struct page	**pages;
			u32		page_count;
		};
	};

	struct ceph_osd_request	*osd_req;

	u64			xferred;	/* bytes transferred */
	u64			version;
	int			result;
	atomic_t		done;

	rbd_obj_callback_t	callback;
	struct completion	completion;

	struct kref		kref;
};

enum img_req_flags {
	IMG_REQ_WRITE,		/* I/O direction: read = 0, write = 1 */
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};

struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	u64			offset;	/* starting image byte offset */
	u64			length;	/* byte count from offset */
	unsigned long		flags;
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	union {
		struct request		*rq;		/* block request */
		struct rbd_obj_request	*obj_request;	/* obj req initiator */
	};
	spinlock_t		completion_lock;/* protects next_completion */
	u32			next_completion;
	rbd_img_callback_t	callback;
	u64			xferred;/* aggregate bytes transferred */
	int			result;	/* first nonzero obj_request result */

	u32			obj_request_count;
	struct list_head	obj_requests;	/* rbd_obj_request structs */

	struct kref		kref;
};

#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
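
/*
 * Example usage (illustrative only): summing the bytes transferred
 * across all object requests belonging to an image request:
 *
 *	struct rbd_obj_request *obj_request;
 *	u64 xferred = 0;
 *
 *	for_each_obj_request(img_request, obj_request)
 *		xferred += obj_request->xferred;
 *
 * rbd_img_request_complete() below does exactly this.
 */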

struct rbd_snap {
	struct device		dev;
	const char		*name;
	u64			size;
	struct list_head	node;
	u64			id;
	u64			features;
};

struct rbd_mapping {
	u64                     size;
	u64                     features;
	bool			read_only;
};

/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;

	char			*header_name;

	struct ceph_file_layout	layout;

	struct ceph_osd_event   *watch_event;
	struct rbd_obj_request	*watch_request;

	struct rbd_spec		*parent_spec;
	u64			parent_overlap;

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* list of snapshots */
	struct list_head	snaps;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};

/*
 * Flag bits for rbd_dev->flags.  If atomicity is required,
 * rbd_dev->lock is used to protect access.
 *
 * Currently, only the "removing" flag (which is coupled with the
 * "open_count" field) requires atomic access.
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
};

static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);

static void rbd_dev_release(struct device *dev);
static void rbd_remove_snap_dev(struct rbd_snap *snap);

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);

static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};

static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}

#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */

static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);

static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	bool removing = false;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	spin_lock_irq(&rbd_dev->lock);
	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
		removing = true;
	else
		rbd_dev->open_count++;
	spin_unlock_irq(&rbd_dev->lock);
	if (removing)
		return -ENOENT;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	(void) get_device(&rbd_dev->dev);
	set_device_ro(bdev, rbd_dev->mapping.read_only);
	mutex_unlock(&ctl_mutex);

	return 0;
}

static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	unsigned long open_count_before;

	spin_lock_irq(&rbd_dev->lock);
	open_count_before = rbd_dev->open_count--;
	spin_unlock_irq(&rbd_dev->lock);
	rbd_assert(open_count_before > 0);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	put_device(&rbd_dev->dev);
	mutex_unlock(&ctl_mutex);

	return 0;
}

static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};

/*
 * Initialize an rbd client instance.
 * We own *ceph_opts.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("%s:\n", __func__);
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
	if (IS_ERR(rbdc->client))
		goto out_mutex;
	ceph_opts = NULL;	/* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_err;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	mutex_unlock(&ctl_mutex);
	dout("%s: rbdc %p\n", __func__, rbdc);

	return rbdc;

out_err:
	ceph_destroy_client(rbdc->client);
out_mutex:
	mutex_unlock(&ctl_mutex);
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	dout("%s: error %d\n", __func__, ret);

	return ERR_PTR(ret);
}

/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			kref_get(&client_node->kref);
			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}

/*
 * mount options
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};

struct rbd_options {
	bool	read_only;
};

#define RBD_READ_ONLY_DEFAULT	false

static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		rbd_assert(false);
		break;
	}
	return 0;
}
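
/*
 * This callback is handed to ceph_parse_options() as its extra-token
 * hook, so any option libceph itself does not recognize lands here.
 * Roughly (a sketch of the call made during image mapping, later in
 * this file):
 *
 *	ceph_opts = ceph_parse_options(options, mon_addrs,
 *				mon_addrs + mon_addrs_size - 1,
 *				parse_rbd_opts_token, rbd_opts);
 */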

/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc)	/* using an existing client */
		ceph_destroy_options(ceph_opts);
	else
		rbdc = rbd_client_create(ceph_opts);

	return rbdc;
}

/*
 * Destroy ceph client
 *
 * Takes rbd_client_list_lock itself to remove the client from the
 * client list, so callers must not already hold it.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}

/*
 * Drop reference to ceph client node.  If it's not referenced anymore,
 * release it.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}

static bool rbd_image_format_valid(u32 image_format)
{
	return image_format == 1 || image_format == 2;
}

static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}

/*
 * Create a new header structure, translate header format from the on-disk
 * header.
 */
static int rbd_header_from_disk(struct rbd_image_header *header,
				 struct rbd_image_header_ondisk *ondisk)
{
	u32 snap_count;
	size_t len;
	size_t size;
	u32 i;

	memset(header, 0, sizeof (*header));

	snap_count = le32_to_cpu(ondisk->snap_count);

	len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
	header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
	if (!header->object_prefix)
		return -ENOMEM;
	memcpy(header->object_prefix, ondisk->object_prefix, len);
	header->object_prefix[len] = '\0';

	if (snap_count) {
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* Save a copy of the snapshot names */

		if (snap_names_len > (u64) SIZE_MAX)
			return -EIO;
		header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!header->snap_names)
			goto out_err;
		/*
		 * Note that rbd_dev_v1_header_read() guarantees
		 * the ondisk buffer we're working with has
		 * snap_names_len bytes beyond the end of the
		 * snapshot id array, so this memcpy() is safe.
		 */
		memcpy(header->snap_names, &ondisk->snaps[snap_count],
			snap_names_len);

		/* Record each snapshot's size */

		size = snap_count * sizeof (*header->snap_sizes);
		header->snap_sizes = kmalloc(size, GFP_KERNEL);
		if (!header->snap_sizes)
			goto out_err;
		for (i = 0; i < snap_count; i++)
			header->snap_sizes[i] =
				le64_to_cpu(ondisk->snaps[i].image_size);
	} else {
		WARN_ON(ondisk->snap_names_len);
		header->snap_names = NULL;
		header->snap_sizes = NULL;
	}

	header->features = 0;	/* No features support in v1 images */
	header->obj_order = ondisk->options.order;
	header->crypt_type = ondisk->options.crypt_type;
	header->comp_type = ondisk->options.comp_type;

	/* Allocate and fill in the snapshot context */

	header->image_size = le64_to_cpu(ondisk->image_size);
	size = sizeof (struct ceph_snap_context);
	size += snap_count * sizeof (header->snapc->snaps[0]);
	header->snapc = kzalloc(size, GFP_KERNEL);
	if (!header->snapc)
		goto out_err;

	atomic_set(&header->snapc->nref, 1);
	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
	header->snapc->num_snaps = snap_count;
	for (i = 0; i < snap_count; i++)
		header->snapc->snaps[i] =
			le64_to_cpu(ondisk->snaps[i].id);

	return 0;

out_err:
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	kfree(header->object_prefix);
	header->object_prefix = NULL;

	return -ENOMEM;
}

static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
{
	struct rbd_snap *snap;

	if (snap_id == CEPH_NOSNAP)
		return RBD_SNAP_HEAD_NAME;

	list_for_each_entry(snap, &rbd_dev->snaps, node)
		if (snap_id == snap->id)
			return snap->name;

	return NULL;
}

static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
{
	struct rbd_snap *snap;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		if (!strcmp(snap_name, snap->name)) {
			rbd_dev->spec->snap_id = snap->id;
			rbd_dev->mapping.size = snap->size;
			rbd_dev->mapping.features = snap->features;

			return 0;
		}
	}

	return -ENOENT;
}

static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
{
	int ret;

	if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		rbd_dev->spec->snap_id = CEPH_NOSNAP;
		rbd_dev->mapping.size = rbd_dev->header.image_size;
		rbd_dev->mapping.features = rbd_dev->header.features;
		ret = 0;
	} else {
		ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
		if (ret < 0)
			goto done;
		rbd_dev->mapping.read_only = true;
	}
	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);

done:
	return ret;
}

static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	header->object_prefix = NULL;
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	ceph_put_snap_context(header->snapc);
	header->snapc = NULL;
}

static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
{
	char *name;
	u64 segment;
	int ret;

	name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
	if (!name)
		return NULL;
	segment = offset >> rbd_dev->header.obj_order;
	ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
			rbd_dev->header.object_prefix, segment);
	if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
		pr_err("error formatting segment name for #%llu (%d)\n",
			segment, ret);
		kfree(name);
		name = NULL;
	}

	return name;
}

static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
{
	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;

	return offset & (segment_size - 1);
}

static u64 rbd_segment_length(struct rbd_device *rbd_dev,
				u64 offset, u64 length)
{
	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;

	offset &= segment_size - 1;

	rbd_assert(length <= U64_MAX - offset);
	if (offset + length > segment_size)
		length = segment_size - offset;

	return length;
}

/*
 * returns the size of an object in the image
 */
static u64 rbd_obj_bytes(struct rbd_image_header *header)
{
	return 1 << header->obj_order;
}
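
/*
 * A worked example of the segment helpers above (illustrative
 * values): with obj_order 22 (4MB objects) and object prefix
 * "rb.0.1234", image byte offset 0x1000000 (16MB) yields segment
 * 16MB >> 22 = 4, object name "rb.0.1234.000000000004", and an
 * in-object offset of 0.
 */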

/*
 * bio helpers
 */

static void bio_chain_put(struct bio *chain)
{
	struct bio *tmp;

	while (chain) {
		tmp = chain;
		chain = chain->bi_next;
		bio_put(tmp);
	}
}

/*
 * zeros a bio chain, starting at specific offset
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}

/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;
	unsigned short end_idx;
	unsigned short vcnt;
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	resid = offset;
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	voff = resid;

	/* ...and the last affected segment */

	resid += len;
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	vcnt = end_idx - idx + 1;

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
			vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		bio->bi_io_vec[0].bv_len -= voff;
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}

/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains the first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					unsigned int *offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bi = *bio_src;
	unsigned int off = *offset;
	struct bio *chain = NULL;
	struct bio **end;

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi) {
			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
			goto out_err;	/* EINVAL; ran out of bio's */
		}
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		*end = bio;
		end = &bio->bi_next;

		off += bi_size;
		if (off == bi->bi_size) {
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	bio_chain_put(chain);

	return NULL;
}
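
/*
 * Usage sketch (illustrative): consuming a request's bio chain one
 * object-sized piece at a time, the way rbd_img_request_fill_bio()
 * below does:
 *
 *	struct bio *bio_list = rq->bio;
 *	unsigned int bio_offset = 0;
 *
 *	chain = bio_chain_clone_range(&bio_list, &bio_offset,
 *					clone_size, GFP_ATOMIC);
 *
 * After each call, bio_list and bio_offset identify the first byte
 * not yet cloned, ready for the next iteration.
 */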

static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
		atomic_read(&obj_request->kref.refcount));
	kref_get(&obj_request->kref);
}

static void rbd_obj_request_destroy(struct kref *kref);
static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request != NULL);
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
		atomic_read(&obj_request->kref.refcount));
	kref_put(&obj_request->kref, rbd_obj_request_destroy);
}

static void rbd_img_request_get(struct rbd_img_request *img_request)
{
	dout("%s: img %p (was %d)\n", __func__, img_request,
		atomic_read(&img_request->kref.refcount));
	kref_get(&img_request->kref);
}

static void rbd_img_request_destroy(struct kref *kref);
static void rbd_img_request_put(struct rbd_img_request *img_request)
{
	rbd_assert(img_request != NULL);
	dout("%s: img %p (was %d)\n", __func__, img_request,
		atomic_read(&img_request->kref.refcount));
	kref_put(&img_request->kref, rbd_img_request_destroy);
}

static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->img_request == NULL);

	rbd_obj_request_get(obj_request);
	obj_request->img_request = img_request;
	obj_request->which = img_request->obj_request_count;
	rbd_assert(obj_request->which != BAD_WHICH);
	img_request->obj_request_count++;
	list_add_tail(&obj_request->links, &img_request->obj_requests);
	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
		obj_request->which);
}

static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->which != BAD_WHICH);

	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
		obj_request->which);
	list_del(&obj_request->links);
	rbd_assert(img_request->obj_request_count > 0);
	img_request->obj_request_count--;
	rbd_assert(obj_request->which == img_request->obj_request_count);
	obj_request->which = BAD_WHICH;
	rbd_assert(obj_request->img_request == img_request);
	obj_request->img_request = NULL;
	obj_request->callback = NULL;
	rbd_obj_request_put(obj_request);
}

static bool obj_request_type_valid(enum obj_request_type type)
{
	switch (type) {
	case OBJ_REQUEST_NODATA:
	case OBJ_REQUEST_BIO:
	case OBJ_REQUEST_PAGES:
		return true;
	default:
		return false;
	}
}

static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
				struct rbd_obj_request *obj_request)
{
	dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);

	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
}

static void rbd_img_request_complete(struct rbd_img_request *img_request)
{
	dout("%s: img %p\n", __func__, img_request);

	/*
	 * If no error occurred, compute the aggregate transfer
	 * count for the image request.  We could instead use
	 * atomic64_cmpxchg() to update it as each object request
	 * completes; not clear which way is better offhand.
	 */
	if (!img_request->result) {
		struct rbd_obj_request *obj_request;
		u64 xferred = 0;

		for_each_obj_request(img_request, obj_request)
			xferred += obj_request->xferred;
		img_request->xferred = xferred;
	}

	if (img_request->callback)
		img_request->callback(img_request);
	else
		rbd_img_request_put(img_request);
}

/* Caller is responsible for rbd_obj_request_destroy(obj_request) */

static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	return wait_for_completion_interruptible(&obj_request->completion);
}

static void obj_request_done_init(struct rbd_obj_request *obj_request)
{
	atomic_set(&obj_request->done, 0);
	smp_wmb();
}

static void obj_request_done_set(struct rbd_obj_request *obj_request)
{
	int done;

	done = atomic_inc_return(&obj_request->done);
	if (done > 1) {
		struct rbd_img_request *img_request = obj_request->img_request;
		struct rbd_device *rbd_dev;

		rbd_dev = img_request ? img_request->rbd_dev : NULL;
		rbd_warn(rbd_dev, "obj_request %p was already done\n",
			obj_request);
	}
}

static bool obj_request_done_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return atomic_read(&obj_request->done) != 0;
}

/*
 * The default/initial value for all image request flags is 0.  Each
 * is conditionally set to 1 at image request initialization time
 * and currently never changes thereafter.
 */
static void img_request_write_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_WRITE, &img_request->flags);
	smp_mb();
}

static bool img_request_write_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
}

static void img_request_child_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_CHILD, &img_request->flags);
	smp_mb();
}

static bool img_request_child_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
}

static void img_request_layered_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_LAYERED, &img_request->flags);
	smp_mb();
}

static bool img_request_layered_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
}
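
/*
 * The helpers above pair set_bit()/test_bit() with smp_mb() so that
 * a flag set during image request initialization is visible to any
 * context that later tests it, without taking a lock.  A caller only
 * ever needs the boolean result, e.g.:
 *
 *	if (img_request_write_test(img_request))
 *		snapc = img_request->snapc;
 */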

static void
rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
		obj_request, obj_request->img_request, obj_request->result,
		obj_request->xferred, obj_request->length);
	/*
	 * ENOENT means a hole in the image.  We zero-fill the
	 * entire length of the request.  A short read also implies
	 * zero-fill to the end of the request.  Either way we
	 * update the xferred count to indicate the whole request
	 * was satisfied.
	 */
	BUG_ON(obj_request->type != OBJ_REQUEST_BIO);
	if (obj_request->result == -ENOENT) {
		zero_bio_chain(obj_request->bio_list, 0);
		obj_request->result = 0;
		obj_request->xferred = obj_request->length;
	} else if (obj_request->xferred < obj_request->length &&
			!obj_request->result) {
		zero_bio_chain(obj_request->bio_list, obj_request->xferred);
		obj_request->xferred = obj_request->length;
	}
	obj_request_done_set(obj_request);
}

static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p cb %p\n", __func__, obj_request,
		obj_request->callback);
	if (obj_request->callback)
		obj_request->callback(obj_request);
	else
		complete_all(&obj_request->completion);
}

static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);
	obj_request_done_set(obj_request);
}

static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request,
		obj_request->result, obj_request->xferred, obj_request->length);
	if (obj_request->img_request)
		rbd_img_obj_request_read_callback(obj_request);
	else
		obj_request_done_set(obj_request);
}

static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
		obj_request->result, obj_request->length);
	/*
	 * There is no such thing as a successful short write.
	 * Our xferred value is the number of bytes transferred
	 * back.  Set it to our originally-requested length.
	 */
	obj_request->xferred = obj_request->length;
	obj_request_done_set(obj_request);
}

/*
 * For a simple stat call there's nothing to do.  We'll do more if
 * this is part of a write sequence for a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);
	obj_request_done_set(obj_request);
}

static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
				struct ceph_msg *msg)
{
	struct rbd_obj_request *obj_request = osd_req->r_priv;
	u16 opcode;

	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
	rbd_assert(osd_req == obj_request->osd_req);
	rbd_assert(!!obj_request->img_request ^
				(obj_request->which == BAD_WHICH));

	if (osd_req->r_result < 0)
		obj_request->result = osd_req->r_result;
	obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);

	WARN_ON(osd_req->r_num_ops != 1);	/* For now */

	/*
	 * We support a 64-bit length, but ultimately it has to be
	 * passed to blk_end_request(), which takes an unsigned int.
	 */
	obj_request->xferred = osd_req->r_reply_op_len[0];
	rbd_assert(obj_request->xferred < (u64) UINT_MAX);
	opcode = osd_req->r_ops[0].op;
	switch (opcode) {
	case CEPH_OSD_OP_READ:
		rbd_osd_read_callback(obj_request);
		break;
	case CEPH_OSD_OP_WRITE:
		rbd_osd_write_callback(obj_request);
		break;
	case CEPH_OSD_OP_STAT:
		rbd_osd_stat_callback(obj_request);
		break;
	case CEPH_OSD_OP_CALL:
	case CEPH_OSD_OP_NOTIFY_ACK:
	case CEPH_OSD_OP_WATCH:
		rbd_osd_trivial_callback(obj_request);
		break;
	default:
		rbd_warn(NULL, "%s: unsupported op %hu\n",
			obj_request->object_name, (unsigned short) opcode);
		break;
	}

	if (obj_request_done_test(obj_request))
		rbd_obj_request_complete(obj_request);
}

static void rbd_osd_req_format(struct rbd_obj_request *obj_request,
					bool write_request)
{
	struct rbd_img_request *img_request = obj_request->img_request;
	struct ceph_osd_request *osd_req = obj_request->osd_req;
	struct ceph_snap_context *snapc = NULL;
	u64 snap_id = CEPH_NOSNAP;
	struct timespec *mtime = NULL;
	struct timespec now;

	rbd_assert(osd_req != NULL);

	if (write_request) {
		now = CURRENT_TIME;
		mtime = &now;
		if (img_request)
			snapc = img_request->snapc;
	} else if (img_request) {
		snap_id = img_request->snap_id;
	}
	ceph_osdc_build_request(osd_req, obj_request->offset,
			snapc, snap_id, mtime);
}

static struct ceph_osd_request *rbd_osd_req_create(
					struct rbd_device *rbd_dev,
					bool write_request,
					struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request = obj_request->img_request;
	struct ceph_snap_context *snapc = NULL;
	struct ceph_osd_client *osdc;
	struct ceph_osd_request *osd_req;

	if (img_request) {
		rbd_assert(write_request ==
				img_request_write_test(img_request));
		if (write_request)
			snapc = img_request->snapc;
	}

	/* Allocate and initialize the request, for the single op */

	osdc = &rbd_dev->rbd_client->client->osdc;
	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
	if (!osd_req)
		return NULL;	/* ENOMEM */

	if (write_request)
		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
	else
		osd_req->r_flags = CEPH_OSD_FLAG_READ;

	osd_req->r_callback = rbd_osd_req_callback;
	osd_req->r_priv = obj_request;

	osd_req->r_oid_len = strlen(obj_request->object_name);
	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);

	osd_req->r_file_layout = rbd_dev->layout;	/* struct */

	return osd_req;
}

static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}

/* object_name is assumed to be a non-null pointer and NUL-terminated */

static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
						u64 offset, u64 length,
						enum obj_request_type type)
{
	struct rbd_obj_request *obj_request;
	size_t size;
	char *name;

	rbd_assert(obj_request_type_valid(type));

	size = strlen(object_name) + 1;
	obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
	if (!obj_request)
		return NULL;

	name = (char *)(obj_request + 1);
	obj_request->object_name = memcpy(name, object_name, size);
	obj_request->offset = offset;
	obj_request->length = length;
	obj_request->which = BAD_WHICH;
	obj_request->type = type;
	INIT_LIST_HEAD(&obj_request->links);
	obj_request_done_init(obj_request);
	init_completion(&obj_request->completion);
	kref_init(&obj_request->kref);

	dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
		offset, length, (int)type, obj_request);

	return obj_request;
}

static void rbd_obj_request_destroy(struct kref *kref)
{
	struct rbd_obj_request *obj_request;

	obj_request = container_of(kref, struct rbd_obj_request, kref);

	dout("%s: obj %p\n", __func__, obj_request);

	rbd_assert(obj_request->img_request == NULL);
	rbd_assert(obj_request->which == BAD_WHICH);

	if (obj_request->osd_req)
		rbd_osd_req_destroy(obj_request->osd_req);

	rbd_assert(obj_request_type_valid(obj_request->type));
	switch (obj_request->type) {
	case OBJ_REQUEST_NODATA:
		break;		/* Nothing to do */
	case OBJ_REQUEST_BIO:
		if (obj_request->bio_list)
			bio_chain_put(obj_request->bio_list);
		break;
	case OBJ_REQUEST_PAGES:
		if (obj_request->pages)
			ceph_release_page_vector(obj_request->pages,
						obj_request->page_count);
		break;
	}

	kfree(obj_request);
}
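
/*
 * Putting these pieces together, a standalone object request follows
 * this lifecycle (a sketch of the pattern used by the synchronous
 * helpers elsewhere in this file):
 *
 *	obj_request = rbd_obj_request_create(name, offset, length, type);
 *	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
 *						obj_request);
 *	ret = rbd_obj_request_submit(osdc, obj_request);
 *	if (!ret)
 *		ret = rbd_obj_request_wait(obj_request);
 *	rbd_obj_request_put(obj_request);
 */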

/*
 * Caller is responsible for filling in the list of object requests
 * that comprises the image request, and the Linux request pointer
 * (if there is one).
 */
static struct rbd_img_request *rbd_img_request_create(
					struct rbd_device *rbd_dev,
					u64 offset, u64 length,
					bool write_request,
					bool child_request)
{
	struct rbd_img_request *img_request;
	struct ceph_snap_context *snapc = NULL;

	img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
	if (!img_request)
		return NULL;

	if (write_request) {
		down_read(&rbd_dev->header_rwsem);
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);
		up_read(&rbd_dev->header_rwsem);
		if (WARN_ON(!snapc)) {
			kfree(img_request);
			return NULL;	/* Shouldn't happen */
		}
	}

	img_request->rq = NULL;
	img_request->rbd_dev = rbd_dev;
	img_request->offset = offset;
	img_request->length = length;
	img_request->flags = 0;
	if (write_request) {
		img_request_write_set(img_request);
		img_request->snapc = snapc;
	} else {
		img_request->snap_id = rbd_dev->spec->snap_id;
	}
	if (child_request)
		img_request_child_set(img_request);
	if (rbd_dev->parent_spec)
		img_request_layered_set(img_request);
	spin_lock_init(&img_request->completion_lock);
	img_request->next_completion = 0;
	img_request->callback = NULL;
	img_request->result = 0;
	img_request->obj_request_count = 0;
	INIT_LIST_HEAD(&img_request->obj_requests);
	kref_init(&img_request->kref);

	(void) img_request_layered_test(img_request);	/* Avoid a warning */
	rbd_img_request_get(img_request);	/* Avoid a warning */
	rbd_img_request_put(img_request);	/* TEMPORARY */

	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
		write_request ? "write" : "read", offset, length,
		img_request);

	return img_request;
}

static void rbd_img_request_destroy(struct kref *kref)
{
	struct rbd_img_request *img_request;
	struct rbd_obj_request *obj_request;
	struct rbd_obj_request *next_obj_request;

	img_request = container_of(kref, struct rbd_img_request, kref);

	dout("%s: img %p\n", __func__, img_request);

	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_img_obj_request_del(img_request, obj_request);
	rbd_assert(img_request->obj_request_count == 0);

	if (img_request_write_test(img_request))
		ceph_put_snap_context(img_request->snapc);

	kfree(img_request);
}

static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	u32 which = obj_request->which;
	bool more = true;

	img_request = obj_request->img_request;

	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
	rbd_assert(img_request != NULL);
	rbd_assert(!img_request_child_test(img_request));
	rbd_assert(img_request->rq != NULL);

	rbd_assert(img_request->obj_request_count > 0);
	rbd_assert(which != BAD_WHICH);
	rbd_assert(which < img_request->obj_request_count);
	rbd_assert(which >= img_request->next_completion);

	spin_lock_irq(&img_request->completion_lock);
	if (which != img_request->next_completion)
		goto out;

	for_each_obj_request_from(img_request, obj_request) {
		unsigned int xferred;
		int result;

		rbd_assert(more);
		rbd_assert(which < img_request->obj_request_count);

		if (!obj_request_done_test(obj_request))
			break;

		rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
		xferred = (unsigned int)obj_request->xferred;
		result = obj_request->result;
		if (result) {
			struct rbd_device *rbd_dev = img_request->rbd_dev;

			rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
				img_request_write_test(img_request) ? "write"
								: "read",
				obj_request->length, obj_request->img_offset,
				obj_request->offset);
			rbd_warn(rbd_dev, "  result %d xferred %x\n",
				result, xferred);
			if (!img_request->result)
				img_request->result = result;
		}

		more = blk_end_request(img_request->rq, result, xferred);
		which++;
	}

	rbd_assert(more ^ (which == img_request->obj_request_count));
	img_request->next_completion = which;
out:
	spin_unlock_irq(&img_request->completion_lock);

	if (!more)
		rbd_img_request_complete(img_request);
}

static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
					struct bio *bio_list)
{
	struct rbd_device *rbd_dev = img_request->rbd_dev;
	struct rbd_obj_request *obj_request = NULL;
	struct rbd_obj_request *next_obj_request;
	bool write_request = img_request_write_test(img_request);
	unsigned int bio_offset;
	u64 img_offset;
	u64 resid;
	u16 opcode;

	dout("%s: img %p bio %p\n", __func__, img_request, bio_list);

	opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
	bio_offset = 0;
	img_offset = img_request->offset;
	rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
	resid = img_request->length;
	rbd_assert(resid > 0);
	while (resid) {
		struct ceph_osd_request *osd_req;
		const char *object_name;
		unsigned int clone_size;
		u64 offset;
		u64 length;

		object_name = rbd_segment_name(rbd_dev, img_offset);
		if (!object_name)
			goto out_unwind;
		offset = rbd_segment_offset(rbd_dev, img_offset);
		length = rbd_segment_length(rbd_dev, img_offset, resid);
		obj_request = rbd_obj_request_create(object_name,
						offset, length,
						OBJ_REQUEST_BIO);
		kfree(object_name);	/* object request has its own copy */
		if (!obj_request)
			goto out_unwind;

		rbd_assert(length <= (u64) UINT_MAX);
		clone_size = (unsigned int) length;
		obj_request->bio_list = bio_chain_clone_range(&bio_list,
						&bio_offset, clone_size,
						GFP_ATOMIC);
		if (!obj_request->bio_list)
			goto out_partial;

		osd_req = rbd_osd_req_create(rbd_dev, write_request,
						obj_request);
		if (!osd_req)
			goto out_partial;
		obj_request->osd_req = osd_req;
		obj_request->callback = rbd_img_obj_callback;

		osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
						0, 0);
		osd_req_op_extent_osd_data_bio(osd_req, 0, write_request,
				obj_request->bio_list, obj_request->length);
		rbd_osd_req_format(obj_request, write_request);

		obj_request->img_offset = img_offset;
		rbd_img_obj_request_add(img_request, obj_request);

		img_offset += length;
		resid -= length;
	}

	return 0;

out_partial:
	rbd_obj_request_put(obj_request);
out_unwind:
	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_obj_request_put(obj_request);

	return -ENOMEM;
}

static int rbd_img_request_submit(struct rbd_img_request *img_request)
{
	struct rbd_device *rbd_dev = img_request->rbd_dev;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;
	struct rbd_obj_request *next_obj_request;

	dout("%s: img %p\n", __func__, img_request);
	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
		int ret;

		ret = rbd_obj_request_submit(osdc, obj_request);
		if (ret)
			return ret;
		/*
		 * The image request has its own reference to each
		 * of its object requests, so we can safely drop the
		 * initial one here.
		 */
		rbd_obj_request_put(obj_request);
	}

	return 0;
}
1770
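/*
 * Example (sketch only, not compiled): the usual lifecycle of an image
 * request as driven from rbd_request_fn() below: create, fill with the
 * request's bio chain, submit, and drop the creator's reference on
 * error.  All functions referenced are the ones defined in this file.
 */
#if 0
static int rbd_example_do_request(struct rbd_device *rbd_dev,
				struct request *rq,
				u64 offset, u64 length, bool write)
{
	struct rbd_img_request *img_request;
	int result;

	img_request = rbd_img_request_create(rbd_dev, offset, length,
						write, false);
	if (!img_request)
		return -ENOMEM;
	img_request->rq = rq;

	result = rbd_img_request_fill_bio(img_request, rq->bio);
	if (!result)
		result = rbd_img_request_submit(img_request);
	if (result)
		rbd_img_request_put(img_request);

	return result;
}
#endif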
cf81b60e 1771static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
b8d70035
AE
1772 u64 ver, u64 notify_id)
1773{
1774 struct rbd_obj_request *obj_request;
2169238d 1775 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
b8d70035
AE
1776 int ret;
1777
1778 obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
1779 OBJ_REQUEST_NODATA);
1780 if (!obj_request)
1781 return -ENOMEM;
1782
1783 ret = -ENOMEM;
430c28c3 1784 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
b8d70035
AE
1785 if (!obj_request->osd_req)
1786 goto out;
2169238d 1787 obj_request->callback = rbd_obj_request_put;
b8d70035 1788
c99d2d4a
AE
1789 osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
1790 notify_id, ver, 0);
2fa12320 1791 rbd_osd_req_format(obj_request, false);
430c28c3 1792
b8d70035 1793 ret = rbd_obj_request_submit(osdc, obj_request);
b8d70035 1794out:
cf81b60e
AE
1795 if (ret)
1796 rbd_obj_request_put(obj_request);
b8d70035
AE
1797
1798 return ret;
1799}
1800
1801static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1802{
1803 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1804 u64 hver;
1805 int rc;
1806
1807 if (!rbd_dev)
1808 return;
1809
37206ee5 1810 dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
b8d70035
AE
1811 rbd_dev->header_name, (unsigned long long) notify_id,
1812 (unsigned int) opcode);
1813 rc = rbd_dev_refresh(rbd_dev, &hver);
1814 if (rc)
 1815 rbd_warn(rbd_dev, "got notification but failed to "
 1816 "update snaps: %d\n", rc);
1817
cf81b60e 1818 rbd_obj_notify_ack(rbd_dev, hver, notify_id);
b8d70035
AE
1819}
1820
9969ebc5
AE
1821/*
1822 * Request sync osd watch/unwatch. The value of "start" determines
1823 * whether a watch request is being initiated or torn down.
1824 */
1825static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
1826{
1827 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1828 struct rbd_obj_request *obj_request;
9969ebc5
AE
1829 int ret;
1830
1831 rbd_assert(start ^ !!rbd_dev->watch_event);
1832 rbd_assert(start ^ !!rbd_dev->watch_request);
1833
1834 if (start) {
3c663bbd 1835 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
9969ebc5
AE
1836 &rbd_dev->watch_event);
1837 if (ret < 0)
1838 return ret;
8eb87565 1839 rbd_assert(rbd_dev->watch_event != NULL);
9969ebc5
AE
1840 }
1841
1842 ret = -ENOMEM;
1843 obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
1844 OBJ_REQUEST_NODATA);
1845 if (!obj_request)
1846 goto out_cancel;
1847
430c28c3
AE
1848 obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
1849 if (!obj_request->osd_req)
1850 goto out_cancel;
1851
8eb87565 1852 if (start)
975241af 1853 ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
8eb87565 1854 else
6977c3f9 1855 ceph_osdc_unregister_linger_request(osdc,
975241af 1856 rbd_dev->watch_request->osd_req);
2169238d
AE
1857
1858 osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
1859 rbd_dev->watch_event->cookie,
1860 rbd_dev->header.obj_version, start);
1861 rbd_osd_req_format(obj_request, true);
1862
9969ebc5
AE
1863 ret = rbd_obj_request_submit(osdc, obj_request);
1864 if (ret)
1865 goto out_cancel;
1866 ret = rbd_obj_request_wait(obj_request);
1867 if (ret)
1868 goto out_cancel;
9969ebc5
AE
1869 ret = obj_request->result;
1870 if (ret)
1871 goto out_cancel;
1872
8eb87565
AE
1873 /*
1874 * A watch request is set to linger, so the underlying osd
1875 * request won't go away until we unregister it. We retain
1876 * a pointer to the object request during that time (in
1877 * rbd_dev->watch_request), so we'll keep a reference to
1878 * it. We'll drop that reference (below) after we've
1879 * unregistered it.
1880 */
1881 if (start) {
1882 rbd_dev->watch_request = obj_request;
1883
1884 return 0;
1885 }
1886
1887 /* We have successfully torn down the watch request */
1888
1889 rbd_obj_request_put(rbd_dev->watch_request);
1890 rbd_dev->watch_request = NULL;
9969ebc5
AE
1891out_cancel:
1892 /* Cancel the event if we're tearing down, or on error */
1893 ceph_osdc_cancel_event(rbd_dev->watch_event);
1894 rbd_dev->watch_event = NULL;
9969ebc5
AE
1895 if (obj_request)
1896 rbd_obj_request_put(obj_request);
1897
1898 return ret;
1899}
1900
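/*
 * Example (illustrative): how callers pair watch setup and teardown
 * around the function above.  The "start" argument selects between
 * registering the lingering watch and unregistering it.
 */
#if 0
static int rbd_example_watch_cycle(struct rbd_device *rbd_dev)
{
	int ret;

	ret = rbd_dev_header_watch_sync(rbd_dev, 1);	/* start watching */
	if (ret)
		return ret;
	/* header change notifications now arrive via rbd_watch_cb() */
	return rbd_dev_header_watch_sync(rbd_dev, 0);	/* tear down */
}
#endif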
36be9a76
AE
1901/*
1902 * Synchronous osd object method call
1903 */
1904static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
1905 const char *object_name,
1906 const char *class_name,
1907 const char *method_name,
1908 const char *outbound,
1909 size_t outbound_size,
1910 char *inbound,
1911 size_t inbound_size,
1912 u64 *version)
1913{
2169238d 1914 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
36be9a76 1915 struct rbd_obj_request *obj_request;
36be9a76
AE
1916 struct page **pages;
1917 u32 page_count;
1918 int ret;
1919
1920 /*
6010a451
AE
1921 * Method calls are ultimately read operations. The result
 1922 * should be placed into the inbound buffer provided. They
1923 * also supply outbound data--parameters for the object
1924 * method. Currently if this is present it will be a
1925 * snapshot id.
36be9a76
AE
1926 */
1927 page_count = (u32) calc_pages_for(0, inbound_size);
1928 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
1929 if (IS_ERR(pages))
1930 return PTR_ERR(pages);
1931
1932 ret = -ENOMEM;
6010a451 1933 obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
36be9a76
AE
1934 OBJ_REQUEST_PAGES);
1935 if (!obj_request)
1936 goto out;
1937
1938 obj_request->pages = pages;
1939 obj_request->page_count = page_count;
1940
430c28c3 1941 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
36be9a76
AE
1942 if (!obj_request->osd_req)
1943 goto out;
1944
c99d2d4a 1945 osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
04017e29
AE
1946 class_name, method_name);
1947 if (outbound_size) {
1948 struct ceph_pagelist *pagelist;
1949
1950 pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
1951 if (!pagelist)
1952 goto out;
1953
1954 ceph_pagelist_init(pagelist);
1955 ceph_pagelist_append(pagelist, outbound, outbound_size);
1956 osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
1957 pagelist);
1958 }
a4ce40a9
AE
1959 osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
1960 obj_request->pages, inbound_size,
44cd188d 1961 0, false, false);
2fa12320 1962 rbd_osd_req_format(obj_request, false);
430c28c3 1963
36be9a76
AE
1964 ret = rbd_obj_request_submit(osdc, obj_request);
1965 if (ret)
1966 goto out;
1967 ret = rbd_obj_request_wait(obj_request);
1968 if (ret)
1969 goto out;
1970
1971 ret = obj_request->result;
1972 if (ret < 0)
1973 goto out;
23ed6e13 1974 ret = 0;
903bb32e 1975 ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
36be9a76
AE
1976 if (version)
1977 *version = obj_request->version;
1978out:
1979 if (obj_request)
1980 rbd_obj_request_put(obj_request);
1981 else
1982 ceph_release_page_vector(pages, page_count);
1983
1984 return ret;
1985}
1986
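/*
 * Example (sketch): a typical rbd_obj_method_sync() user.  This
 * mirrors the "get_size" class method call made for format 2 images
 * further down: the encoded snapshot id is the outbound parameter
 * and the packed reply is returned in the inbound buffer.
 */
#if 0
static int rbd_example_get_size(struct rbd_device *rbd_dev, u64 snap_id)
{
	__le64 snapid = cpu_to_le64(snap_id);
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				(char *) &snapid, sizeof (snapid),
				(char *) &size_buf, sizeof (size_buf),
				NULL);
	if (ret < 0)
		return ret;

	dout("order %u size %llu\n", size_buf.order,
		(unsigned long long) le64_to_cpu(size_buf.size));

	return 0;
}
#endif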
bf0d5f50 1987static void rbd_request_fn(struct request_queue *q)
cc344fa1 1988 __releases(q->queue_lock) __acquires(q->queue_lock)
bf0d5f50
AE
1989{
1990 struct rbd_device *rbd_dev = q->queuedata;
1991 bool read_only = rbd_dev->mapping.read_only;
1992 struct request *rq;
1993 int result;
1994
1995 while ((rq = blk_fetch_request(q))) {
1996 bool write_request = rq_data_dir(rq) == WRITE;
1997 struct rbd_img_request *img_request;
1998 u64 offset;
1999 u64 length;
2000
2001 /* Ignore any non-FS requests that filter through. */
2002
2003 if (rq->cmd_type != REQ_TYPE_FS) {
4dda41d3
AE
2004 dout("%s: non-fs request type %d\n", __func__,
2005 (int) rq->cmd_type);
2006 __blk_end_request_all(rq, 0);
2007 continue;
2008 }
2009
2010 /* Ignore/skip any zero-length requests */
2011
2012 offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
2013 length = (u64) blk_rq_bytes(rq);
2014
2015 if (!length) {
2016 dout("%s: zero-length request\n", __func__);
bf0d5f50
AE
2017 __blk_end_request_all(rq, 0);
2018 continue;
2019 }
2020
2021 spin_unlock_irq(q->queue_lock);
2022
2023 /* Disallow writes to a read-only device */
2024
2025 if (write_request) {
2026 result = -EROFS;
2027 if (read_only)
2028 goto end_request;
2029 rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
2030 }
2031
6d292906
AE
2032 /*
2033 * Quit early if the mapped snapshot no longer
2034 * exists. It's still possible the snapshot will
2035 * have disappeared by the time our request arrives
2036 * at the osd, but there's no sense in sending it if
2037 * we already know.
2038 */
2039 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
bf0d5f50
AE
2040 dout("request for non-existent snapshot");
2041 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
2042 result = -ENXIO;
2043 goto end_request;
2044 }
2045
bf0d5f50
AE
2046 result = -EINVAL;
2047 if (WARN_ON(offset && length > U64_MAX - offset + 1))
2048 goto end_request; /* Shouldn't happen */
2049
2050 result = -ENOMEM;
2051 img_request = rbd_img_request_create(rbd_dev, offset, length,
9849e986 2052 write_request, false);
bf0d5f50
AE
2053 if (!img_request)
2054 goto end_request;
2055
2056 img_request->rq = rq;
2057
2058 result = rbd_img_request_fill_bio(img_request, rq->bio);
2059 if (!result)
2060 result = rbd_img_request_submit(img_request);
2061 if (result)
2062 rbd_img_request_put(img_request);
2063end_request:
2064 spin_lock_irq(q->queue_lock);
2065 if (result < 0) {
7da22d29
AE
2066 rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
2067 write_request ? "write" : "read",
2068 length, offset, result);
2069
bf0d5f50
AE
2070 __blk_end_request_all(rq, result);
2071 }
2072 }
2073}
2074
602adf40
YS
2075/*
 2076 * A queue callback. Makes sure that we don't create a bio that spans
 2077 * multiple osd objects. One exception is a single-page bio,
f7760dad 2078 * which we handle later in bio_chain_clone_range().
602adf40
YS
2079 */
2080static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
2081 struct bio_vec *bvec)
2082{
2083 struct rbd_device *rbd_dev = q->queuedata;
e5cfeed2
AE
2084 sector_t sector_offset;
2085 sector_t sectors_per_obj;
2086 sector_t obj_sector_offset;
2087 int ret;
2088
2089 /*
2090 * Find how far into its rbd object the partition-relative
2091 * bio start sector is to offset relative to the enclosing
2092 * device.
2093 */
2094 sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
2095 sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
2096 obj_sector_offset = sector_offset & (sectors_per_obj - 1);
2097
2098 /*
2099 * Compute the number of bytes from that offset to the end
2100 * of the object. Account for what's already used by the bio.
2101 */
2102 ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
2103 if (ret > bmd->bi_size)
2104 ret -= bmd->bi_size;
2105 else
2106 ret = 0;
2107
2108 /*
2109 * Don't send back more than was asked for. And if the bio
2110 * was empty, let the whole thing through because: "Note
2111 * that a block device *must* allow a single page to be
2112 * added to an empty bio."
2113 */
2114 rbd_assert(bvec->bv_len <= PAGE_SIZE);
2115 if (ret > (int) bvec->bv_len || !bmd->bi_size)
2116 ret = (int) bvec->bv_len;
2117
2118 return ret;
602adf40
YS
2119}
2120
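/*
 * Worked example for the arithmetic above, assuming the default
 * object order of 22 (4 MB objects, so 8192 sectors per object):
 * a bio starting at device sector 8000 sits 8000 sectors into its
 * object, leaving (8192 - 8000) << 9 = 98304 bytes before the object
 * boundary, and rbd_merge_bvec() will not let the bio grow past that.
 */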
2121static void rbd_free_disk(struct rbd_device *rbd_dev)
2122{
2123 struct gendisk *disk = rbd_dev->disk;
2124
2125 if (!disk)
2126 return;
2127
602adf40
YS
2128 if (disk->flags & GENHD_FL_UP)
2129 del_gendisk(disk);
2130 if (disk->queue)
2131 blk_cleanup_queue(disk->queue);
2132 put_disk(disk);
2133}
2134
788e2df3
AE
2135static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2136 const char *object_name,
2137 u64 offset, u64 length,
2138 char *buf, u64 *version)
2139
2140{
2169238d 2141 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
788e2df3 2142 struct rbd_obj_request *obj_request;
788e2df3
AE
2143 struct page **pages = NULL;
2144 u32 page_count;
1ceae7ef 2145 size_t size;
788e2df3
AE
2146 int ret;
2147
2148 page_count = (u32) calc_pages_for(offset, length);
2149 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2150 if (IS_ERR(pages))
 2151 return PTR_ERR(pages);
2152
2153 ret = -ENOMEM;
2154 obj_request = rbd_obj_request_create(object_name, offset, length,
36be9a76 2155 OBJ_REQUEST_PAGES);
788e2df3
AE
2156 if (!obj_request)
2157 goto out;
2158
2159 obj_request->pages = pages;
2160 obj_request->page_count = page_count;
2161
430c28c3 2162 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
788e2df3
AE
2163 if (!obj_request->osd_req)
2164 goto out;
2165
c99d2d4a
AE
2166 osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
2167 offset, length, 0, 0);
a4ce40a9
AE
2168 osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0, false,
2169 obj_request->pages,
44cd188d
AE
2170 obj_request->length,
2171 obj_request->offset & ~PAGE_MASK,
2172 false, false);
2fa12320 2173 rbd_osd_req_format(obj_request, false);
430c28c3 2174
788e2df3
AE
2175 ret = rbd_obj_request_submit(osdc, obj_request);
2176 if (ret)
2177 goto out;
2178 ret = rbd_obj_request_wait(obj_request);
2179 if (ret)
2180 goto out;
2181
2182 ret = obj_request->result;
2183 if (ret < 0)
2184 goto out;
1ceae7ef
AE
2185
2186 rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
2187 size = (size_t) obj_request->xferred;
903bb32e 2188 ceph_copy_from_page_vector(pages, buf, 0, size);
23ed6e13
AE
2189 rbd_assert(size <= (size_t) INT_MAX);
2190 ret = (int) size;
788e2df3
AE
2191 if (version)
2192 *version = obj_request->version;
2193out:
2194 if (obj_request)
2195 rbd_obj_request_put(obj_request);
2196 else
2197 ceph_release_page_vector(pages, page_count);
2198
2199 return ret;
2200}
2201
602adf40 2202/*
4156d998
AE
2203 * Read the complete header for the given rbd device.
2204 *
2205 * Returns a pointer to a dynamically-allocated buffer containing
2206 * the complete and validated header. Caller can pass the address
2207 * of a variable that will be filled in with the version of the
2208 * header object at the time it was read.
2209 *
2210 * Returns a pointer-coded errno if a failure occurs.
602adf40 2211 */
4156d998
AE
2212static struct rbd_image_header_ondisk *
2213rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
602adf40 2214{
4156d998 2215 struct rbd_image_header_ondisk *ondisk = NULL;
50f7c4c9 2216 u32 snap_count = 0;
4156d998
AE
2217 u64 names_size = 0;
2218 u32 want_count;
2219 int ret;
602adf40 2220
00f1f36f 2221 /*
4156d998
AE
2222 * The complete header will include an array of its 64-bit
2223 * snapshot ids, followed by the names of those snapshots as
2224 * a contiguous block of NUL-terminated strings. Note that
2225 * the number of snapshots could change by the time we read
2226 * it in, in which case we re-read it.
00f1f36f 2227 */
4156d998
AE
2228 do {
2229 size_t size;
2230
2231 kfree(ondisk);
2232
2233 size = sizeof (*ondisk);
2234 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
2235 size += names_size;
2236 ondisk = kmalloc(size, GFP_KERNEL);
2237 if (!ondisk)
2238 return ERR_PTR(-ENOMEM);
2239
788e2df3 2240 ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
4156d998
AE
2241 0, size,
2242 (char *) ondisk, version);
4156d998
AE
2243 if (ret < 0)
2244 goto out_err;
2245 if (WARN_ON((size_t) ret < size)) {
2246 ret = -ENXIO;
06ecc6cb
AE
2247 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
2248 size, ret);
4156d998
AE
2249 goto out_err;
2250 }
2251 if (!rbd_dev_ondisk_valid(ondisk)) {
2252 ret = -ENXIO;
06ecc6cb 2253 rbd_warn(rbd_dev, "invalid header");
4156d998 2254 goto out_err;
81e759fb 2255 }
602adf40 2256
4156d998
AE
2257 names_size = le64_to_cpu(ondisk->snap_names_len);
2258 want_count = snap_count;
2259 snap_count = le32_to_cpu(ondisk->snap_count);
2260 } while (snap_count != want_count);
00f1f36f 2261
4156d998 2262 return ondisk;
00f1f36f 2263
4156d998
AE
2264out_err:
2265 kfree(ondisk);
2266
2267 return ERR_PTR(ret);
2268}
2269
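/*
 * For reference (layout taken from rbd_types.h): the buffer sized in
 * the loop below holds the fixed-size ondisk header, then one 16-byte
 * rbd_image_snap_ondisk entry per snapshot, then the NUL-terminated
 * snapshot names packed back to back.  Two snapshots named "a" and
 * "b" would add 2 * 16 + 4 bytes to sizeof (*ondisk).
 */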
2270/*
 2271 * reload the on-disk header
2272 */
2273static int rbd_read_header(struct rbd_device *rbd_dev,
2274 struct rbd_image_header *header)
2275{
2276 struct rbd_image_header_ondisk *ondisk;
2277 u64 ver = 0;
2278 int ret;
602adf40 2279
4156d998
AE
2280 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
2281 if (IS_ERR(ondisk))
2282 return PTR_ERR(ondisk);
2283 ret = rbd_header_from_disk(header, ondisk);
2284 if (ret >= 0)
2285 header->obj_version = ver;
2286 kfree(ondisk);
2287
2288 return ret;
602adf40
YS
2289}
2290
41f38c2b 2291static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
dfc5606d
YS
2292{
2293 struct rbd_snap *snap;
a0593290 2294 struct rbd_snap *next;
dfc5606d 2295
a0593290 2296 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
41f38c2b 2297 rbd_remove_snap_dev(snap);
dfc5606d
YS
2298}
2299
9478554a
AE
2300static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
2301{
2302 sector_t size;
2303
0d7dbfce 2304 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
9478554a
AE
2305 return;
2306
2307 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
2308 dout("setting size to %llu sectors", (unsigned long long) size);
2309 rbd_dev->mapping.size = (u64) size;
2310 set_capacity(rbd_dev->disk, size);
2311}
2312
602adf40
YS
2313/*
2314 * only read the first part of the ondisk header, without the snaps info
2315 */
117973fb 2316static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
602adf40
YS
2317{
2318 int ret;
2319 struct rbd_image_header h;
602adf40
YS
2320
2321 ret = rbd_read_header(rbd_dev, &h);
2322 if (ret < 0)
2323 return ret;
2324
a51aa0c0
JD
2325 down_write(&rbd_dev->header_rwsem);
2326
9478554a
AE
2327 /* Update image size, and check for resize of mapped image */
2328 rbd_dev->header.image_size = h.image_size;
2329 rbd_update_mapping_size(rbd_dev);
9db4b3e3 2330
849b4260 2331 /* rbd_dev->header.object_prefix shouldn't change */
602adf40 2332 kfree(rbd_dev->header.snap_sizes);
849b4260 2333 kfree(rbd_dev->header.snap_names);
d1d25646
JD
2334 /* osd requests may still refer to snapc */
2335 ceph_put_snap_context(rbd_dev->header.snapc);
602adf40 2336
b813623a
AE
2337 if (hver)
2338 *hver = h.obj_version;
a71b891b 2339 rbd_dev->header.obj_version = h.obj_version;
93a24e08 2340 rbd_dev->header.image_size = h.image_size;
602adf40
YS
2341 rbd_dev->header.snapc = h.snapc;
2342 rbd_dev->header.snap_names = h.snap_names;
2343 rbd_dev->header.snap_sizes = h.snap_sizes;
849b4260
AE
2344 /* Free the extra copy of the object prefix */
2345 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
2346 kfree(h.object_prefix);
2347
304f6808
AE
2348 ret = rbd_dev_snaps_update(rbd_dev);
2349 if (!ret)
2350 ret = rbd_dev_snaps_register(rbd_dev);
dfc5606d 2351
c666601a 2352 up_write(&rbd_dev->header_rwsem);
602adf40 2353
dfc5606d 2354 return ret;
602adf40
YS
2355}
2356
117973fb 2357static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
1fe5e993
AE
2358{
2359 int ret;
2360
117973fb 2361 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1fe5e993 2362 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
117973fb
AE
2363 if (rbd_dev->image_format == 1)
2364 ret = rbd_dev_v1_refresh(rbd_dev, hver);
2365 else
2366 ret = rbd_dev_v2_refresh(rbd_dev, hver);
1fe5e993
AE
2367 mutex_unlock(&ctl_mutex);
2368
2369 return ret;
2370}
2371
602adf40
YS
2372static int rbd_init_disk(struct rbd_device *rbd_dev)
2373{
2374 struct gendisk *disk;
2375 struct request_queue *q;
593a9e7b 2376 u64 segment_size;
602adf40 2377
602adf40 2378 /* create gendisk info */
602adf40
YS
2379 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
2380 if (!disk)
1fcdb8aa 2381 return -ENOMEM;
602adf40 2382
f0f8cef5 2383 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
de71a297 2384 rbd_dev->dev_id);
602adf40
YS
2385 disk->major = rbd_dev->major;
2386 disk->first_minor = 0;
2387 disk->fops = &rbd_bd_ops;
2388 disk->private_data = rbd_dev;
2389
bf0d5f50 2390 q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
602adf40
YS
2391 if (!q)
2392 goto out_disk;
029bcbd8 2393
593a9e7b
AE
2394 /* We use the default size, but let's be explicit about it. */
2395 blk_queue_physical_block_size(q, SECTOR_SIZE);
2396
029bcbd8 2397 /* set io sizes to object size */
593a9e7b
AE
2398 segment_size = rbd_obj_bytes(&rbd_dev->header);
2399 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
2400 blk_queue_max_segment_size(q, segment_size);
2401 blk_queue_io_min(q, segment_size);
2402 blk_queue_io_opt(q, segment_size);
029bcbd8 2403
602adf40
YS
2404 blk_queue_merge_bvec(q, rbd_merge_bvec);
2405 disk->queue = q;
2406
2407 q->queuedata = rbd_dev;
2408
2409 rbd_dev->disk = disk;
602adf40 2410
12f02944
AE
2411 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
2412
602adf40 2413 return 0;
602adf40
YS
2414out_disk:
2415 put_disk(disk);
1fcdb8aa
AE
2416
2417 return -ENOMEM;
602adf40
YS
2418}
2419
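/*
 * Illustrative arithmetic: with the default object order of 22 the
 * settings above become segment_size = 4 MB, so the queue advertises
 * max_hw_sectors = 8192 and a 4 MB minimum and optimal I/O size to
 * the block layer.
 */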
dfc5606d
YS
2420/*
2421 sysfs
2422*/
2423
593a9e7b
AE
2424static struct rbd_device *dev_to_rbd_dev(struct device *dev)
2425{
2426 return container_of(dev, struct rbd_device, dev);
2427}
2428
dfc5606d
YS
2429static ssize_t rbd_size_show(struct device *dev,
2430 struct device_attribute *attr, char *buf)
2431{
593a9e7b 2432 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
a51aa0c0
JD
2433 sector_t size;
2434
2435 down_read(&rbd_dev->header_rwsem);
2436 size = get_capacity(rbd_dev->disk);
2437 up_read(&rbd_dev->header_rwsem);
dfc5606d 2438
a51aa0c0 2439 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
dfc5606d
YS
2440}
2441
34b13184
AE
2442/*
2443 * Note this shows the features for whatever's mapped, which is not
2444 * necessarily the base image.
2445 */
2446static ssize_t rbd_features_show(struct device *dev,
2447 struct device_attribute *attr, char *buf)
2448{
2449 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2450
2451 return sprintf(buf, "0x%016llx\n",
2452 (unsigned long long) rbd_dev->mapping.features);
2453}
2454
dfc5606d
YS
2455static ssize_t rbd_major_show(struct device *dev,
2456 struct device_attribute *attr, char *buf)
2457{
593a9e7b 2458 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
602adf40 2459
dfc5606d
YS
2460 return sprintf(buf, "%d\n", rbd_dev->major);
2461}
2462
2463static ssize_t rbd_client_id_show(struct device *dev,
2464 struct device_attribute *attr, char *buf)
602adf40 2465{
593a9e7b 2466 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2467
1dbb4399
AE
2468 return sprintf(buf, "client%lld\n",
2469 ceph_client_id(rbd_dev->rbd_client->client));
602adf40
YS
2470}
2471
dfc5606d
YS
2472static ssize_t rbd_pool_show(struct device *dev,
2473 struct device_attribute *attr, char *buf)
602adf40 2474{
593a9e7b 2475 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2476
0d7dbfce 2477 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
dfc5606d
YS
2478}
2479
9bb2f334
AE
2480static ssize_t rbd_pool_id_show(struct device *dev,
2481 struct device_attribute *attr, char *buf)
2482{
2483 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2484
0d7dbfce
AE
2485 return sprintf(buf, "%llu\n",
2486 (unsigned long long) rbd_dev->spec->pool_id);
9bb2f334
AE
2487}
2488
dfc5606d
YS
2489static ssize_t rbd_name_show(struct device *dev,
2490 struct device_attribute *attr, char *buf)
2491{
593a9e7b 2492 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2493
a92ffdf8
AE
2494 if (rbd_dev->spec->image_name)
2495 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2496
2497 return sprintf(buf, "(unknown)\n");
dfc5606d
YS
2498}
2499
589d30e0
AE
2500static ssize_t rbd_image_id_show(struct device *dev,
2501 struct device_attribute *attr, char *buf)
2502{
2503 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2504
0d7dbfce 2505 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
589d30e0
AE
2506}
2507
34b13184
AE
2508/*
2509 * Shows the name of the currently-mapped snapshot (or
2510 * RBD_SNAP_HEAD_NAME for the base image).
2511 */
dfc5606d
YS
2512static ssize_t rbd_snap_show(struct device *dev,
2513 struct device_attribute *attr,
2514 char *buf)
2515{
593a9e7b 2516 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2517
0d7dbfce 2518 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
dfc5606d
YS
2519}
2520
86b00e0d
AE
2521/*
2522 * For an rbd v2 image, shows the pool id, image id, and snapshot id
2523 * for the parent image. If there is no parent, simply shows
2524 * "(no parent image)".
2525 */
2526static ssize_t rbd_parent_show(struct device *dev,
2527 struct device_attribute *attr,
2528 char *buf)
2529{
2530 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2531 struct rbd_spec *spec = rbd_dev->parent_spec;
2532 int count;
2533 char *bufp = buf;
2534
2535 if (!spec)
2536 return sprintf(buf, "(no parent image)\n");
2537
2538 count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
2539 (unsigned long long) spec->pool_id, spec->pool_name);
2540 if (count < 0)
2541 return count;
2542 bufp += count;
2543
2544 count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
2545 spec->image_name ? spec->image_name : "(unknown)");
2546 if (count < 0)
2547 return count;
2548 bufp += count;
2549
2550 count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
2551 (unsigned long long) spec->snap_id, spec->snap_name);
2552 if (count < 0)
2553 return count;
2554 bufp += count;
2555
2556 count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
2557 if (count < 0)
2558 return count;
2559 bufp += count;
2560
2561 return (ssize_t) (bufp - buf);
2562}
2563
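/*
 * Sample output of the "parent" attribute above for a mapped clone
 * (values are illustrative only):
 *
 *	pool_id 2
 *	pool_name rbd
 *	image_id 1021643c9869
 *	image_name parent-image
 *	snap_id 4
 *	snap_name snap1
 *	overlap 4194304
 */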
dfc5606d
YS
2564static ssize_t rbd_image_refresh(struct device *dev,
2565 struct device_attribute *attr,
2566 const char *buf,
2567 size_t size)
2568{
593a9e7b 2569 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
b813623a 2570 int ret;
602adf40 2571
117973fb 2572 ret = rbd_dev_refresh(rbd_dev, NULL);
b813623a
AE
2573
2574 return ret < 0 ? ret : size;
dfc5606d 2575}
602adf40 2576
dfc5606d 2577static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
34b13184 2578static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
dfc5606d
YS
2579static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
2580static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
2581static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
9bb2f334 2582static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
dfc5606d 2583static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
589d30e0 2584static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
dfc5606d
YS
2585static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
2586static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
86b00e0d 2587static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
dfc5606d
YS
2588
2589static struct attribute *rbd_attrs[] = {
2590 &dev_attr_size.attr,
34b13184 2591 &dev_attr_features.attr,
dfc5606d
YS
2592 &dev_attr_major.attr,
2593 &dev_attr_client_id.attr,
2594 &dev_attr_pool.attr,
9bb2f334 2595 &dev_attr_pool_id.attr,
dfc5606d 2596 &dev_attr_name.attr,
589d30e0 2597 &dev_attr_image_id.attr,
dfc5606d 2598 &dev_attr_current_snap.attr,
86b00e0d 2599 &dev_attr_parent.attr,
dfc5606d 2600 &dev_attr_refresh.attr,
dfc5606d
YS
2601 NULL
2602};
2603
2604static struct attribute_group rbd_attr_group = {
2605 .attrs = rbd_attrs,
2606};
2607
2608static const struct attribute_group *rbd_attr_groups[] = {
2609 &rbd_attr_group,
2610 NULL
2611};
2612
2613static void rbd_sysfs_dev_release(struct device *dev)
2614{
2615}
2616
2617static struct device_type rbd_device_type = {
2618 .name = "rbd",
2619 .groups = rbd_attr_groups,
2620 .release = rbd_sysfs_dev_release,
2621};
2622
2623
2624/*
2625 sysfs - snapshots
2626*/
2627
2628static ssize_t rbd_snap_size_show(struct device *dev,
2629 struct device_attribute *attr,
2630 char *buf)
2631{
2632 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2633
3591538f 2634 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
dfc5606d
YS
2635}
2636
2637static ssize_t rbd_snap_id_show(struct device *dev,
2638 struct device_attribute *attr,
2639 char *buf)
2640{
2641 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2642
3591538f 2643 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
dfc5606d
YS
2644}
2645
34b13184
AE
2646static ssize_t rbd_snap_features_show(struct device *dev,
2647 struct device_attribute *attr,
2648 char *buf)
2649{
2650 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2651
2652 return sprintf(buf, "0x%016llx\n",
2653 (unsigned long long) snap->features);
2654}
2655
dfc5606d
YS
2656static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2657static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
34b13184 2658static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
dfc5606d
YS
2659
2660static struct attribute *rbd_snap_attrs[] = {
2661 &dev_attr_snap_size.attr,
2662 &dev_attr_snap_id.attr,
34b13184 2663 &dev_attr_snap_features.attr,
dfc5606d
YS
2664 NULL,
2665};
2666
2667static struct attribute_group rbd_snap_attr_group = {
2668 .attrs = rbd_snap_attrs,
2669};
2670
2671static void rbd_snap_dev_release(struct device *dev)
2672{
2673 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2674 kfree(snap->name);
2675 kfree(snap);
2676}
2677
2678static const struct attribute_group *rbd_snap_attr_groups[] = {
2679 &rbd_snap_attr_group,
2680 NULL
2681};
2682
2683static struct device_type rbd_snap_device_type = {
2684 .groups = rbd_snap_attr_groups,
2685 .release = rbd_snap_dev_release,
2686};
2687
8b8fb99c
AE
2688static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
2689{
2690 kref_get(&spec->kref);
2691
2692 return spec;
2693}
2694
2695static void rbd_spec_free(struct kref *kref);
2696static void rbd_spec_put(struct rbd_spec *spec)
2697{
2698 if (spec)
2699 kref_put(&spec->kref, rbd_spec_free);
2700}
2701
2702static struct rbd_spec *rbd_spec_alloc(void)
2703{
2704 struct rbd_spec *spec;
2705
2706 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
2707 if (!spec)
2708 return NULL;
2709 kref_init(&spec->kref);
2710
2711 rbd_spec_put(rbd_spec_get(spec)); /* TEMPORARY */
2712
2713 return spec;
2714}
2715
2716static void rbd_spec_free(struct kref *kref)
2717{
2718 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
2719
2720 kfree(spec->pool_name);
2721 kfree(spec->image_id);
2722 kfree(spec->image_name);
2723 kfree(spec->snap_name);
2724 kfree(spec);
2725}
2726
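/*
 * Example (sketch): rbd_spec reference counting.  A spec starts with
 * one reference from rbd_spec_alloc(); additional holders take their
 * own with rbd_spec_get() and release with rbd_spec_put(); the last
 * put frees it via rbd_spec_free().
 */
#if 0
static void rbd_example_spec_refs(void)
{
	struct rbd_spec *spec = rbd_spec_alloc();

	if (!spec)
		return;
	rbd_spec_get(spec);	/* a second holder */
	rbd_spec_put(spec);	/* second holder done */
	rbd_spec_put(spec);	/* last reference; rbd_spec_free() runs */
}
#endif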
cc344fa1 2727static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
c53d5893
AE
2728 struct rbd_spec *spec)
2729{
2730 struct rbd_device *rbd_dev;
2731
2732 rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2733 if (!rbd_dev)
2734 return NULL;
2735
2736 spin_lock_init(&rbd_dev->lock);
6d292906 2737 rbd_dev->flags = 0;
c53d5893
AE
2738 INIT_LIST_HEAD(&rbd_dev->node);
2739 INIT_LIST_HEAD(&rbd_dev->snaps);
2740 init_rwsem(&rbd_dev->header_rwsem);
2741
2742 rbd_dev->spec = spec;
2743 rbd_dev->rbd_client = rbdc;
2744
0903e875
AE
2745 /* Initialize the layout used for all rbd requests */
2746
2747 rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2748 rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
2749 rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2750 rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
2751
c53d5893
AE
2752 return rbd_dev;
2753}
2754
2755static void rbd_dev_destroy(struct rbd_device *rbd_dev)
2756{
86b00e0d 2757 rbd_spec_put(rbd_dev->parent_spec);
c53d5893
AE
2758 kfree(rbd_dev->header_name);
2759 rbd_put_client(rbd_dev->rbd_client);
2760 rbd_spec_put(rbd_dev->spec);
2761 kfree(rbd_dev);
2762}
2763
304f6808
AE
2764static bool rbd_snap_registered(struct rbd_snap *snap)
2765{
2766 bool ret = snap->dev.type == &rbd_snap_device_type;
2767 bool reg = device_is_registered(&snap->dev);
2768
2769 rbd_assert(!ret ^ reg);
2770
2771 return ret;
2772}
2773
41f38c2b 2774static void rbd_remove_snap_dev(struct rbd_snap *snap)
dfc5606d
YS
2775{
2776 list_del(&snap->node);
304f6808
AE
2777 if (device_is_registered(&snap->dev))
2778 device_unregister(&snap->dev);
dfc5606d
YS
2779}
2780
14e7085d 2781static int rbd_register_snap_dev(struct rbd_snap *snap,
dfc5606d
YS
2782 struct device *parent)
2783{
2784 struct device *dev = &snap->dev;
2785 int ret;
2786
2787 dev->type = &rbd_snap_device_type;
2788 dev->parent = parent;
2789 dev->release = rbd_snap_dev_release;
d4b125e9 2790 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
304f6808
AE
2791 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2792
dfc5606d
YS
2793 ret = device_register(dev);
2794
2795 return ret;
2796}
2797
4e891e0a 2798static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
c8d18425 2799 const char *snap_name,
34b13184
AE
2800 u64 snap_id, u64 snap_size,
2801 u64 snap_features)
dfc5606d 2802{
4e891e0a 2803 struct rbd_snap *snap;
dfc5606d 2804 int ret;
4e891e0a
AE
2805
2806 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
dfc5606d 2807 if (!snap)
4e891e0a
AE
2808 return ERR_PTR(-ENOMEM);
2809
2810 ret = -ENOMEM;
c8d18425 2811 snap->name = kstrdup(snap_name, GFP_KERNEL);
4e891e0a
AE
2812 if (!snap->name)
2813 goto err;
2814
c8d18425
AE
2815 snap->id = snap_id;
2816 snap->size = snap_size;
34b13184 2817 snap->features = snap_features;
4e891e0a
AE
2818
2819 return snap;
2820
dfc5606d
YS
2821err:
2822 kfree(snap->name);
2823 kfree(snap);
4e891e0a
AE
2824
2825 return ERR_PTR(ret);
dfc5606d
YS
2826}
2827
cd892126
AE
2828static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2829 u64 *snap_size, u64 *snap_features)
2830{
2831 char *snap_name;
2832
2833 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2834
2835 *snap_size = rbd_dev->header.snap_sizes[which];
2836 *snap_features = 0; /* No features for v1 */
2837
2838 /* Skip over names until we find the one we are looking for */
2839
2840 snap_name = rbd_dev->header.snap_names;
2841 while (which--)
2842 snap_name += strlen(snap_name) + 1;
2843
2844 return snap_name;
2845}
2846
9d475de5
AE
2847/*
2848 * Get the size and object order for an image snapshot, or if
 2849 * snap_id is CEPH_NOSNAP, get this information for the base
2850 * image.
2851 */
2852static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
2853 u8 *order, u64 *snap_size)
2854{
2855 __le64 snapid = cpu_to_le64(snap_id);
2856 int ret;
2857 struct {
2858 u8 order;
2859 __le64 size;
2860 } __attribute__ ((packed)) size_buf = { 0 };
2861
36be9a76 2862 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
9d475de5
AE
2863 "rbd", "get_size",
2864 (char *) &snapid, sizeof (snapid),
07b2391f 2865 (char *) &size_buf, sizeof (size_buf), NULL);
36be9a76 2866 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
9d475de5
AE
2867 if (ret < 0)
2868 return ret;
2869
2870 *order = size_buf.order;
2871 *snap_size = le64_to_cpu(size_buf.size);
2872
2873 dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
2874 (unsigned long long) snap_id, (unsigned int) *order,
2875 (unsigned long long) *snap_size);
2876
2877 return 0;
2878}
2879
2880static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
2881{
2882 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
2883 &rbd_dev->header.obj_order,
2884 &rbd_dev->header.image_size);
2885}
2886
1e130199
AE
2887static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
2888{
2889 void *reply_buf;
2890 int ret;
2891 void *p;
2892
2893 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
2894 if (!reply_buf)
2895 return -ENOMEM;
2896
36be9a76 2897 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
1e130199
AE
2898 "rbd", "get_object_prefix",
2899 NULL, 0,
07b2391f 2900 reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
36be9a76 2901 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
1e130199
AE
2902 if (ret < 0)
2903 goto out;
2904
2905 p = reply_buf;
2906 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
2907 p + RBD_OBJ_PREFIX_LEN_MAX,
2908 NULL, GFP_NOIO);
2909
2910 if (IS_ERR(rbd_dev->header.object_prefix)) {
2911 ret = PTR_ERR(rbd_dev->header.object_prefix);
2912 rbd_dev->header.object_prefix = NULL;
2913 } else {
2914 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
2915 }
2916
2917out:
2918 kfree(reply_buf);
2919
2920 return ret;
2921}
2922
b1b5402a
AE
2923static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2924 u64 *snap_features)
2925{
2926 __le64 snapid = cpu_to_le64(snap_id);
2927 struct {
2928 __le64 features;
2929 __le64 incompat;
2930 } features_buf = { 0 };
d889140c 2931 u64 incompat;
b1b5402a
AE
2932 int ret;
2933
36be9a76 2934 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
b1b5402a
AE
2935 "rbd", "get_features",
2936 (char *) &snapid, sizeof (snapid),
2937 (char *) &features_buf, sizeof (features_buf),
07b2391f 2938 NULL);
36be9a76 2939 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
b1b5402a
AE
2940 if (ret < 0)
2941 return ret;
d889140c
AE
2942
2943 incompat = le64_to_cpu(features_buf.incompat);
5cbf6f12 2944 if (incompat & ~RBD_FEATURES_SUPPORTED)
b8f5c6ed 2945 return -ENXIO;
d889140c 2946
b1b5402a
AE
2947 *snap_features = le64_to_cpu(features_buf.features);
2948
2949 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
2950 (unsigned long long) snap_id,
2951 (unsigned long long) *snap_features,
2952 (unsigned long long) le64_to_cpu(features_buf.incompat));
2953
2954 return 0;
2955}
2956
2957static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2958{
2959 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2960 &rbd_dev->header.features);
2961}
2962
86b00e0d
AE
2963static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
2964{
2965 struct rbd_spec *parent_spec;
2966 size_t size;
2967 void *reply_buf = NULL;
2968 __le64 snapid;
2969 void *p;
2970 void *end;
2971 char *image_id;
2972 u64 overlap;
86b00e0d
AE
2973 int ret;
2974
2975 parent_spec = rbd_spec_alloc();
2976 if (!parent_spec)
2977 return -ENOMEM;
2978
2979 size = sizeof (__le64) + /* pool_id */
2980 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
2981 sizeof (__le64) + /* snap_id */
2982 sizeof (__le64); /* overlap */
2983 reply_buf = kmalloc(size, GFP_KERNEL);
2984 if (!reply_buf) {
2985 ret = -ENOMEM;
2986 goto out_err;
2987 }
2988
2989 snapid = cpu_to_le64(CEPH_NOSNAP);
36be9a76 2990 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
86b00e0d
AE
2991 "rbd", "get_parent",
2992 (char *) &snapid, sizeof (snapid),
07b2391f 2993 (char *) reply_buf, size, NULL);
36be9a76 2994 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
86b00e0d
AE
2995 if (ret < 0)
2996 goto out_err;
2997
2998 ret = -ERANGE;
2999 p = reply_buf;
3000 end = (char *) reply_buf + size;
3001 ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
3002 if (parent_spec->pool_id == CEPH_NOPOOL)
3003 goto out; /* No parent? No problem. */
3004
0903e875
AE
3005 /* The ceph file layout needs to fit pool id in 32 bits */
3006
3007 ret = -EIO;
3008 if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
3009 goto out;
3010
979ed480 3011 image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
86b00e0d
AE
3012 if (IS_ERR(image_id)) {
3013 ret = PTR_ERR(image_id);
3014 goto out_err;
3015 }
3016 parent_spec->image_id = image_id;
3017 ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
3018 ceph_decode_64_safe(&p, end, overlap, out_err);
3019
3020 rbd_dev->parent_overlap = overlap;
3021 rbd_dev->parent_spec = parent_spec;
3022 parent_spec = NULL; /* rbd_dev now owns this */
3023out:
3024 ret = 0;
3025out_err:
3026 kfree(reply_buf);
3027 rbd_spec_put(parent_spec);
3028
3029 return ret;
3030}
3031
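/*
 * For reference, the "get_parent" reply decoded above is laid out as
 *	__le64 pool_id;			CEPH_NOPOOL if no parent
 *	__le32 id_len; char id[id_len];	parent image id
 *	__le64 snap_id;
 *	__le64 overlap;			bytes shared with the parent
 */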
9e15b77d
AE
3032static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
3033{
3034 size_t image_id_size;
3035 char *image_id;
3036 void *p;
3037 void *end;
3038 size_t size;
3039 void *reply_buf = NULL;
3040 size_t len = 0;
3041 char *image_name = NULL;
3042 int ret;
3043
3044 rbd_assert(!rbd_dev->spec->image_name);
3045
69e7a02f
AE
3046 len = strlen(rbd_dev->spec->image_id);
3047 image_id_size = sizeof (__le32) + len;
9e15b77d
AE
3048 image_id = kmalloc(image_id_size, GFP_KERNEL);
3049 if (!image_id)
3050 return NULL;
3051
3052 p = image_id;
3053 end = (char *) image_id + image_id_size;
69e7a02f 3054 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);
9e15b77d
AE
3055
3056 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
3057 reply_buf = kmalloc(size, GFP_KERNEL);
3058 if (!reply_buf)
3059 goto out;
3060
36be9a76 3061 ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
9e15b77d
AE
3062 "rbd", "dir_get_name",
3063 image_id, image_id_size,
07b2391f 3064 (char *) reply_buf, size, NULL);
9e15b77d
AE
3065 if (ret < 0)
3066 goto out;
3067 p = reply_buf;
3068 end = (char *) reply_buf + size;
3069 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
3070 if (IS_ERR(image_name))
3071 image_name = NULL;
3072 else
3073 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
3074out:
3075 kfree(reply_buf);
3076 kfree(image_id);
3077
3078 return image_name;
3079}
3080
3081/*
3082 * When a parent image gets probed, we only have the pool, image,
3083 * and snapshot ids but not the names of any of them. This call
3084 * is made later to fill in those names. It has to be done after
3085 * rbd_dev_snaps_update() has completed because some of the
3086 * information (in particular, snapshot name) is not available
3087 * until then.
3088 */
3089static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
3090{
3091 struct ceph_osd_client *osdc;
3092 const char *name;
3093 void *reply_buf = NULL;
3094 int ret;
3095
3096 if (rbd_dev->spec->pool_name)
3097 return 0; /* Already have the names */
3098
3099 /* Look up the pool name */
3100
3101 osdc = &rbd_dev->rbd_client->client->osdc;
3102 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
935dc89f
AE
3103 if (!name) {
3104 rbd_warn(rbd_dev, "there is no pool with id %llu",
3105 rbd_dev->spec->pool_id); /* Really a BUG() */
3106 return -EIO;
3107 }
9e15b77d
AE
3108
3109 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
3110 if (!rbd_dev->spec->pool_name)
3111 return -ENOMEM;
3112
3113 /* Fetch the image name; tolerate failure here */
3114
3115 name = rbd_dev_image_name(rbd_dev);
69e7a02f 3116 if (name)
9e15b77d 3117 rbd_dev->spec->image_name = (char *) name;
69e7a02f 3118 else
06ecc6cb 3119 rbd_warn(rbd_dev, "unable to get image name");
9e15b77d
AE
3120
3121 /* Look up the snapshot name. */
3122
3123 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
3124 if (!name) {
935dc89f
AE
3125 rbd_warn(rbd_dev, "no snapshot with id %llu",
3126 rbd_dev->spec->snap_id); /* Really a BUG() */
9e15b77d
AE
3127 ret = -EIO;
3128 goto out_err;
3129 }
 3130 rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
 3131 if (!rbd_dev->spec->snap_name) {
 3132 ret = -ENOMEM;
 goto out_err;
 }
3133
3134 return 0;
3135out_err:
3136 kfree(reply_buf);
3137 kfree(rbd_dev->spec->pool_name);
3138 rbd_dev->spec->pool_name = NULL;
3139
3140 return ret;
3141}
3142
6e14b1a6 3143static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
35d489f9
AE
3144{
3145 size_t size;
3146 int ret;
3147 void *reply_buf;
3148 void *p;
3149 void *end;
3150 u64 seq;
3151 u32 snap_count;
3152 struct ceph_snap_context *snapc;
3153 u32 i;
3154
3155 /*
3156 * We'll need room for the seq value (maximum snapshot id),
3157 * snapshot count, and array of that many snapshot ids.
3158 * For now we have a fixed upper limit on the number we're
3159 * prepared to receive.
3160 */
3161 size = sizeof (__le64) + sizeof (__le32) +
3162 RBD_MAX_SNAP_COUNT * sizeof (__le64);
3163 reply_buf = kzalloc(size, GFP_KERNEL);
3164 if (!reply_buf)
3165 return -ENOMEM;
3166
36be9a76 3167 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
35d489f9
AE
3168 "rbd", "get_snapcontext",
3169 NULL, 0,
07b2391f 3170 reply_buf, size, ver);
36be9a76 3171 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
35d489f9
AE
3172 if (ret < 0)
3173 goto out;
3174
3175 ret = -ERANGE;
3176 p = reply_buf;
3177 end = (char *) reply_buf + size;
3178 ceph_decode_64_safe(&p, end, seq, out);
3179 ceph_decode_32_safe(&p, end, snap_count, out);
3180
3181 /*
3182 * Make sure the reported number of snapshot ids wouldn't go
3183 * beyond the end of our buffer. But before checking that,
3184 * make sure the computed size of the snapshot context we
3185 * allocate is representable in a size_t.
3186 */
3187 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
3188 / sizeof (u64)) {
3189 ret = -EINVAL;
3190 goto out;
3191 }
3192 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
3193 goto out;
3194
3195 size = sizeof (struct ceph_snap_context) +
3196 snap_count * sizeof (snapc->snaps[0]);
3197 snapc = kmalloc(size, GFP_KERNEL);
3198 if (!snapc) {
3199 ret = -ENOMEM;
3200 goto out;
3201 }
3202
3203 atomic_set(&snapc->nref, 1);
3204 snapc->seq = seq;
3205 snapc->num_snaps = snap_count;
3206 for (i = 0; i < snap_count; i++)
3207 snapc->snaps[i] = ceph_decode_64(&p);
3208
3209 rbd_dev->header.snapc = snapc;
3210
3211 dout(" snap context seq = %llu, snap_count = %u\n",
3212 (unsigned long long) seq, (unsigned int) snap_count);
 3213 ret = 0;
 3214out:
 3215 kfree(reply_buf);
 3216
 3217 return ret;
3218}
3219
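/*
 * For reference, the "get_snapcontext" reply decoded above is laid
 * out as
 *	__le64 seq;			snapshot sequence number
 *	__le32 snap_count;
 *	__le64 snaps[snap_count];	ids, highest first
 * so a context with snapshots 7 and 3 arrives, little-endian, as
 * seq, then count 2, then the ids 7 and 3.
 */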
b8b1e2db
AE
3220static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
3221{
3222 size_t size;
3223 void *reply_buf;
3224 __le64 snap_id;
3225 int ret;
3226 void *p;
3227 void *end;
b8b1e2db
AE
3228 char *snap_name;
3229
3230 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
3231 reply_buf = kmalloc(size, GFP_KERNEL);
3232 if (!reply_buf)
3233 return ERR_PTR(-ENOMEM);
3234
3235 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
36be9a76 3236 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
b8b1e2db
AE
3237 "rbd", "get_snapshot_name",
3238 (char *) &snap_id, sizeof (snap_id),
07b2391f 3239 reply_buf, size, NULL);
36be9a76 3240 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
b8b1e2db
AE
3241 if (ret < 0)
3242 goto out;
3243
3244 p = reply_buf;
3245 end = (char *) reply_buf + size;
e5c35534 3246 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
b8b1e2db
AE
3247 if (IS_ERR(snap_name)) {
3248 ret = PTR_ERR(snap_name);
3249 goto out;
3250 } else {
3251 dout(" snap_id 0x%016llx snap_name = %s\n",
3252 (unsigned long long) le64_to_cpu(snap_id), snap_name);
3253 }
3254 kfree(reply_buf);
3255
3256 return snap_name;
3257out:
3258 kfree(reply_buf);
3259
3260 return ERR_PTR(ret);
3261}
3262
3263static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
3264 u64 *snap_size, u64 *snap_features)
3265{
e0b49868 3266 u64 snap_id;
b8b1e2db
AE
3267 u8 order;
3268 int ret;
3269
3270 snap_id = rbd_dev->header.snapc->snaps[which];
3271 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
3272 if (ret)
3273 return ERR_PTR(ret);
3274 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
3275 if (ret)
3276 return ERR_PTR(ret);
3277
3278 return rbd_dev_v2_snap_name(rbd_dev, which);
3279}
3280
3281static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
3282 u64 *snap_size, u64 *snap_features)
3283{
3284 if (rbd_dev->image_format == 1)
3285 return rbd_dev_v1_snap_info(rbd_dev, which,
3286 snap_size, snap_features);
3287 if (rbd_dev->image_format == 2)
3288 return rbd_dev_v2_snap_info(rbd_dev, which,
3289 snap_size, snap_features);
3290 return ERR_PTR(-EINVAL);
3291}
3292
117973fb
AE
3293static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
3294{
3295 int ret;
3296 __u8 obj_order;
3297
3298 down_write(&rbd_dev->header_rwsem);
3299
3300 /* Grab old order first, to see if it changes */
3301
 3302 obj_order = rbd_dev->header.obj_order;
3303 ret = rbd_dev_v2_image_size(rbd_dev);
3304 if (ret)
3305 goto out;
3306 if (rbd_dev->header.obj_order != obj_order) {
3307 ret = -EIO;
3308 goto out;
3309 }
3310 rbd_update_mapping_size(rbd_dev);
3311
3312 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
3313 dout("rbd_dev_v2_snap_context returned %d\n", ret);
3314 if (ret)
3315 goto out;
3316 ret = rbd_dev_snaps_update(rbd_dev);
3317 dout("rbd_dev_snaps_update returned %d\n", ret);
3318 if (ret)
3319 goto out;
3320 ret = rbd_dev_snaps_register(rbd_dev);
3321 dout("rbd_dev_snaps_register returned %d\n", ret);
3322out:
3323 up_write(&rbd_dev->header_rwsem);
3324
3325 return ret;
3326}
3327
dfc5606d 3328/*
35938150
AE
3329 * Scan the rbd device's current snapshot list and compare it to the
3330 * newly-received snapshot context. Remove any existing snapshots
3331 * not present in the new snapshot context. Add a new snapshot for
 3332 * any snapshots in the snapshot context not in the current list.
3333 * And verify there are no changes to snapshots we already know
3334 * about.
3335 *
3336 * Assumes the snapshots in the snapshot context are sorted by
3337 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
3338 * are also maintained in that order.)
dfc5606d 3339 */
304f6808 3340static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
dfc5606d 3341{
35938150
AE
3342 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
3343 const u32 snap_count = snapc->num_snaps;
35938150
AE
3344 struct list_head *head = &rbd_dev->snaps;
3345 struct list_head *links = head->next;
3346 u32 index = 0;
dfc5606d 3347
9fcbb800 3348 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
35938150
AE
3349 while (index < snap_count || links != head) {
3350 u64 snap_id;
3351 struct rbd_snap *snap;
cd892126
AE
3352 char *snap_name;
3353 u64 snap_size = 0;
3354 u64 snap_features = 0;
dfc5606d 3355
35938150
AE
3356 snap_id = index < snap_count ? snapc->snaps[index]
3357 : CEPH_NOSNAP;
3358 snap = links != head ? list_entry(links, struct rbd_snap, node)
3359 : NULL;
aafb230e 3360 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
dfc5606d 3361
35938150
AE
3362 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
3363 struct list_head *next = links->next;
dfc5606d 3364
6d292906
AE
3365 /*
3366 * A previously-existing snapshot is not in
3367 * the new snap context.
3368 *
3369 * If the now missing snapshot is the one the
3370 * image is mapped to, clear its exists flag
3371 * so we can avoid sending any more requests
3372 * to it.
3373 */
0d7dbfce 3374 if (rbd_dev->spec->snap_id == snap->id)
6d292906 3375 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
41f38c2b 3376 rbd_remove_snap_dev(snap);
9fcbb800 3377 dout("%ssnap id %llu has been removed\n",
0d7dbfce
AE
3378 rbd_dev->spec->snap_id == snap->id ?
3379 "mapped " : "",
9fcbb800 3380 (unsigned long long) snap->id);
35938150
AE
3381
3382 /* Done with this list entry; advance */
3383
3384 links = next;
dfc5606d
YS
3385 continue;
3386 }
35938150 3387
b8b1e2db
AE
3388 snap_name = rbd_dev_snap_info(rbd_dev, index,
3389 &snap_size, &snap_features);
cd892126
AE
3390 if (IS_ERR(snap_name))
3391 return PTR_ERR(snap_name);
3392
9fcbb800
AE
3393 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
3394 (unsigned long long) snap_id);
35938150
AE
3395 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
3396 struct rbd_snap *new_snap;
3397
3398 /* We haven't seen this snapshot before */
3399
c8d18425 3400 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
cd892126 3401 snap_id, snap_size, snap_features);
9fcbb800
AE
3402 if (IS_ERR(new_snap)) {
3403 int err = PTR_ERR(new_snap);
3404
3405 dout(" failed to add dev, error %d\n", err);
3406
3407 return err;
3408 }
35938150
AE
3409
3410 /* New goes before existing, or at end of list */
3411
9fcbb800 3412 dout(" added dev%s\n", snap ? "" : " at end");
35938150
AE
3413 if (snap)
3414 list_add_tail(&new_snap->node, &snap->node);
3415 else
523f3258 3416 list_add_tail(&new_snap->node, head);
35938150
AE
3417 } else {
3418 /* Already have this one */
3419
9fcbb800
AE
3420 dout(" already present\n");
3421
cd892126 3422 rbd_assert(snap->size == snap_size);
aafb230e 3423 rbd_assert(!strcmp(snap->name, snap_name));
cd892126 3424 rbd_assert(snap->features == snap_features);
35938150
AE
3425
3426 /* Done with this list entry; advance */
3427
3428 links = links->next;
dfc5606d 3429 }
35938150
AE
3430
3431 /* Advance to the next entry in the snapshot context */
3432
3433 index++;
dfc5606d 3434 }
9fcbb800 3435 dout("%s: done\n", __func__);
dfc5606d
YS
3436
3437 return 0;
3438}
3439
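/*
 * Example of the merge walk above: if the device currently knows
 * snapshots (8, 5, 2) and the new context holds (8, 6, 5), the walk
 * keeps 8, inserts 6 before 5, keeps 5, and removes 2.  Both lists
 * are ordered by id, highest first, so a single pass suffices.
 */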
304f6808
AE
3440/*
3441 * Scan the list of snapshots and register the devices for any that
3442 * have not already been registered.
3443 */
3444static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
3445{
3446 struct rbd_snap *snap;
3447 int ret = 0;
3448
37206ee5 3449 dout("%s:\n", __func__);
86ff77bb
AE
3450 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
3451 return -EIO;
304f6808
AE
3452
3453 list_for_each_entry(snap, &rbd_dev->snaps, node) {
3454 if (!rbd_snap_registered(snap)) {
3455 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
3456 if (ret < 0)
3457 break;
3458 }
3459 }
3460 dout("%s: returning %d\n", __func__, ret);
3461
3462 return ret;
3463}
3464
dfc5606d
YS
3465static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
3466{
dfc5606d 3467 struct device *dev;
cd789ab9 3468 int ret;
dfc5606d
YS
3469
3470 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
dfc5606d 3471
cd789ab9 3472 dev = &rbd_dev->dev;
dfc5606d
YS
3473 dev->bus = &rbd_bus_type;
3474 dev->type = &rbd_device_type;
3475 dev->parent = &rbd_root_dev;
3476 dev->release = rbd_dev_release;
de71a297 3477 dev_set_name(dev, "%d", rbd_dev->dev_id);
dfc5606d 3478 ret = device_register(dev);
dfc5606d 3479
dfc5606d 3480 mutex_unlock(&ctl_mutex);
cd789ab9 3481
dfc5606d 3482 return ret;
602adf40
YS
3483}
3484
dfc5606d
YS
3485static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
3486{
3487 device_unregister(&rbd_dev->dev);
3488}
3489
e2839308 3490static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
1ddbe94e
AE
3491
3492/*
499afd5b
AE
3493 * Get a unique rbd identifier for the given new rbd_dev, and add
3494 * the rbd_dev to the global list. The minimum rbd id is 1.
1ddbe94e 3495 */
e2839308 3496static void rbd_dev_id_get(struct rbd_device *rbd_dev)
b7f23c36 3497{
e2839308 3498 rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
499afd5b
AE
3499
3500 spin_lock(&rbd_dev_list_lock);
3501 list_add_tail(&rbd_dev->node, &rbd_dev_list);
3502 spin_unlock(&rbd_dev_list_lock);
e2839308
AE
3503 dout("rbd_dev %p given dev id %llu\n", rbd_dev,
3504 (unsigned long long) rbd_dev->dev_id);
1ddbe94e 3505}
b7f23c36 3506
1ddbe94e 3507/*
499afd5b
AE
3508 * Remove an rbd_dev from the global list, and record that its
3509 * identifier is no longer in use.
1ddbe94e 3510 */
e2839308 3511static void rbd_dev_id_put(struct rbd_device *rbd_dev)
1ddbe94e 3512{
d184f6bf 3513 struct list_head *tmp;
de71a297 3514 int rbd_id = rbd_dev->dev_id;
d184f6bf
AE
3515 int max_id;
3516
aafb230e 3517 rbd_assert(rbd_id > 0);
499afd5b 3518
e2839308
AE
3519 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
3520 (unsigned long long) rbd_dev->dev_id);
499afd5b
AE
3521 spin_lock(&rbd_dev_list_lock);
3522 list_del_init(&rbd_dev->node);
d184f6bf
AE
3523
3524 /*
3525 * If the id being "put" is not the current maximum, there
3526 * is nothing special we need to do.
3527 */
e2839308 3528 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
d184f6bf
AE
3529 spin_unlock(&rbd_dev_list_lock);
3530 return;
3531 }
3532
3533 /*
3534 * We need to update the current maximum id. Search the
3535 * list to find out what it is. We're more likely to find
3536 * the maximum at the end, so search the list backward.
3537 */
3538 max_id = 0;
3539 list_for_each_prev(tmp, &rbd_dev_list) {
3540 struct rbd_device *rbd_dev;
3541
3542 rbd_dev = list_entry(tmp, struct rbd_device, node);
b213e0b1
AE
3543 if (rbd_dev->dev_id > max_id)
3544 max_id = rbd_dev->dev_id;
d184f6bf 3545 }
499afd5b 3546 spin_unlock(&rbd_dev_list_lock);
b7f23c36 3547
1ddbe94e 3548 /*
e2839308 3549 * The max id could have been updated by rbd_dev_id_get(), in
d184f6bf
AE
3550 * which case it now accurately reflects the new maximum.
3551 * Be careful not to overwrite the maximum value in that
3552 * case.
1ddbe94e 3553 */
e2839308
AE
3554 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
3555 dout(" max dev id has been reset\n");
b7f23c36
AE
3556}
3557
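/*
 * Example: with devices 1, 2 and 3 mapped, rbd_dev_id_max is 3.
 * Unmapping device 3 triggers the backward search above and resets
 * the maximum to 2, so the next mapping reuses id 3.  Unmapping
 * device 2 instead leaves the maximum at 3, and the next mapping
 * gets id 4.
 */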
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	const char *spaces = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* Find start of token */

	return strcspn(*buf, spaces);	/* Return token length */
}

/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit.
 *
 * The *buf pointer will be updated to point beyond the end of the
 * found token.  Note that this occurs even if the token buffer is
 * too small to hold it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len;

	len = next_token(buf);
	if (len < token_size) {
		memcpy(token, *buf, len);
		*(token + len) = '\0';
	}
	*buf += len;

	return len;
}

/*
 * Finds the next token in *buf, dynamically allocates a buffer big
 * enough to hold a copy of it, and copies the token into the new
 * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
 * that a duplicate buffer is created even for a zero-length token.
 *
 * Returns a pointer to the newly-allocated duplicate, or a null
 * pointer if memory for the duplicate was not available.  If
 * the lenp argument is a non-null pointer, the length of the token
 * (not including the '\0') is returned in *lenp.
 *
 * If successful, the *buf pointer will be updated to point beyond
 * the end of the found token.
 *
 * Note: uses GFP_KERNEL for allocation.
 */
static inline char *dup_token(const char **buf, size_t *lenp)
{
	char *dup;
	size_t len;

	len = next_token(buf);
	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
	if (!dup)
		return NULL;
	*(dup + len) = '\0';
	*buf += len;

	if (lenp)
		*lenp = len;

	return dup;
}

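/*
 * Example (illustrative input only): given the buffer "  rbd foo\n",
 * a first dup_token() call returns "rbd" and advances *buf to
 * " foo\n"; a second call returns "foo"; a third returns a
 * zero-length string, since only whitespace remains.
 */
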
/*
 * Parse the options provided for an "rbd add" (i.e., rbd image
 * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
 * and the data written is passed here via a NUL-terminated buffer.
 * Returns 0 if successful or an error code otherwise.
 *
 * The information extracted from these options is recorded in
 * the other parameters which return dynamically-allocated
 * structures:
 *  ceph_opts
 *	The address of a pointer that will refer to a ceph options
 *	structure.  Caller must release the returned pointer using
 *	ceph_destroy_options() when it is no longer needed.
 *  rbd_opts
 *	Address of an rbd options pointer.  Fully initialized by
 *	this function; caller must release with kfree().
 *  spec
 *	Address of an rbd image specification pointer.  Fully
 *	initialized by this function based on parsed options.
 *	Caller must release with rbd_spec_put().
 *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
 * where:
 *  <mon_addrs>
 *	A comma-separated list of one or more monitor addresses.
 *	A monitor address is an ip address, optionally followed
 *	by a port number (separated by a colon).
 *	  I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *	A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *	The name of the rados pool containing the rbd image.
 *  <image_name>
 *	The name of the image in that pool to map.
 *  <snap_name>
 *	An optional snapshot name.  If provided, the mapping will
 *	present data from the image at the time that snapshot was
 *	created.  The image head is used if no snapshot name is
 *	provided.  Snapshot mappings are always read-only.
 */
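/*
 * For example (all values illustrative), a mapping request might be:
 *
 *   1.2.3.4:6789 name=admin,secret=AQB... rbd myimage mysnap
 *
 * which maps snapshot "mysnap" of image "myimage" in pool "rbd",
 * using the monitor at 1.2.3.4:6789 and the given ceph options.
 */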
static int rbd_add_parse_args(const char *buf,
				struct ceph_options **ceph_opts,
				struct rbd_options **opts,
				struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;
	size_t mon_addrs_size;
	struct rbd_spec *spec = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len) {
		rbd_warn(NULL, "no monitor address(es) provided");
		return -EINVAL;
	}
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	ret = -EINVAL;
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options) {
		rbd_warn(NULL, "no options provided");
		goto out_err;
	}

	spec = rbd_spec_alloc();
	if (!spec)
		goto out_mem;

	spec->pool_name = dup_token(&buf, NULL);
	if (!spec->pool_name)
		goto out_mem;
	if (!*spec->pool_name) {
		rbd_warn(NULL, "no pool name provided");
		goto out_err;
	}

	spec->image_name = dup_token(&buf, NULL);
	if (!spec->image_name)
		goto out_mem;
	if (!*spec->image_name) {
		rbd_warn(NULL, "no image name provided");
		goto out_err;
	}

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
	if (!spec->snap_name)
		goto out_mem;
	*(spec->snap_name + len) = '\0';

	/* Initialize all rbd options to the defaults */

	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		goto out_mem;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	copts = ceph_parse_options(options, mon_addrs,
					mon_addrs + mon_addrs_size - 1,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	kfree(options);

	*ceph_opts = copts;
	*opts = rbd_opts;
	*rbd_spec = spec;

	return 0;
out_mem:
	ret = -ENOMEM;
out_err:
	kfree(rbd_opts);
	rbd_spec_put(spec);
	kfree(options);

	return ret;
}

/*
 * An rbd format 2 image has a unique identifier, distinct from the
 * name given to it by the user.  Internally, that identifier is
 * what's used to specify the names of objects related to the image.
 *
 * A special "rbd id" object is used to map an rbd image name to its
 * id.  If that object doesn't exist, then there is no v2 rbd image
 * with the supplied name.
 *
 * This function will record the given rbd_dev's image_id field if
 * it can be determined, and in that case will return 0.  If any
 * errors occur a negative errno will be returned and the rbd_dev's
 * image_id field will be unchanged (and should be NULL).
 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.
	 */
	if (rbd_dev->spec->image_id)
		return 0;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	ret = rbd_obj_method_sync(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX, NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = response;
	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						NULL, GFP_NOIO);
	if (IS_ERR(rbd_dev->spec->image_id)) {
		ret = PTR_ERR(rbd_dev->spec->image_id);
		rbd_dev->spec->image_id = NULL;
	} else {
		dout("image_id is %s\n", rbd_dev->spec->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}

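/*
 * To illustrate the lookup above: assuming RBD_ID_PREFIX is
 * "rbd_id." (it is defined in rbd_types.h), the id for an image
 * named "myimage" is obtained by invoking the "get_id" method of
 * the "rbd" object class on an object named "rbd_id.myimage".
 */
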
static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;

	/* Version 1 images have no id; empty string is used */

	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
	if (!rbd_dev->spec->image_id)
		return -ENOMEM;

	/* Record the header object name for this rbd image. */

	size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name) {
		ret = -ENOMEM;
		goto out_err;
	}
	sprintf(rbd_dev->header_name, "%s%s",
		rbd_dev->spec->image_name, RBD_SUFFIX);

	/* Populate rbd image metadata */

	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (ret < 0)
		goto out_err;

	/* Version 1 images have no parent (no layering) */

	rbd_dev->parent_spec = NULL;
	rbd_dev->parent_overlap = 0;

	rbd_dev->image_format = 1;

	dout("discovered version 1 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;

out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;

	return ret;
}

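/*
 * Format 1 header naming: assuming RBD_SUFFIX is ".rbd" (defined
 * in rbd_types.h), an image named "myimage" stores its header in
 * an object named "myimage.rbd".
 */
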
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->spec->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* If the image supports layering, get the parent info */

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	rbd_dev->parent_overlap = 0;
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}

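/*
 * Format 2 header naming: assuming RBD_HEADER_PREFIX is
 * "rbd_header." (defined in rbd_types.h), an image whose id is
 * "1012ae8944a" stores its header in an object named
 * "rbd_header.1012ae8944a"; the user-visible image name does not
 * appear in format 2 object names.
 */
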
static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
{
	int ret;

	/* no need to lock here, as rbd_dev is not registered yet */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_probe_update_spec(rbd_dev);
	if (ret)
		goto err_out_snaps;

	ret = rbd_dev_set_mapping(rbd_dev);
	if (ret)
		goto err_out_snaps;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	ret = register_blkdev(0, rbd_dev->name);
	if (ret < 0)
		goto err_out_id;
	rbd_dev->major = ret;

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_bus_add_dev(rbd_dev);
	if (ret)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */
	down_write(&rbd_dev->header_rwsem);
	ret = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (ret)
		goto err_out_bus;

	ret = rbd_dev_header_watch_sync(rbd_dev, 1);
	if (ret)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return ret;
err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);

	return ret;
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);

	return ret;
}

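/*
 * Note the label ordering above: err_out_disk through err_out_snaps
 * fall through one another, undoing the setup steps in reverse
 * order.  Once rbd_bus_add_dev() has succeeded, cleanup instead
 * happens via rbd_bus_del_dev(), whose device_unregister() call
 * ends up invoking rbd_dev_release().
 */
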
/*
 * Probe for the existence of the header object for the given rbd
 * device.  For format 2 images this includes determining the image
 * id.
 */
static int rbd_dev_probe(struct rbd_device *rbd_dev)
{
	int ret;

	/*
	 * Get the id from the image id object.  If it's not a
	 * format 2 image, we'll get ENOENT back, and we'll assume
	 * it's a format 1 image.
	 */
	ret = rbd_dev_image_id(rbd_dev);
	if (ret)
		ret = rbd_dev_v1_probe(rbd_dev);
	else
		ret = rbd_dev_v2_probe(rbd_dev);
	if (ret) {
		dout("probe failed, returning %d\n", ret);

		return ret;
	}

	ret = rbd_dev_probe_finish(rbd_dev);
	if (ret)
		rbd_header_free(&rbd_dev->header);

	return ret;
}

static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto err_out_module;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}
	ceph_opts = NULL;	/* rbd_dev client now owns this */

	/* pick the pool */
	osdc = &rbdc->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
	if (rc < 0)
		goto err_out_client;
	spec->pool_id = (u64) rc;

	/* The ceph file layout needs to fit pool id in 32 bits */

	if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
		rc = -EIO;
		goto err_out_client;
	}

	rbd_dev = rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		goto err_out_client;
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */

	rbd_dev->mapping.read_only = rbd_opts->read_only;
	kfree(rbd_opts);
	rbd_opts = NULL;	/* done with this */

	rc = rbd_dev_probe(rbd_dev);
	if (rc < 0)
		goto err_out_rbd_dev;

	return count;
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	kfree(rbd_opts);
	rbd_spec_put(spec);
err_out_module:
	module_put(THIS_MODULE);

	dout("Error adding device %s\n", buf);

	return (ssize_t) rc;
}

static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
{
	struct list_head *tmp;
	struct rbd_device *rbd_dev;

	spin_lock(&rbd_dev_list_lock);
	list_for_each(tmp, &rbd_dev_list) {
		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id == dev_id) {
			spin_unlock(&rbd_dev_list_lock);
			return rbd_dev;
		}
	}
	spin_unlock(&rbd_dev_list_lock);
	return NULL;
}

static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (rbd_dev->watch_event)
		rbd_dev_header_watch_sync(rbd_dev, 0);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	rbd_dev_id_put(rbd_dev);
	rbd_assert(rbd_dev->rbd_client != NULL);
	rbd_dev_destroy(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}

static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	int target_id, rc;
	unsigned long ul;
	int ret = count;

	rc = strict_strtoul(buf, 10, &ul);
	if (rc)
		return rc;

	/* convert to int; abort if we lost anything in the conversion */
	target_id = (int) ul;
	if (target_id != ul)
		return -EINVAL;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbd_dev = __rbd_get_dev(target_id);
	if (!rbd_dev) {
		ret = -ENOENT;
		goto done;
	}

	spin_lock_irq(&rbd_dev->lock);
	if (rbd_dev->open_count)
		ret = -EBUSY;
	else
		set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
	spin_unlock_irq(&rbd_dev->lock);
	if (ret < 0)
		goto done;

	rbd_remove_all_snaps(rbd_dev);
	rbd_bus_del_dev(rbd_dev);

done:
	mutex_unlock(&ctl_mutex);

	return ret;
}

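/*
 * Example usage (illustrative): writing a device id to the control
 * file tears down the corresponding mapping, e.g. for id 1:
 *
 *	echo 1 > /sys/bus/rbd/remove
 *
 * The write fails with -EBUSY while the device is still held open.
 */
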
/*
 * create control files in sysfs
 * /sys/bus/rbd/...
 */
static int rbd_sysfs_init(void)
{
	int ret;

	ret = device_register(&rbd_root_dev);
	if (ret < 0)
		return ret;

	ret = bus_register(&rbd_bus_type);
	if (ret < 0)
		device_unregister(&rbd_root_dev);

	return ret;
}

static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}

static int __init rbd_init(void)
{
	int rc;

	if (!libceph_compatible(NULL)) {
		rbd_warn(NULL, "libceph incompatibility (quitting)");

		return -EINVAL;
	}
	rc = rbd_sysfs_init();
	if (rc)
		return rc;
	pr_info("loaded " RBD_DRV_NAME_LONG "\n");
	return 0;
}

static void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}

module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");