rbd: encapsulate image object end request handling
[deliverable/linux.git] / drivers / block / rbd.c
CommitLineData
602adf40
YS
1/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
dfc5606d 24 For usage instructions, please refer to:
602adf40 25
dfc5606d 26 Documentation/ABI/testing/sysfs-bus-rbd
602adf40
YS
27
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
59c2be1e 34#include <linux/parser.h>
602adf40
YS
35
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
aafb230e
AE
44#define RBD_DEBUG /* Activate rbd_assert() calls */
45
593a9e7b
AE
46/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
f0f8cef5
AE
55#define RBD_DRV_NAME "rbd"
56#define RBD_DRV_NAME_LONG "rbd (rados block device)"
602adf40
YS
57
58#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
59
d4b125e9
AE
60#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
61#define RBD_MAX_SNAP_NAME_LEN \
62 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
63
35d489f9 64#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
602adf40
YS
65
66#define RBD_SNAP_HEAD_NAME "-"
67
9e15b77d
AE
68/* This allows a single page to hold an image name sent by OSD */
69#define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1)
1e130199 70#define RBD_IMAGE_ID_LEN_MAX 64
9e15b77d 71
1e130199 72#define RBD_OBJ_PREFIX_LEN_MAX 64
589d30e0 73
d889140c
AE
74/* Feature bits */
75
5cbf6f12
AE
76#define RBD_FEATURE_LAYERING (1<<0)
77#define RBD_FEATURE_STRIPINGV2 (1<<1)
78#define RBD_FEATURES_ALL \
79 (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)
d889140c
AE
80
81/* Features supported by this (client software) implementation. */
82
5cbf6f12 83#define RBD_FEATURES_SUPPORTED (0)
d889140c 84
81a89793
AE
85/*
86 * An RBD device name will be "rbd#", where the "rbd" comes from
87 * RBD_DRV_NAME above, and # is a unique integer identifier.
88 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
89 * enough to hold all possible device names.
90 */
602adf40 91#define DEV_NAME_LEN 32
81a89793 92#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
602adf40
YS
93
94/*
95 * block device image metadata (in-memory version)
96 */
97struct rbd_image_header {
f84344f3 98 /* These four fields never change for a given rbd image */
849b4260 99 char *object_prefix;
34b13184 100 u64 features;
602adf40
YS
101 __u8 obj_order;
102 __u8 crypt_type;
103 __u8 comp_type;
602adf40 104
f84344f3
AE
105 /* The remaining fields need to be updated occasionally */
106 u64 image_size;
107 struct ceph_snap_context *snapc;
602adf40
YS
108 char *snap_names;
109 u64 *snap_sizes;
59c2be1e
YS
110
111 u64 obj_version;
112};
113
0d7dbfce
AE
114/*
115 * An rbd image specification.
116 *
117 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
c66c6e0c
AE
118 * identify an image. Each rbd_dev structure includes a pointer to
119 * an rbd_spec structure that encapsulates this identity.
120 *
121 * Each of the id's in an rbd_spec has an associated name. For a
122 * user-mapped image, the names are supplied and the id's associated
123 * with them are looked up. For a layered image, a parent image is
124 * defined by the tuple, and the names are looked up.
125 *
126 * An rbd_dev structure contains a parent_spec pointer which is
127 * non-null if the image it represents is a child in a layered
128 * image. This pointer will refer to the rbd_spec structure used
129 * by the parent rbd_dev for its own identity (i.e., the structure
130 * is shared between the parent and child).
131 *
132 * Since these structures are populated once, during the discovery
133 * phase of image construction, they are effectively immutable so
134 * we make no effort to synchronize access to them.
135 *
136 * Note that code herein does not assume the image name is known (it
137 * could be a null pointer).
0d7dbfce
AE
138 */
139struct rbd_spec {
140 u64 pool_id;
141 char *pool_name;
142
143 char *image_id;
0d7dbfce 144 char *image_name;
0d7dbfce
AE
145
146 u64 snap_id;
147 char *snap_name;
148
149 struct kref kref;
150};
151
602adf40 152/*
f0f8cef5 153 * an instance of the client. multiple devices may share an rbd client.
602adf40
YS
154 */
155struct rbd_client {
156 struct ceph_client *client;
157 struct kref kref;
158 struct list_head node;
159};
160
bf0d5f50
AE
161struct rbd_img_request;
162typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
163
164#define BAD_WHICH U32_MAX /* Good which or bad which, which? */
165
166struct rbd_obj_request;
167typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
168
9969ebc5
AE
169enum obj_request_type {
170 OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
171};
bf0d5f50
AE
172
173struct rbd_obj_request {
174 const char *object_name;
175 u64 offset; /* object start byte */
176 u64 length; /* bytes from offset */
177
178 struct rbd_img_request *img_request;
7da22d29 179 u64 img_offset; /* image relative offset */
bf0d5f50
AE
180 struct list_head links; /* img_request->obj_requests */
181 u32 which; /* posn image request list */
182
183 enum obj_request_type type;
788e2df3
AE
184 union {
185 struct bio *bio_list;
186 struct {
187 struct page **pages;
188 u32 page_count;
189 };
190 };
bf0d5f50
AE
191
192 struct ceph_osd_request *osd_req;
193
194 u64 xferred; /* bytes transferred */
195 u64 version;
1b83bef2 196 int result;
bf0d5f50
AE
197 atomic_t done;
198
199 rbd_obj_callback_t callback;
788e2df3 200 struct completion completion;
bf0d5f50
AE
201
202 struct kref kref;
203};
204
0c425248 205enum img_req_flags {
9849e986
AE
206 IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */
207 IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */
d0b2e944 208 IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */
0c425248
AE
209};
210
bf0d5f50 211struct rbd_img_request {
bf0d5f50
AE
212 struct rbd_device *rbd_dev;
213 u64 offset; /* starting image byte offset */
214 u64 length; /* byte count from offset */
0c425248 215 unsigned long flags;
bf0d5f50 216 union {
9849e986 217 u64 snap_id; /* for reads */
bf0d5f50 218 struct ceph_snap_context *snapc; /* for writes */
9849e986
AE
219 };
220 union {
221 struct request *rq; /* block request */
222 struct rbd_obj_request *obj_request; /* obj req initiator */
bf0d5f50
AE
223 };
224 spinlock_t completion_lock;/* protects next_completion */
225 u32 next_completion;
226 rbd_img_callback_t callback;
55f27e09 227 u64 xferred;/* aggregate bytes transferred */
a5a337d4 228 int result; /* first nonzero obj_request result */
bf0d5f50
AE
229
230 u32 obj_request_count;
231 struct list_head obj_requests; /* rbd_obj_request structs */
232
233 struct kref kref;
234};
235
236#define for_each_obj_request(ireq, oreq) \
ef06f4d3 237 list_for_each_entry(oreq, &(ireq)->obj_requests, links)
bf0d5f50 238#define for_each_obj_request_from(ireq, oreq) \
ef06f4d3 239 list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
bf0d5f50 240#define for_each_obj_request_safe(ireq, oreq, n) \
ef06f4d3 241 list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
bf0d5f50 242
dfc5606d
YS
243struct rbd_snap {
244 struct device dev;
245 const char *name;
3591538f 246 u64 size;
dfc5606d
YS
247 struct list_head node;
248 u64 id;
34b13184 249 u64 features;
dfc5606d
YS
250};
251
f84344f3 252struct rbd_mapping {
99c1f08f 253 u64 size;
34b13184 254 u64 features;
f84344f3
AE
255 bool read_only;
256};
257
602adf40
YS
258/*
259 * a single device
260 */
261struct rbd_device {
de71a297 262 int dev_id; /* blkdev unique id */
602adf40
YS
263
264 int major; /* blkdev assigned major */
265 struct gendisk *disk; /* blkdev's gendisk and rq */
602adf40 266
a30b71b9 267 u32 image_format; /* Either 1 or 2 */
602adf40
YS
268 struct rbd_client *rbd_client;
269
270 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
271
b82d167b 272 spinlock_t lock; /* queue, flags, open_count */
602adf40
YS
273
274 struct rbd_image_header header;
b82d167b 275 unsigned long flags; /* possibly lock protected */
0d7dbfce 276 struct rbd_spec *spec;
602adf40 277
0d7dbfce 278 char *header_name;
971f839a 279
0903e875
AE
280 struct ceph_file_layout layout;
281
59c2be1e 282 struct ceph_osd_event *watch_event;
975241af 283 struct rbd_obj_request *watch_request;
59c2be1e 284
86b00e0d
AE
285 struct rbd_spec *parent_spec;
286 u64 parent_overlap;
287
c666601a
JD
288 /* protects updating the header */
289 struct rw_semaphore header_rwsem;
f84344f3
AE
290
291 struct rbd_mapping mapping;
602adf40
YS
292
293 struct list_head node;
dfc5606d
YS
294
295 /* list of snapshots */
296 struct list_head snaps;
297
298 /* sysfs related */
299 struct device dev;
b82d167b 300 unsigned long open_count; /* protected by lock */
dfc5606d
YS
301};
302
b82d167b
AE
303/*
304 * Flag bits for rbd_dev->flags. If atomicity is required,
305 * rbd_dev->lock is used to protect access.
306 *
307 * Currently, only the "removing" flag (which is coupled with the
308 * "open_count" field) requires atomic access.
309 */
6d292906
AE
310enum rbd_dev_flags {
311 RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */
b82d167b 312 RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */
6d292906
AE
313};
314
602adf40 315static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
e124a82f 316
602adf40 317static LIST_HEAD(rbd_dev_list); /* devices */
e124a82f
AE
318static DEFINE_SPINLOCK(rbd_dev_list_lock);
319
432b8587
AE
320static LIST_HEAD(rbd_client_list); /* clients */
321static DEFINE_SPINLOCK(rbd_client_list_lock);
602adf40 322
304f6808
AE
323static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
324static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
325
dfc5606d 326static void rbd_dev_release(struct device *dev);
41f38c2b 327static void rbd_remove_snap_dev(struct rbd_snap *snap);
dfc5606d 328
f0f8cef5
AE
329static ssize_t rbd_add(struct bus_type *bus, const char *buf,
330 size_t count);
331static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
332 size_t count);
333
334static struct bus_attribute rbd_bus_attrs[] = {
335 __ATTR(add, S_IWUSR, NULL, rbd_add),
336 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
337 __ATTR_NULL
338};
339
340static struct bus_type rbd_bus_type = {
341 .name = "rbd",
342 .bus_attrs = rbd_bus_attrs,
343};
344
345static void rbd_root_dev_release(struct device *dev)
346{
347}
348
349static struct device rbd_root_dev = {
350 .init_name = "rbd",
351 .release = rbd_root_dev_release,
352};
353
06ecc6cb
AE
354static __printf(2, 3)
355void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
356{
357 struct va_format vaf;
358 va_list args;
359
360 va_start(args, fmt);
361 vaf.fmt = fmt;
362 vaf.va = &args;
363
364 if (!rbd_dev)
365 printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
366 else if (rbd_dev->disk)
367 printk(KERN_WARNING "%s: %s: %pV\n",
368 RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
369 else if (rbd_dev->spec && rbd_dev->spec->image_name)
370 printk(KERN_WARNING "%s: image %s: %pV\n",
371 RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
372 else if (rbd_dev->spec && rbd_dev->spec->image_id)
373 printk(KERN_WARNING "%s: id %s: %pV\n",
374 RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
375 else /* punt */
376 printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
377 RBD_DRV_NAME, rbd_dev, &vaf);
378 va_end(args);
379}
380
aafb230e
AE
381#ifdef RBD_DEBUG
382#define rbd_assert(expr) \
383 if (unlikely(!(expr))) { \
384 printk(KERN_ERR "\nAssertion failure in %s() " \
385 "at line %d:\n\n" \
386 "\trbd_assert(%s);\n\n", \
387 __func__, __LINE__, #expr); \
388 BUG(); \
389 }
390#else /* !RBD_DEBUG */
391# define rbd_assert(expr) ((void) 0)
392#endif /* !RBD_DEBUG */
dfc5606d 393
117973fb
AE
394static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
395static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
59c2be1e 396
602adf40
YS
397static int rbd_open(struct block_device *bdev, fmode_t mode)
398{
f0f8cef5 399 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
b82d167b 400 bool removing = false;
602adf40 401
f84344f3 402 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
602adf40
YS
403 return -EROFS;
404
a14ea269 405 spin_lock_irq(&rbd_dev->lock);
b82d167b
AE
406 if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
407 removing = true;
408 else
409 rbd_dev->open_count++;
a14ea269 410 spin_unlock_irq(&rbd_dev->lock);
b82d167b
AE
411 if (removing)
412 return -ENOENT;
413
42382b70 414 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
c3e946ce 415 (void) get_device(&rbd_dev->dev);
f84344f3 416 set_device_ro(bdev, rbd_dev->mapping.read_only);
42382b70 417 mutex_unlock(&ctl_mutex);
340c7a2b 418
602adf40
YS
419 return 0;
420}
421
dfc5606d
YS
422static int rbd_release(struct gendisk *disk, fmode_t mode)
423{
424 struct rbd_device *rbd_dev = disk->private_data;
b82d167b
AE
425 unsigned long open_count_before;
426
a14ea269 427 spin_lock_irq(&rbd_dev->lock);
b82d167b 428 open_count_before = rbd_dev->open_count--;
a14ea269 429 spin_unlock_irq(&rbd_dev->lock);
b82d167b 430 rbd_assert(open_count_before > 0);
dfc5606d 431
42382b70 432 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
c3e946ce 433 put_device(&rbd_dev->dev);
42382b70 434 mutex_unlock(&ctl_mutex);
dfc5606d
YS
435
436 return 0;
437}
438
602adf40
YS
439static const struct block_device_operations rbd_bd_ops = {
440 .owner = THIS_MODULE,
441 .open = rbd_open,
dfc5606d 442 .release = rbd_release,
602adf40
YS
443};
444
445/*
446 * Initialize an rbd client instance.
43ae4701 447 * We own *ceph_opts.
602adf40 448 */
f8c38929 449static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
602adf40
YS
450{
451 struct rbd_client *rbdc;
452 int ret = -ENOMEM;
453
37206ee5 454 dout("%s:\n", __func__);
602adf40
YS
455 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
456 if (!rbdc)
457 goto out_opt;
458
459 kref_init(&rbdc->kref);
460 INIT_LIST_HEAD(&rbdc->node);
461
bc534d86
AE
462 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
463
43ae4701 464 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
602adf40 465 if (IS_ERR(rbdc->client))
bc534d86 466 goto out_mutex;
43ae4701 467 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
602adf40
YS
468
469 ret = ceph_open_session(rbdc->client);
470 if (ret < 0)
471 goto out_err;
472
432b8587 473 spin_lock(&rbd_client_list_lock);
602adf40 474 list_add_tail(&rbdc->node, &rbd_client_list);
432b8587 475 spin_unlock(&rbd_client_list_lock);
602adf40 476
bc534d86 477 mutex_unlock(&ctl_mutex);
37206ee5 478 dout("%s: rbdc %p\n", __func__, rbdc);
bc534d86 479
602adf40
YS
480 return rbdc;
481
482out_err:
483 ceph_destroy_client(rbdc->client);
bc534d86
AE
484out_mutex:
485 mutex_unlock(&ctl_mutex);
602adf40
YS
486 kfree(rbdc);
487out_opt:
43ae4701
AE
488 if (ceph_opts)
489 ceph_destroy_options(ceph_opts);
37206ee5
AE
490 dout("%s: error %d\n", __func__, ret);
491
28f259b7 492 return ERR_PTR(ret);
602adf40
YS
493}
494
495/*
1f7ba331
AE
496 * Find a ceph client with specific addr and configuration. If
497 * found, bump its reference count.
602adf40 498 */
1f7ba331 499static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
602adf40
YS
500{
501 struct rbd_client *client_node;
1f7ba331 502 bool found = false;
602adf40 503
43ae4701 504 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
602adf40
YS
505 return NULL;
506
1f7ba331
AE
507 spin_lock(&rbd_client_list_lock);
508 list_for_each_entry(client_node, &rbd_client_list, node) {
509 if (!ceph_compare_options(ceph_opts, client_node->client)) {
510 kref_get(&client_node->kref);
511 found = true;
512 break;
513 }
514 }
515 spin_unlock(&rbd_client_list_lock);
516
517 return found ? client_node : NULL;
602adf40
YS
518}
519
59c2be1e
YS
520/*
521 * mount options
522 */
523enum {
59c2be1e
YS
524 Opt_last_int,
525 /* int args above */
526 Opt_last_string,
527 /* string args above */
cc0538b6
AE
528 Opt_read_only,
529 Opt_read_write,
530 /* Boolean args above */
531 Opt_last_bool,
59c2be1e
YS
532};
533
43ae4701 534static match_table_t rbd_opts_tokens = {
59c2be1e
YS
535 /* int args above */
536 /* string args above */
be466c1c 537 {Opt_read_only, "read_only"},
cc0538b6
AE
538 {Opt_read_only, "ro"}, /* Alternate spelling */
539 {Opt_read_write, "read_write"},
540 {Opt_read_write, "rw"}, /* Alternate spelling */
541 /* Boolean args above */
59c2be1e
YS
542 {-1, NULL}
543};
544
98571b5a
AE
545struct rbd_options {
546 bool read_only;
547};
548
549#define RBD_READ_ONLY_DEFAULT false
550
59c2be1e
YS
551static int parse_rbd_opts_token(char *c, void *private)
552{
43ae4701 553 struct rbd_options *rbd_opts = private;
59c2be1e
YS
554 substring_t argstr[MAX_OPT_ARGS];
555 int token, intval, ret;
556
43ae4701 557 token = match_token(c, rbd_opts_tokens, argstr);
59c2be1e
YS
558 if (token < 0)
559 return -EINVAL;
560
561 if (token < Opt_last_int) {
562 ret = match_int(&argstr[0], &intval);
563 if (ret < 0) {
564 pr_err("bad mount option arg (not int) "
565 "at '%s'\n", c);
566 return ret;
567 }
568 dout("got int token %d val %d\n", token, intval);
569 } else if (token > Opt_last_int && token < Opt_last_string) {
570 dout("got string token %d val %s\n", token,
571 argstr[0].from);
cc0538b6
AE
572 } else if (token > Opt_last_string && token < Opt_last_bool) {
573 dout("got Boolean token %d\n", token);
59c2be1e
YS
574 } else {
575 dout("got token %d\n", token);
576 }
577
578 switch (token) {
cc0538b6
AE
579 case Opt_read_only:
580 rbd_opts->read_only = true;
581 break;
582 case Opt_read_write:
583 rbd_opts->read_only = false;
584 break;
59c2be1e 585 default:
aafb230e
AE
586 rbd_assert(false);
587 break;
59c2be1e
YS
588 }
589 return 0;
590}
591
602adf40
YS
592/*
593 * Get a ceph client with specific addr and configuration, if one does
594 * not exist create it.
595 */
9d3997fd 596static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
602adf40 597{
f8c38929 598 struct rbd_client *rbdc;
59c2be1e 599
1f7ba331 600 rbdc = rbd_client_find(ceph_opts);
9d3997fd 601 if (rbdc) /* using an existing client */
43ae4701 602 ceph_destroy_options(ceph_opts);
9d3997fd 603 else
f8c38929 604 rbdc = rbd_client_create(ceph_opts);
602adf40 605
9d3997fd 606 return rbdc;
602adf40
YS
607}
608
609/*
610 * Destroy ceph client
d23a4b3f 611 *
432b8587 612 * The rbd_client_list_lock is taken here; the caller must not hold it.
602adf40
YS
613 */
614static void rbd_client_release(struct kref *kref)
615{
616 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
617
37206ee5 618 dout("%s: rbdc %p\n", __func__, rbdc);
cd9d9f5d 619 spin_lock(&rbd_client_list_lock);
602adf40 620 list_del(&rbdc->node);
cd9d9f5d 621 spin_unlock(&rbd_client_list_lock);
602adf40
YS
622
623 ceph_destroy_client(rbdc->client);
624 kfree(rbdc);
625}
626
627/*
628 * Drop reference to ceph client node. If it's not referenced anymore, release
629 * it.
630 */
9d3997fd 631static void rbd_put_client(struct rbd_client *rbdc)
602adf40 632{
c53d5893
AE
633 if (rbdc)
634 kref_put(&rbdc->kref, rbd_client_release);
602adf40
YS
635}
636
a30b71b9
AE
637static bool rbd_image_format_valid(u32 image_format)
638{
639 return image_format == 1 || image_format == 2;
640}
641
8e94af8e
AE
642static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
643{
103a150f
AE
644 size_t size;
645 u32 snap_count;
646
647 /* The header has to start with the magic rbd header text */
648 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
649 return false;
650
db2388b6
AE
651 /* The bio layer requires at least sector-sized I/O */
652
653 if (ondisk->options.order < SECTOR_SHIFT)
654 return false;
655
656 /* If we use u64 in a few spots we may be able to loosen this */
657
658 if (ondisk->options.order > 8 * sizeof (int) - 1)
659 return false;
660
103a150f
AE
661 /*
662 * The size of a snapshot header has to fit in a size_t, and
663 * that limits the number of snapshots.
664 */
665 snap_count = le32_to_cpu(ondisk->snap_count);
666 size = SIZE_MAX - sizeof (struct ceph_snap_context);
667 if (snap_count > size / sizeof (__le64))
668 return false;
669
670 /*
671 * Not only that, but the size of the entire the snapshot
672 * header must also be representable in a size_t.
673 */
674 size -= snap_count * sizeof (__le64);
675 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
676 return false;
677
678 return true;
8e94af8e
AE
679}
680
602adf40
YS
681/*
682 * Create a new header structure, translate header format from the on-disk
683 * header.
684 */
685static int rbd_header_from_disk(struct rbd_image_header *header,
4156d998 686 struct rbd_image_header_ondisk *ondisk)
602adf40 687{
ccece235 688 u32 snap_count;
58c17b0e 689 size_t len;
d2bb24e5 690 size_t size;
621901d6 691 u32 i;
602adf40 692
6a52325f
AE
693 memset(header, 0, sizeof (*header));
694
103a150f
AE
695 snap_count = le32_to_cpu(ondisk->snap_count);
696
58c17b0e
AE
697 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
698 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
6a52325f 699 if (!header->object_prefix)
602adf40 700 return -ENOMEM;
58c17b0e
AE
701 memcpy(header->object_prefix, ondisk->object_prefix, len);
702 header->object_prefix[len] = '\0';
00f1f36f 703
602adf40 704 if (snap_count) {
f785cc1d
AE
705 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
706
621901d6
AE
707 /* Save a copy of the snapshot names */
708
f785cc1d
AE
709 if (snap_names_len > (u64) SIZE_MAX)
710 return -EIO;
711 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
602adf40 712 if (!header->snap_names)
6a52325f 713 goto out_err;
f785cc1d
AE
714 /*
715 * Note that rbd_dev_v1_header_read() guarantees
716 * the ondisk buffer we're working with has
717 * snap_names_len bytes beyond the end of the
718 * snapshot id array, this memcpy() is safe.
719 */
720 memcpy(header->snap_names, &ondisk->snaps[snap_count],
721 snap_names_len);
6a52325f 722
621901d6
AE
723 /* Record each snapshot's size */
724
d2bb24e5
AE
725 size = snap_count * sizeof (*header->snap_sizes);
726 header->snap_sizes = kmalloc(size, GFP_KERNEL);
602adf40 727 if (!header->snap_sizes)
6a52325f 728 goto out_err;
621901d6
AE
729 for (i = 0; i < snap_count; i++)
730 header->snap_sizes[i] =
731 le64_to_cpu(ondisk->snaps[i].image_size);
602adf40 732 } else {
ccece235 733 WARN_ON(ondisk->snap_names_len);
602adf40
YS
734 header->snap_names = NULL;
735 header->snap_sizes = NULL;
736 }
849b4260 737
34b13184 738 header->features = 0; /* No features support in v1 images */
602adf40
YS
739 header->obj_order = ondisk->options.order;
740 header->crypt_type = ondisk->options.crypt_type;
741 header->comp_type = ondisk->options.comp_type;
6a52325f 742
621901d6
AE
743 /* Allocate and fill in the snapshot context */
744
f84344f3 745 header->image_size = le64_to_cpu(ondisk->image_size);
6a52325f
AE
746 size = sizeof (struct ceph_snap_context);
747 size += snap_count * sizeof (header->snapc->snaps[0]);
748 header->snapc = kzalloc(size, GFP_KERNEL);
749 if (!header->snapc)
750 goto out_err;
602adf40
YS
751
752 atomic_set(&header->snapc->nref, 1);
505cbb9b 753 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
602adf40 754 header->snapc->num_snaps = snap_count;
621901d6
AE
755 for (i = 0; i < snap_count; i++)
756 header->snapc->snaps[i] =
757 le64_to_cpu(ondisk->snaps[i].id);
602adf40
YS
758
759 return 0;
760
6a52325f 761out_err:
849b4260 762 kfree(header->snap_sizes);
ccece235 763 header->snap_sizes = NULL;
602adf40 764 kfree(header->snap_names);
ccece235 765 header->snap_names = NULL;
6a52325f
AE
766 kfree(header->object_prefix);
767 header->object_prefix = NULL;
ccece235 768
00f1f36f 769 return -ENOMEM;
602adf40
YS
770}
771
9e15b77d
AE
772static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
773{
774 struct rbd_snap *snap;
775
776 if (snap_id == CEPH_NOSNAP)
777 return RBD_SNAP_HEAD_NAME;
778
779 list_for_each_entry(snap, &rbd_dev->snaps, node)
780 if (snap_id == snap->id)
781 return snap->name;
782
783 return NULL;
784}
785
8836b995 786static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
602adf40 787{
602adf40 788
e86924a8 789 struct rbd_snap *snap;
602adf40 790
e86924a8
AE
791 list_for_each_entry(snap, &rbd_dev->snaps, node) {
792 if (!strcmp(snap_name, snap->name)) {
0d7dbfce 793 rbd_dev->spec->snap_id = snap->id;
e86924a8 794 rbd_dev->mapping.size = snap->size;
34b13184 795 rbd_dev->mapping.features = snap->features;
602adf40 796
e86924a8 797 return 0;
00f1f36f 798 }
00f1f36f 799 }
e86924a8 800
00f1f36f 801 return -ENOENT;
602adf40
YS
802}
803
819d52bf 804static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
602adf40 805{
78dc447d 806 int ret;
602adf40 807
0d7dbfce 808 if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
cc9d734c 809 sizeof (RBD_SNAP_HEAD_NAME))) {
0d7dbfce 810 rbd_dev->spec->snap_id = CEPH_NOSNAP;
99c1f08f 811 rbd_dev->mapping.size = rbd_dev->header.image_size;
34b13184 812 rbd_dev->mapping.features = rbd_dev->header.features;
e86924a8 813 ret = 0;
602adf40 814 } else {
0d7dbfce 815 ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
602adf40
YS
816 if (ret < 0)
817 goto done;
f84344f3 818 rbd_dev->mapping.read_only = true;
602adf40 819 }
6d292906
AE
820 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
821
602adf40 822done:
602adf40
YS
823 return ret;
824}
825
826static void rbd_header_free(struct rbd_image_header *header)
827{
849b4260 828 kfree(header->object_prefix);
d78fd7ae 829 header->object_prefix = NULL;
602adf40 830 kfree(header->snap_sizes);
d78fd7ae 831 header->snap_sizes = NULL;
849b4260 832 kfree(header->snap_names);
d78fd7ae 833 header->snap_names = NULL;
d1d25646 834 ceph_put_snap_context(header->snapc);
d78fd7ae 835 header->snapc = NULL;
602adf40
YS
836}
837
98571b5a 838static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
602adf40 839{
65ccfe21
AE
840 char *name;
841 u64 segment;
842 int ret;
602adf40 843
2fd82b9e 844 name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
65ccfe21
AE
845 if (!name)
846 return NULL;
847 segment = offset >> rbd_dev->header.obj_order;
2fd82b9e 848 ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
65ccfe21 849 rbd_dev->header.object_prefix, segment);
2fd82b9e 850 if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
65ccfe21
AE
851 pr_err("error formatting segment name for #%llu (%d)\n",
852 segment, ret);
853 kfree(name);
854 name = NULL;
855 }
602adf40 856
65ccfe21
AE
857 return name;
858}
602adf40 859
65ccfe21
AE
860static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
861{
862 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
602adf40 863
65ccfe21
AE
864 return offset & (segment_size - 1);
865}
866
867static u64 rbd_segment_length(struct rbd_device *rbd_dev,
868 u64 offset, u64 length)
869{
870 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
871
872 offset &= segment_size - 1;
873
aafb230e 874 rbd_assert(length <= U64_MAX - offset);
65ccfe21
AE
875 if (offset + length > segment_size)
876 length = segment_size - offset;
877
878 return length;
602adf40
YS
879}
880
029bcbd8
JD
881/*
882 * returns the size of an object in the image
883 */
884static u64 rbd_obj_bytes(struct rbd_image_header *header)
885{
886 return 1 << header->obj_order;
887}
888
602adf40
YS
889/*
890 * bio helpers
891 */
892
893static void bio_chain_put(struct bio *chain)
894{
895 struct bio *tmp;
896
897 while (chain) {
898 tmp = chain;
899 chain = chain->bi_next;
900 bio_put(tmp);
901 }
902}
903
904/*
905 * zeros a bio chain, starting at specific offset
906 */
907static void zero_bio_chain(struct bio *chain, int start_ofs)
908{
909 struct bio_vec *bv;
910 unsigned long flags;
911 void *buf;
912 int i;
913 int pos = 0;
914
915 while (chain) {
916 bio_for_each_segment(bv, chain, i) {
917 if (pos + bv->bv_len > start_ofs) {
918 int remainder = max(start_ofs - pos, 0);
919 buf = bvec_kmap_irq(bv, &flags);
920 memset(buf + remainder, 0,
921 bv->bv_len - remainder);
85b5aaa6 922 bvec_kunmap_irq(buf, &flags);
602adf40
YS
923 }
924 pos += bv->bv_len;
925 }
926
927 chain = chain->bi_next;
928 }
929}
930
931/*
f7760dad
AE
932 * Clone a portion of a bio, starting at the given byte offset
933 * and continuing for the number of bytes indicated.
602adf40 934 */
f7760dad
AE
935static struct bio *bio_clone_range(struct bio *bio_src,
936 unsigned int offset,
937 unsigned int len,
938 gfp_t gfpmask)
602adf40 939{
f7760dad
AE
940 struct bio_vec *bv;
941 unsigned int resid;
942 unsigned short idx;
943 unsigned int voff;
944 unsigned short end_idx;
945 unsigned short vcnt;
946 struct bio *bio;
947
948 /* Handle the easy case for the caller */
949
950 if (!offset && len == bio_src->bi_size)
951 return bio_clone(bio_src, gfpmask);
952
953 if (WARN_ON_ONCE(!len))
954 return NULL;
955 if (WARN_ON_ONCE(len > bio_src->bi_size))
956 return NULL;
957 if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
958 return NULL;
959
960 /* Find first affected segment... */
961
962 resid = offset;
963 __bio_for_each_segment(bv, bio_src, idx, 0) {
964 if (resid < bv->bv_len)
965 break;
966 resid -= bv->bv_len;
602adf40 967 }
f7760dad 968 voff = resid;
602adf40 969
f7760dad 970 /* ...and the last affected segment */
602adf40 971
f7760dad
AE
972 resid += len;
973 __bio_for_each_segment(bv, bio_src, end_idx, idx) {
974 if (resid <= bv->bv_len)
975 break;
976 resid -= bv->bv_len;
977 }
978 vcnt = end_idx - idx + 1;
979
980 /* Build the clone */
981
982 bio = bio_alloc(gfpmask, (unsigned int) vcnt);
983 if (!bio)
984 return NULL; /* ENOMEM */
602adf40 985
f7760dad
AE
986 bio->bi_bdev = bio_src->bi_bdev;
987 bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
988 bio->bi_rw = bio_src->bi_rw;
989 bio->bi_flags |= 1 << BIO_CLONED;
990
991 /*
992 * Copy over our part of the bio_vec, then update the first
993 * and last (or only) entries.
994 */
995 memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
996 vcnt * sizeof (struct bio_vec));
997 bio->bi_io_vec[0].bv_offset += voff;
998 if (vcnt > 1) {
999 bio->bi_io_vec[0].bv_len -= voff;
1000 bio->bi_io_vec[vcnt - 1].bv_len = resid;
1001 } else {
1002 bio->bi_io_vec[0].bv_len = len;
602adf40
YS
1003 }
1004
f7760dad
AE
1005 bio->bi_vcnt = vcnt;
1006 bio->bi_size = len;
1007 bio->bi_idx = 0;
1008
1009 return bio;
1010}
1011
1012/*
1013 * Clone a portion of a bio chain, starting at the given byte offset
1014 * into the first bio in the source chain and continuing for the
1015 * number of bytes indicated. The result is another bio chain of
1016 * exactly the given length, or a null pointer on error.
1017 *
1018 * The bio_src and offset parameters are both in-out. On entry they
1019 * refer to the first source bio and the offset into that bio where
1020 * the start of data to be cloned is located.
1021 *
1022 * On return, bio_src is updated to refer to the bio in the source
1023 * chain that contains first un-cloned byte, and *offset will
1024 * contain the offset of that byte within that bio.
1025 */
1026static struct bio *bio_chain_clone_range(struct bio **bio_src,
1027 unsigned int *offset,
1028 unsigned int len,
1029 gfp_t gfpmask)
1030{
1031 struct bio *bi = *bio_src;
1032 unsigned int off = *offset;
1033 struct bio *chain = NULL;
1034 struct bio **end;
1035
1036 /* Build up a chain of clone bios up to the limit */
1037
1038 if (!bi || off >= bi->bi_size || !len)
1039 return NULL; /* Nothing to clone */
602adf40 1040
f7760dad
AE
1041 end = &chain;
1042 while (len) {
1043 unsigned int bi_size;
1044 struct bio *bio;
1045
f5400b7a
AE
1046 if (!bi) {
1047 rbd_warn(NULL, "bio_chain exhausted with %u left", len);
f7760dad 1048 goto out_err; /* EINVAL; ran out of bio's */
f5400b7a 1049 }
f7760dad
AE
1050 bi_size = min_t(unsigned int, bi->bi_size - off, len);
1051 bio = bio_clone_range(bi, off, bi_size, gfpmask);
1052 if (!bio)
1053 goto out_err; /* ENOMEM */
1054
1055 *end = bio;
1056 end = &bio->bi_next;
602adf40 1057
f7760dad
AE
1058 off += bi_size;
1059 if (off == bi->bi_size) {
1060 bi = bi->bi_next;
1061 off = 0;
1062 }
1063 len -= bi_size;
1064 }
1065 *bio_src = bi;
1066 *offset = off;
1067
1068 return chain;
1069out_err:
1070 bio_chain_put(chain);
602adf40 1071
602adf40
YS
1072 return NULL;
1073}
1074
bf0d5f50
AE
/* Take a reference on an object request */
static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
		atomic_read(&obj_request->kref.refcount));
	kref_get(&obj_request->kref);
}
1081
static void rbd_obj_request_destroy(struct kref *kref);
/* Drop a reference on an object request; the last put destroys it */
static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request != NULL);
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
		atomic_read(&obj_request->kref.refcount));
	kref_put(&obj_request->kref, rbd_obj_request_destroy);
}
1090
/* Take a reference on an image request */
static void rbd_img_request_get(struct rbd_img_request *img_request)
{
	dout("%s: img %p (was %d)\n", __func__, img_request,
		atomic_read(&img_request->kref.refcount));
	kref_get(&img_request->kref);
}
1097
static void rbd_img_request_destroy(struct kref *kref);
/* Drop a reference on an image request; the last put destroys it */
static void rbd_img_request_put(struct rbd_img_request *img_request)
{
	rbd_assert(img_request != NULL);
	dout("%s: img %p (was %d)\n", __func__, img_request,
		atomic_read(&img_request->kref.refcount));
	kref_put(&img_request->kref, rbd_img_request_destroy);
}
1106
/*
 * Link an object request onto the tail of its image request's list,
 * assigning it the next sequential "which" slot.  The image request
 * holds a reference on the object request while it is on the list.
 */
static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->img_request == NULL);

	rbd_obj_request_get(obj_request);
	obj_request->img_request = img_request;
	obj_request->which = img_request->obj_request_count;
	rbd_assert(obj_request->which != BAD_WHICH);
	img_request->obj_request_count++;
	list_add_tail(&obj_request->links, &img_request->obj_requests);
	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
		obj_request->which);
}
1121
/*
 * Unlink an object request from its image request and drop the
 * reference taken at add time.  NOTE(review): the assertion that
 * "which" equals the decremented count implies requests are removed
 * in last-added-first order -- confirm against callers.
 */
static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->which != BAD_WHICH);

	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
		obj_request->which);
	list_del(&obj_request->links);
	rbd_assert(img_request->obj_request_count > 0);
	img_request->obj_request_count--;
	rbd_assert(obj_request->which == img_request->obj_request_count);
	obj_request->which = BAD_WHICH;
	rbd_assert(obj_request->img_request == img_request);
	obj_request->img_request = NULL;
	obj_request->callback = NULL;
	rbd_obj_request_put(obj_request);
}
1139
1140static bool obj_request_type_valid(enum obj_request_type type)
1141{
1142 switch (type) {
9969ebc5 1143 case OBJ_REQUEST_NODATA:
bf0d5f50 1144 case OBJ_REQUEST_BIO:
788e2df3 1145 case OBJ_REQUEST_PAGES:
bf0d5f50
AE
1146 return true;
1147 default:
1148 return false;
1149 }
1150}
1151
bf0d5f50
AE
/* Hand an object request's osd request to the osd client (asynchronous) */
static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
				struct rbd_obj_request *obj_request)
{
	dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);

	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
}
1159
/*
 * Finish an image request: on success, sum the per-object transfer
 * counts into img_request->xferred, then either invoke the caller's
 * completion callback or drop the request's reference.
 */
static void rbd_img_request_complete(struct rbd_img_request *img_request)
{

	dout("%s: img %p\n", __func__, img_request);

	/*
	 * If no error occurred, compute the aggregate transfer
	 * count for the image request.  We could instead use
	 * atomic64_cmpxchg() to update it as each object request
	 * completes; not clear which way is better off hand.
	 */
	if (!img_request->result) {
		struct rbd_obj_request *obj_request;
		u64 xferred = 0;

		for_each_obj_request(img_request, obj_request)
			xferred += obj_request->xferred;
		img_request->xferred = xferred;
	}

	if (img_request->callback)
		img_request->callback(img_request);
	else
		rbd_img_request_put(img_request);
}
1185
788e2df3
AE
/* Caller is responsible for rbd_obj_request_destroy(obj_request) */

/* Block (interruptibly) until the object request is marked complete */
static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	return wait_for_completion_interruptible(&obj_request->completion);
}
1194
07741308
AE
/* Reset the "done" flag; the write barrier pairs with the read
 * barrier in obj_request_done_test() */
static void obj_request_done_init(struct rbd_obj_request *obj_request)
{
	atomic_set(&obj_request->done, 0);
	smp_wmb();
}
1200
/*
 * Mark an object request done.  The atomic increment also detects
 * (and warns about) a request being completed more than once.
 */
static void obj_request_done_set(struct rbd_obj_request *obj_request)
{
	int done;

	done = atomic_inc_return(&obj_request->done);
	if (done > 1) {
		struct rbd_img_request *img_request = obj_request->img_request;
		struct rbd_device *rbd_dev;

		/* img_request (and hence rbd_dev) may be absent */
		rbd_dev = img_request ? img_request->rbd_dev : NULL;
		rbd_warn(rbd_dev, "obj_request %p was already done\n",
			obj_request);
	}
}
1215
/* Test the "done" flag; barrier pairs with obj_request_done_init() */
static bool obj_request_done_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return atomic_read(&obj_request->done) != 0;
}
1221
0c425248
AE
/*
 * The default/initial value for all image request flags is 0.  Each
 * is conditionally set to 1 at image request initialization time
 * and currently never change thereafter.
 */
static void img_request_write_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_WRITE, &img_request->flags);
	smp_mb();
}

static bool img_request_write_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
}
1238
9849e986
AE
/* Set/test the "child request" flag (see flag comment above) */
static void img_request_child_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_CHILD, &img_request->flags);
	smp_mb();
}

static bool img_request_child_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
}
1250
d0b2e944
AE
/* Set/test the "layered" flag (device has a parent image) */
static void img_request_layered_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_LAYERED, &img_request->flags);
	smp_mb();
}

static bool img_request_layered_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
}
1262
6e2a4505
AE
1263static void
1264rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
1265{
1266 dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1267 obj_request, obj_request->img_request, obj_request->result,
1268 obj_request->xferred, obj_request->length);
1269 /*
1270 * ENOENT means a hole in the image. We zero-fill the
1271 * entire length of the request. A short read also implies
1272 * zero-fill to the end of the request. Either way we
1273 * update the xferred count to indicate the whole request
1274 * was satisfied.
1275 */
1276 BUG_ON(obj_request->type != OBJ_REQUEST_BIO);
1277 if (obj_request->result == -ENOENT) {
1278 zero_bio_chain(obj_request->bio_list, 0);
1279 obj_request->result = 0;
1280 obj_request->xferred = obj_request->length;
1281 } else if (obj_request->xferred < obj_request->length &&
1282 !obj_request->result) {
1283 zero_bio_chain(obj_request->bio_list, obj_request->xferred);
1284 obj_request->xferred = obj_request->length;
1285 }
1286 obj_request_done_set(obj_request);
1287}
1288
bf0d5f50
AE
/*
 * Deliver completion for an object request: invoke its callback if
 * one was set, otherwise wake any rbd_obj_request_wait() sleeper.
 */
static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p cb %p\n", __func__, obj_request,
		obj_request->callback);
	if (obj_request->callback)
		obj_request->callback(obj_request);
	else
		complete_all(&obj_request->completion);
}
1298
/* Ops with no op-specific completion work just get marked done */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);
	obj_request_done_set(obj_request);
}
1304
/*
 * Completion for an osd READ op.  Image-backed requests need hole /
 * short-read zero-fill handling; standalone reads are simply done.
 */
static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request,
		obj_request->result, obj_request->xferred, obj_request->length);
	if (obj_request->img_request)
		rbd_img_obj_request_read_callback(obj_request);
	else
		obj_request_done_set(obj_request);
}
1314
/* Completion for an osd WRITE op */
static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
		obj_request->result, obj_request->length);
	/*
	 * There is no such thing as a successful short write.
	 * Our xferred value is the number of bytes transferred
	 * back.  Set it to our originally-requested length.
	 */
	obj_request->xferred = obj_request->length;
	obj_request_done_set(obj_request);
}
1327
fbfab539
AE
/*
 * For a simple stat call there's nothing to do.  We'll do more if
 * this is part of a write sequence for a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);
	obj_request_done_set(obj_request);
}
1337
bf0d5f50
AE
/*
 * Central completion callback for osd requests issued by this
 * driver.  Records the result and transfer count, dispatches to the
 * per-opcode handler, and completes the object request if that
 * handler marked it done (handlers may defer completion).
 */
static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
				struct ceph_msg *msg)
{
	struct rbd_obj_request *obj_request = osd_req->r_priv;
	u16 opcode;

	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
	rbd_assert(osd_req == obj_request->osd_req);
	/* A request belongs to an image iff it has a "which" slot */
	rbd_assert(!!obj_request->img_request ^
				(obj_request->which == BAD_WHICH));

	if (osd_req->r_result < 0)
		obj_request->result = osd_req->r_result;
	obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);

	WARN_ON(osd_req->r_num_ops != 1);	/* For now */

	/*
	 * We support a 64-bit length, but ultimately it has to be
	 * passed to blk_end_request(), which takes an unsigned int.
	 */
	obj_request->xferred = osd_req->r_reply_op_len[0];
	rbd_assert(obj_request->xferred < (u64) UINT_MAX);
	opcode = osd_req->r_ops[0].op;
	switch (opcode) {
	case CEPH_OSD_OP_READ:
		rbd_osd_read_callback(obj_request);
		break;
	case CEPH_OSD_OP_WRITE:
		rbd_osd_write_callback(obj_request);
		break;
	case CEPH_OSD_OP_STAT:
		rbd_osd_stat_callback(obj_request);
		break;
	case CEPH_OSD_OP_CALL:
	case CEPH_OSD_OP_NOTIFY_ACK:
	case CEPH_OSD_OP_WATCH:
		rbd_osd_trivial_callback(obj_request);
		break;
	default:
		rbd_warn(NULL, "%s: unsupported op %hu\n",
			obj_request->object_name, (unsigned short) opcode);
		break;
	}

	/* Only complete now if the handler marked the request done */
	if (obj_request_done_test(obj_request))
		rbd_obj_request_complete(obj_request);
}
1386
/*
 * Finalize an osd request before submission: writes get a current
 * mtime and (for image requests) the snapshot context; reads of an
 * image request target that request's snapshot id.
 */
static void rbd_osd_req_format(struct rbd_obj_request *obj_request,
			bool write_request)
{
	struct rbd_img_request *img_request = obj_request->img_request;
	struct ceph_osd_request *osd_req = obj_request->osd_req;
	struct ceph_snap_context *snapc = NULL;
	u64 snap_id = CEPH_NOSNAP;
	struct timespec *mtime = NULL;
	struct timespec now;

	rbd_assert(osd_req != NULL);

	if (write_request) {
		now = CURRENT_TIME;
		mtime = &now;
		if (img_request)
			snapc = img_request->snapc;
	} else if (img_request) {
		snap_id = img_request->snap_id;
	}
	ceph_osdc_build_request(osd_req, obj_request->offset,
			snapc, snap_id, mtime);
}
1410
bf0d5f50
AE
/*
 * Allocate and initialize a single-op osd request for the given
 * object request.  Returns NULL on allocation failure.  The caller
 * is responsible for formatting (rbd_osd_req_format) and submitting.
 */
static struct ceph_osd_request *rbd_osd_req_create(
					struct rbd_device *rbd_dev,
					bool write_request,
					struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request = obj_request->img_request;
	struct ceph_snap_context *snapc = NULL;
	struct ceph_osd_client *osdc;
	struct ceph_osd_request *osd_req;

	if (img_request) {
		/* Direction must agree with the image request's flag */
		rbd_assert(write_request ==
				img_request_write_test(img_request));
		if (write_request)
			snapc = img_request->snapc;
	}

	/* Allocate and initialize the request, for the single op */

	osdc = &rbd_dev->rbd_client->client->osdc;
	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
	if (!osd_req)
		return NULL;	/* ENOMEM */

	if (write_request)
		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
	else
		osd_req->r_flags = CEPH_OSD_FLAG_READ;

	osd_req->r_callback = rbd_osd_req_callback;
	osd_req->r_priv = obj_request;

	osd_req->r_oid_len = strlen(obj_request->object_name);
	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);

	osd_req->r_file_layout = rbd_dev->layout;	/* struct */

	return osd_req;
}
1451
/* Release the reference rbd_osd_req_create() obtained */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1456
/* object_name is assumed to be a non-null pointer and NUL-terminated */

/*
 * Allocate an object request covering [offset, offset+length) of the
 * named object.  The object name is copied into the same allocation
 * (just past the struct), so the caller may free its copy.  Returns
 * NULL on allocation failure; caller owns the initial reference.
 */
static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
						u64 offset, u64 length,
						enum obj_request_type type)
{
	struct rbd_obj_request *obj_request;
	size_t size;
	char *name;

	rbd_assert(obj_request_type_valid(type));

	size = strlen(object_name) + 1;
	obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
	if (!obj_request)
		return NULL;

	/* Name storage lives immediately after the request struct */
	name = (char *)(obj_request + 1);
	obj_request->object_name = memcpy(name, object_name, size);
	obj_request->offset = offset;
	obj_request->length = length;
	obj_request->which = BAD_WHICH;
	obj_request->type = type;
	INIT_LIST_HEAD(&obj_request->links);
	obj_request_done_init(obj_request);
	init_completion(&obj_request->completion);
	kref_init(&obj_request->kref);

	dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
		offset, length, (int)type, obj_request);

	return obj_request;
}
1490
/*
 * kref release function: tear down an object request, its osd
 * request, and any attached bio chain or page vector.
 */
static void rbd_obj_request_destroy(struct kref *kref)
{
	struct rbd_obj_request *obj_request;

	obj_request = container_of(kref, struct rbd_obj_request, kref);

	dout("%s: obj %p\n", __func__, obj_request);

	/* Must already be off its image request's list */
	rbd_assert(obj_request->img_request == NULL);
	rbd_assert(obj_request->which == BAD_WHICH);

	if (obj_request->osd_req)
		rbd_osd_req_destroy(obj_request->osd_req);

	rbd_assert(obj_request_type_valid(obj_request->type));
	switch (obj_request->type) {
	case OBJ_REQUEST_NODATA:
		break;		/* Nothing to do */
	case OBJ_REQUEST_BIO:
		if (obj_request->bio_list)
			bio_chain_put(obj_request->bio_list);
		break;
	case OBJ_REQUEST_PAGES:
		if (obj_request->pages)
			ceph_release_page_vector(obj_request->pages,
						obj_request->page_count);
		break;
	}

	kfree(obj_request);
}
1522
/*
 * Caller is responsible for filling in the list of object requests
 * that comprises the image request, and the Linux request pointer
 * (if there is one).
 */
static struct rbd_img_request *rbd_img_request_create(
					struct rbd_device *rbd_dev,
					u64 offset, u64 length,
					bool write_request,
					bool child_request)
{
	struct rbd_img_request *img_request;
	struct ceph_snap_context *snapc = NULL;

	img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
	if (!img_request)
		return NULL;

	if (write_request) {
		/* Snapshot context must be captured under the header lock */
		down_read(&rbd_dev->header_rwsem);
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);
		up_read(&rbd_dev->header_rwsem);
		if (WARN_ON(!snapc)) {
			kfree(img_request);
			return NULL;	/* Shouldn't happen */
		}

	}

	img_request->rq = NULL;
	img_request->rbd_dev = rbd_dev;
	img_request->offset = offset;
	img_request->length = length;
	img_request->flags = 0;
	if (write_request) {
		img_request_write_set(img_request);
		img_request->snapc = snapc;
	} else {
		img_request->snap_id = rbd_dev->spec->snap_id;
	}
	if (child_request)
		img_request_child_set(img_request);
	if (rbd_dev->parent_spec)
		img_request_layered_set(img_request);
	spin_lock_init(&img_request->completion_lock);
	img_request->next_completion = 0;
	img_request->callback = NULL;
	img_request->result = 0;
	img_request->obj_request_count = 0;
	INIT_LIST_HEAD(&img_request->obj_requests);
	kref_init(&img_request->kref);

	(void) img_request_layered_test(img_request);	/* Avoid a warning */
	rbd_img_request_get(img_request);	/* Avoid a warning */
	rbd_img_request_put(img_request);	/* TEMPORARY */

	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
		write_request ? "write" : "read", offset, length,
		img_request);

	return img_request;
}
1585
/*
 * kref release function: detach and drop all object requests, then
 * release the snapshot context (writes only) and free the request.
 */
static void rbd_img_request_destroy(struct kref *kref)
{
	struct rbd_img_request *img_request;
	struct rbd_obj_request *obj_request;
	struct rbd_obj_request *next_obj_request;

	img_request = container_of(kref, struct rbd_img_request, kref);

	dout("%s: img %p\n", __func__, img_request);

	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_img_obj_request_del(img_request, obj_request);
	rbd_assert(img_request->obj_request_count == 0);

	/* snapc was only taken for write requests */
	if (img_request_write_test(img_request))
		ceph_put_snap_context(img_request->snapc);

	kfree(img_request);
}
1605
1217857f
AE
/*
 * Account one object request's result against its (top-level) image
 * request and push the completed bytes to the block layer.  Records
 * the first error seen on the image request.  Returns the value of
 * blk_end_request(): true while the block request has bytes pending.
 */
static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request = obj_request->img_request;
	unsigned int xferred;
	int result;

	/* Only top-level (non-child) requests carry a block request */
	rbd_assert(!img_request_child_test(img_request));
	rbd_assert(img_request->rq != NULL);

	rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
	xferred = (unsigned int)obj_request->xferred;
	result = obj_request->result;
	if (result) {
		struct rbd_device *rbd_dev = img_request->rbd_dev;

		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
			img_request_write_test(img_request) ? "write" : "read",
			obj_request->length, obj_request->img_offset,
			obj_request->offset);
		rbd_warn(rbd_dev, "  result %d xferred %x\n",
			result, xferred);
		/* Keep only the first error for the image request */
		if (!img_request->result)
			img_request->result = result;
	}

	return blk_end_request(img_request->rq, result, xferred);
}
1633
2169238d
AE
/*
 * Per-object completion callback for image requests.  Object
 * requests must be reported to the block layer in order, so this
 * only advances next_completion past the contiguous run of done
 * requests starting at the current position; out-of-order
 * completions simply return and are picked up later.
 */
static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	u32 which = obj_request->which;
	bool more = true;

	img_request = obj_request->img_request;

	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
	rbd_assert(img_request != NULL);
	rbd_assert(img_request->obj_request_count > 0);
	rbd_assert(which != BAD_WHICH);
	rbd_assert(which < img_request->obj_request_count);
	rbd_assert(which >= img_request->next_completion);

	spin_lock_irq(&img_request->completion_lock);
	/* Not our turn yet; the in-order walk below will reach us */
	if (which != img_request->next_completion)
		goto out;

	for_each_obj_request_from(img_request, obj_request) {
		rbd_assert(more);
		rbd_assert(which < img_request->obj_request_count);

		if (!obj_request_done_test(obj_request))
			break;
		more = rbd_img_obj_end_request(obj_request);
		which++;
	}

	/* "more" is false exactly when the last object request ended */
	rbd_assert(more ^ (which == img_request->obj_request_count));
	img_request->next_completion = which;
out:
	spin_unlock_irq(&img_request->completion_lock);

	if (!more)
		rbd_img_request_complete(img_request);
}
1671
bf0d5f50
AE
/*
 * Split an image request's bio chain into per-object requests, one
 * for each rados object the byte range touches.  Each object request
 * gets a cloned slice of the bio chain and a formatted osd request.
 * Returns 0 on success, -ENOMEM on failure (partially-built object
 * requests are unwound).
 */
static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
					struct bio *bio_list)
{
	struct rbd_device *rbd_dev = img_request->rbd_dev;
	struct rbd_obj_request *obj_request = NULL;
	struct rbd_obj_request *next_obj_request;
	bool write_request = img_request_write_test(img_request);
	unsigned int bio_offset;
	u64 img_offset;
	u64 resid;
	u16 opcode;

	dout("%s: img %p bio %p\n", __func__, img_request, bio_list);

	opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
	bio_offset = 0;
	img_offset = img_request->offset;
	rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
	resid = img_request->length;
	rbd_assert(resid > 0);
	while (resid) {
		struct ceph_osd_request *osd_req;
		const char *object_name;
		unsigned int clone_size;
		u64 offset;
		u64 length;

		object_name = rbd_segment_name(rbd_dev, img_offset);
		if (!object_name)
			goto out_unwind;
		/* Offset/length of this piece within its rados object */
		offset = rbd_segment_offset(rbd_dev, img_offset);
		length = rbd_segment_length(rbd_dev, img_offset, resid);
		obj_request = rbd_obj_request_create(object_name,
						offset, length,
						OBJ_REQUEST_BIO);
		kfree(object_name);	/* object request has its own copy */
		if (!obj_request)
			goto out_unwind;

		rbd_assert(length <= (u64) UINT_MAX);
		clone_size = (unsigned int) length;
		/* Advances bio_list/bio_offset past the cloned slice */
		obj_request->bio_list = bio_chain_clone_range(&bio_list,
						&bio_offset, clone_size,
						GFP_ATOMIC);
		if (!obj_request->bio_list)
			goto out_partial;

		osd_req = rbd_osd_req_create(rbd_dev, write_request,
						obj_request);
		if (!osd_req)
			goto out_partial;
		obj_request->osd_req = osd_req;
		obj_request->callback = rbd_img_obj_callback;

		osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
						0, 0);
		osd_req_op_extent_osd_data_bio(osd_req, 0, write_request,
				obj_request->bio_list, obj_request->length);
		rbd_osd_req_format(obj_request, write_request);

		obj_request->img_offset = img_offset;
		rbd_img_obj_request_add(img_request, obj_request);

		img_offset += length;
		resid -= length;
	}

	return 0;

out_partial:
	/* Drop the request being built that was never added to the list */
	rbd_obj_request_put(obj_request);
out_unwind:
	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_obj_request_put(obj_request);

	return -ENOMEM;
}
1749
bf0d5f50
AE
/*
 * Submit every object request belonging to an image request to the
 * osd client.  Stops and returns the first submission error.
 */
static int rbd_img_request_submit(struct rbd_img_request *img_request)
{
	struct rbd_device *rbd_dev = img_request->rbd_dev;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;
	struct rbd_obj_request *next_obj_request;

	dout("%s: img %p\n", __func__, img_request);
	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
		int ret;

		ret = rbd_obj_request_submit(osdc, obj_request);
		if (ret)
			return ret;
		/*
		 * The image request has its own reference to each
		 * of its object requests, so we can safely drop the
		 * initial one here.
		 */
		rbd_obj_request_put(obj_request);
	}

	return 0;
}
1774
/*
 * Send an async NOTIFY_ACK for a watch notification on the header
 * object.  The object request frees itself on completion (its
 * callback is rbd_obj_request_put); on submission failure the
 * reference is dropped here instead.
 */
static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
				u64 ver, u64 notify_id)
{
	struct rbd_obj_request *obj_request;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
							OBJ_REQUEST_NODATA);
	if (!obj_request)
		return -ENOMEM;

	ret = -ENOMEM;
	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
	if (!obj_request->osd_req)
		goto out;
	obj_request->callback = rbd_obj_request_put;

	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
					notify_id, ver, 0);
	rbd_osd_req_format(obj_request, false);

	ret = rbd_obj_request_submit(osdc, obj_request);
out:
	if (ret)
		rbd_obj_request_put(obj_request);

	return ret;
}
1804
1805static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1806{
1807 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1808 u64 hver;
1809 int rc;
1810
1811 if (!rbd_dev)
1812 return;
1813
37206ee5 1814 dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
b8d70035
AE
1815 rbd_dev->header_name, (unsigned long long) notify_id,
1816 (unsigned int) opcode);
1817 rc = rbd_dev_refresh(rbd_dev, &hver);
1818 if (rc)
1819 rbd_warn(rbd_dev, "got notification but failed to "
1820 " update snaps: %d\n", rc);
1821
cf81b60e 1822 rbd_obj_notify_ack(rbd_dev, hver, notify_id);
b8d70035
AE
1823}
1824
9969ebc5
AE
/*
 * Request sync osd watch/unwatch.  The value of "start" determines
 * whether a watch request is being initiated or torn down.
 */
static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;
	int ret;

	/* Starting: no event/request yet; stopping: both must exist */
	rbd_assert(start ^ !!rbd_dev->watch_event);
	rbd_assert(start ^ !!rbd_dev->watch_request);

	if (start) {
		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
						&rbd_dev->watch_event);
		if (ret < 0)
			return ret;
		rbd_assert(rbd_dev->watch_event != NULL);
	}

	ret = -ENOMEM;
	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
							OBJ_REQUEST_NODATA);
	if (!obj_request)
		goto out_cancel;

	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
	if (!obj_request->osd_req)
		goto out_cancel;

	if (start)
		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
	else
		ceph_osdc_unregister_linger_request(osdc,
					rbd_dev->watch_request->osd_req);

	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
				rbd_dev->watch_event->cookie,
				rbd_dev->header.obj_version, start);
	rbd_osd_req_format(obj_request, true);

	/* Submit and wait synchronously for the watch/unwatch to land */
	ret = rbd_obj_request_submit(osdc, obj_request);
	if (ret)
		goto out_cancel;
	ret = rbd_obj_request_wait(obj_request);
	if (ret)
		goto out_cancel;
	ret = obj_request->result;
	if (ret)
		goto out_cancel;

	/*
	 * A watch request is set to linger, so the underlying osd
	 * request won't go away until we unregister it.  We retain
	 * a pointer to the object request during that time (in
	 * rbd_dev->watch_request), so we'll keep a reference to
	 * it.  We'll drop that reference (below) after we've
	 * unregistered it.
	 */
	if (start) {
		rbd_dev->watch_request = obj_request;

		return 0;
	}

	/* We have successfully torn down the watch request */

	rbd_obj_request_put(rbd_dev->watch_request);
	rbd_dev->watch_request = NULL;
out_cancel:
	/* Cancel the event if we're tearing down, or on error */
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	if (obj_request)
		rbd_obj_request_put(obj_request);

	return ret;
}
1904
36be9a76
AE
/*
 * Synchronous osd object method call.
 *
 * Invokes class_name.method_name on object_name, passing @outbound
 * (may be empty) and copying up to @inbound_size bytes of reply into
 * @inbound.  On success optionally returns the object version via
 * @version.  Returns 0 or a negative errno.
 */
static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
			const char *object_name,
			const char *class_name,
			const char *method_name,
			const char *outbound,
			size_t outbound_size,
			char *inbound,
			size_t inbound_size,
			u64 *version)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;
	struct page **pages;
	u32 page_count;
	int ret;

	/*
	 * Method calls are ultimately read operations.  The result
	 * should placed into the inbound buffer provided.  They
	 * also supply outbound data--parameters for the object
	 * method.  Currently if this is present it will be a
	 * snapshot id.
	 */
	page_count = (u32) calc_pages_for(0, inbound_size);
	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	ret = -ENOMEM;
	obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
							OBJ_REQUEST_PAGES);
	if (!obj_request)
		goto out;

	/* The object request now owns the page vector */
	obj_request->pages = pages;
	obj_request->page_count = page_count;

	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
	if (!obj_request->osd_req)
		goto out;

	osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
					class_name, method_name);
	if (outbound_size) {
		struct ceph_pagelist *pagelist;

		pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
		if (!pagelist)
			goto out;

		ceph_pagelist_init(pagelist);
		ceph_pagelist_append(pagelist, outbound, outbound_size);
		osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
						pagelist);
	}
	osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
					obj_request->pages, inbound_size,
					0, false, false);
	rbd_osd_req_format(obj_request, false);

	ret = rbd_obj_request_submit(osdc, obj_request);
	if (ret)
		goto out;
	ret = rbd_obj_request_wait(obj_request);
	if (ret)
		goto out;

	ret = obj_request->result;
	if (ret < 0)
		goto out;
	ret = 0;
	ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
	if (version)
		*version = obj_request->version;
out:
	if (obj_request)
		rbd_obj_request_put(obj_request);
	else
		/* Pages were never handed to a request; free them here */
		ceph_release_page_vector(pages, page_count);

	return ret;
}
1990
/*
 * Block-layer request function.  Pulls requests off the queue and
 * turns each into an rbd image request.  Called with q->queue_lock
 * held; the lock is dropped while building/submitting each request
 * and reacquired before fetching the next one (hence the sparse
 * __releases/__acquires annotations).
 */
static void rbd_request_fn(struct request_queue *q)
		__releases(q->queue_lock) __acquires(q->queue_lock)
{
	struct rbd_device *rbd_dev = q->queuedata;
	bool read_only = rbd_dev->mapping.read_only;
	struct request *rq;
	int result;

	while ((rq = blk_fetch_request(q))) {
		bool write_request = rq_data_dir(rq) == WRITE;
		struct rbd_img_request *img_request;
		u64 offset;
		u64 length;

		/* Ignore any non-FS requests that filter through. */

		if (rq->cmd_type != REQ_TYPE_FS) {
			dout("%s: non-fs request type %d\n", __func__,
				(int) rq->cmd_type);
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* Ignore/skip any zero-length requests */

		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
		length = (u64) blk_rq_bytes(rq);

		if (!length) {
			dout("%s: zero-length request\n", __func__);
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* Lock dropped while we do the (possibly blocking) work */
		spin_unlock_irq(q->queue_lock);

		/* Disallow writes to a read-only device */

		if (write_request) {
			result = -EROFS;
			if (read_only)
				goto end_request;
			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
		}

		/*
		 * Quit early if the mapped snapshot no longer
		 * exists.  It's still possible the snapshot will
		 * have disappeared by the time our request arrives
		 * at the osd, but there's no sense in sending it if
		 * we already know.
		 */
		if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
			dout("request for non-existent snapshot");
			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
			result = -ENXIO;
			goto end_request;
		}

		/* Guard against offset + length wrapping past U64_MAX */
		result = -EINVAL;
		if (WARN_ON(offset && length > U64_MAX - offset + 1))
			goto end_request;	/* Shouldn't happen */

		result = -ENOMEM;
		img_request = rbd_img_request_create(rbd_dev, offset, length,
							write_request, false);
		if (!img_request)
			goto end_request;

		img_request->rq = rq;

		result = rbd_img_request_fill_bio(img_request, rq->bio);
		if (!result)
			result = rbd_img_request_submit(img_request);
		if (result)
			rbd_img_request_put(img_request);
end_request:
		/* Reacquire before ending the request / fetching the next */
		spin_lock_irq(q->queue_lock);
		if (result < 0) {
			rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
				write_request ? "write" : "read",
				length, offset, result);

			__blk_end_request_all(rq, result);
		}
	}
}
2078
602adf40
YS
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone_range()
 *
 * Returns the number of bytes of @bvec that may be added to the bio
 * described by @bmd (possibly 0, except for an empty bio, which must
 * always be allowed to take one page).
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	sector_t sector_offset;
	sector_t sectors_per_obj;
	sector_t obj_sector_offset;
	int ret;

	/*
	 * Find how far into its rbd object the partition-relative
	 * bio start sector is to offset relative to the enclosing
	 * device.
	 */
	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	obj_sector_offset = sector_offset & (sectors_per_obj - 1);

	/*
	 * Compute the number of bytes from that offset to the end
	 * of the object.  Account for what's already used by the bio.
	 */
	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
	if (ret > bmd->bi_size)
		ret -= bmd->bi_size;
	else
		ret = 0;

	/*
	 * Don't send back more than was asked for.  And if the bio
	 * was empty, let the whole thing through because:  "Note
	 * that a block device *must* allow a single page to be
	 * added to an empty bio."
	 */
	rbd_assert(bvec->bv_len <= PAGE_SIZE);
	if (ret > (int) bvec->bv_len || !bmd->bi_size)
		ret = (int) bvec->bv_len;

	return ret;
}
2124
2125static void rbd_free_disk(struct rbd_device *rbd_dev)
2126{
2127 struct gendisk *disk = rbd_dev->disk;
2128
2129 if (!disk)
2130 return;
2131
602adf40
YS
2132 if (disk->flags & GENHD_FL_UP)
2133 del_gendisk(disk);
2134 if (disk->queue)
2135 blk_cleanup_queue(disk->queue);
2136 put_disk(disk);
2137}
2138
788e2df3
AE
2139static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2140 const char *object_name,
2141 u64 offset, u64 length,
2142 char *buf, u64 *version)
2143
2144{
2169238d 2145 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
788e2df3 2146 struct rbd_obj_request *obj_request;
788e2df3
AE
2147 struct page **pages = NULL;
2148 u32 page_count;
1ceae7ef 2149 size_t size;
788e2df3
AE
2150 int ret;
2151
2152 page_count = (u32) calc_pages_for(offset, length);
2153 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2154 if (IS_ERR(pages))
2155 ret = PTR_ERR(pages);
2156
2157 ret = -ENOMEM;
2158 obj_request = rbd_obj_request_create(object_name, offset, length,
36be9a76 2159 OBJ_REQUEST_PAGES);
788e2df3
AE
2160 if (!obj_request)
2161 goto out;
2162
2163 obj_request->pages = pages;
2164 obj_request->page_count = page_count;
2165
430c28c3 2166 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
788e2df3
AE
2167 if (!obj_request->osd_req)
2168 goto out;
2169
c99d2d4a
AE
2170 osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
2171 offset, length, 0, 0);
a4ce40a9
AE
2172 osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0, false,
2173 obj_request->pages,
44cd188d
AE
2174 obj_request->length,
2175 obj_request->offset & ~PAGE_MASK,
2176 false, false);
2fa12320 2177 rbd_osd_req_format(obj_request, false);
430c28c3 2178
788e2df3
AE
2179 ret = rbd_obj_request_submit(osdc, obj_request);
2180 if (ret)
2181 goto out;
2182 ret = rbd_obj_request_wait(obj_request);
2183 if (ret)
2184 goto out;
2185
2186 ret = obj_request->result;
2187 if (ret < 0)
2188 goto out;
1ceae7ef
AE
2189
2190 rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
2191 size = (size_t) obj_request->xferred;
903bb32e 2192 ceph_copy_from_page_vector(pages, buf, 0, size);
23ed6e13
AE
2193 rbd_assert(size <= (size_t) INT_MAX);
2194 ret = (int) size;
788e2df3
AE
2195 if (version)
2196 *version = obj_request->version;
2197out:
2198 if (obj_request)
2199 rbd_obj_request_put(obj_request);
2200 else
2201 ceph_release_page_vector(pages, page_count);
2202
2203 return ret;
2204}
2205
/*
 * Read the complete header for the given rbd device.
 *
 * Returns a pointer to a dynamically-allocated buffer containing
 * the complete and validated header.  Caller can pass the address
 * of a variable that will be filled in with the version of the
 * header object at the time it was read.
 *
 * Returns a pointer-coded errno if a failure occurs.
 */
static struct rbd_image_header_ondisk *
rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;
	u64 names_size = 0;
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings.  Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		/* First pass frees NULL (no-op); later passes free stale buf */
		kfree(ondisk);

		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return ERR_PTR(-ENOMEM);

		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
				       0, size,
				       (char *) ondisk, version);
		if (ret < 0)
			goto out_err;
		/* A short read means the header can't be what we sized for */
		if (WARN_ON((size_t) ret < size)) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
				size, ret);
			goto out_err;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "invalid header");
			goto out_err;
		}

		/* Re-read if the snapshot count changed under us */
		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	return ondisk;

out_err:
	kfree(ondisk);

	return ERR_PTR(ret);
}
2273
2274/*
2275 * reload the ondisk the header
2276 */
2277static int rbd_read_header(struct rbd_device *rbd_dev,
2278 struct rbd_image_header *header)
2279{
2280 struct rbd_image_header_ondisk *ondisk;
2281 u64 ver = 0;
2282 int ret;
602adf40 2283
4156d998
AE
2284 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
2285 if (IS_ERR(ondisk))
2286 return PTR_ERR(ondisk);
2287 ret = rbd_header_from_disk(header, ondisk);
2288 if (ret >= 0)
2289 header->obj_version = ver;
2290 kfree(ondisk);
2291
2292 return ret;
602adf40
YS
2293}
2294
41f38c2b 2295static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
dfc5606d
YS
2296{
2297 struct rbd_snap *snap;
a0593290 2298 struct rbd_snap *next;
dfc5606d 2299
a0593290 2300 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
41f38c2b 2301 rbd_remove_snap_dev(snap);
dfc5606d
YS
2302}
2303
9478554a
AE
2304static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
2305{
2306 sector_t size;
2307
0d7dbfce 2308 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
9478554a
AE
2309 return;
2310
2311 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
2312 dout("setting size to %llu sectors", (unsigned long long) size);
2313 rbd_dev->mapping.size = (u64) size;
2314 set_capacity(rbd_dev->disk, size);
2315}
2316
602adf40
YS
2317/*
2318 * only read the first part of the ondisk header, without the snaps info
2319 */
117973fb 2320static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
602adf40
YS
2321{
2322 int ret;
2323 struct rbd_image_header h;
602adf40
YS
2324
2325 ret = rbd_read_header(rbd_dev, &h);
2326 if (ret < 0)
2327 return ret;
2328
a51aa0c0
JD
2329 down_write(&rbd_dev->header_rwsem);
2330
9478554a
AE
2331 /* Update image size, and check for resize of mapped image */
2332 rbd_dev->header.image_size = h.image_size;
2333 rbd_update_mapping_size(rbd_dev);
9db4b3e3 2334
849b4260 2335 /* rbd_dev->header.object_prefix shouldn't change */
602adf40 2336 kfree(rbd_dev->header.snap_sizes);
849b4260 2337 kfree(rbd_dev->header.snap_names);
d1d25646
JD
2338 /* osd requests may still refer to snapc */
2339 ceph_put_snap_context(rbd_dev->header.snapc);
602adf40 2340
b813623a
AE
2341 if (hver)
2342 *hver = h.obj_version;
a71b891b 2343 rbd_dev->header.obj_version = h.obj_version;
93a24e08 2344 rbd_dev->header.image_size = h.image_size;
602adf40
YS
2345 rbd_dev->header.snapc = h.snapc;
2346 rbd_dev->header.snap_names = h.snap_names;
2347 rbd_dev->header.snap_sizes = h.snap_sizes;
849b4260
AE
2348 /* Free the extra copy of the object prefix */
2349 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
2350 kfree(h.object_prefix);
2351
304f6808
AE
2352 ret = rbd_dev_snaps_update(rbd_dev);
2353 if (!ret)
2354 ret = rbd_dev_snaps_register(rbd_dev);
dfc5606d 2355
c666601a 2356 up_write(&rbd_dev->header_rwsem);
602adf40 2357
dfc5606d 2358 return ret;
602adf40
YS
2359}
2360
117973fb 2361static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
1fe5e993
AE
2362{
2363 int ret;
2364
117973fb 2365 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1fe5e993 2366 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
117973fb
AE
2367 if (rbd_dev->image_format == 1)
2368 ret = rbd_dev_v1_refresh(rbd_dev, hver);
2369 else
2370 ret = rbd_dev_v2_refresh(rbd_dev, hver);
1fe5e993
AE
2371 mutex_unlock(&ctl_mutex);
2372
2373 return ret;
2374}
2375
602adf40
YS
/*
 * Allocate and set up the gendisk and request queue for @rbd_dev.
 * The disk is not yet added (add_disk() happens later); on success
 * rbd_dev->disk is set and initial capacity is configured.
 * Returns 0 or -ENOMEM.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;

	/* create gendisk info */
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* keep bios from straddling object boundaries */
	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);

	return 0;
out_disk:
	put_disk(disk);

	return -ENOMEM;
}
2423
dfc5606d
YS
2424/*
2425 sysfs
2426*/
2427
593a9e7b
AE
/* Map a sysfs struct device back to its containing rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
2432
dfc5606d
YS
2433static ssize_t rbd_size_show(struct device *dev,
2434 struct device_attribute *attr, char *buf)
2435{
593a9e7b 2436 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
a51aa0c0
JD
2437 sector_t size;
2438
2439 down_read(&rbd_dev->header_rwsem);
2440 size = get_capacity(rbd_dev->disk);
2441 up_read(&rbd_dev->header_rwsem);
dfc5606d 2442
a51aa0c0 2443 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
dfc5606d
YS
2444}
2445
34b13184
AE
2446/*
2447 * Note this shows the features for whatever's mapped, which is not
2448 * necessarily the base image.
2449 */
2450static ssize_t rbd_features_show(struct device *dev,
2451 struct device_attribute *attr, char *buf)
2452{
2453 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2454
2455 return sprintf(buf, "0x%016llx\n",
2456 (unsigned long long) rbd_dev->mapping.features);
2457}
2458
dfc5606d
YS
2459static ssize_t rbd_major_show(struct device *dev,
2460 struct device_attribute *attr, char *buf)
2461{
593a9e7b 2462 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
602adf40 2463
dfc5606d
YS
2464 return sprintf(buf, "%d\n", rbd_dev->major);
2465}
2466
2467static ssize_t rbd_client_id_show(struct device *dev,
2468 struct device_attribute *attr, char *buf)
602adf40 2469{
593a9e7b 2470 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2471
1dbb4399
AE
2472 return sprintf(buf, "client%lld\n",
2473 ceph_client_id(rbd_dev->rbd_client->client));
602adf40
YS
2474}
2475
dfc5606d
YS
2476static ssize_t rbd_pool_show(struct device *dev,
2477 struct device_attribute *attr, char *buf)
602adf40 2478{
593a9e7b 2479 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2480
0d7dbfce 2481 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
dfc5606d
YS
2482}
2483
9bb2f334
AE
2484static ssize_t rbd_pool_id_show(struct device *dev,
2485 struct device_attribute *attr, char *buf)
2486{
2487 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2488
0d7dbfce
AE
2489 return sprintf(buf, "%llu\n",
2490 (unsigned long long) rbd_dev->spec->pool_id);
9bb2f334
AE
2491}
2492
dfc5606d
YS
2493static ssize_t rbd_name_show(struct device *dev,
2494 struct device_attribute *attr, char *buf)
2495{
593a9e7b 2496 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2497
a92ffdf8
AE
2498 if (rbd_dev->spec->image_name)
2499 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2500
2501 return sprintf(buf, "(unknown)\n");
dfc5606d
YS
2502}
2503
589d30e0
AE
2504static ssize_t rbd_image_id_show(struct device *dev,
2505 struct device_attribute *attr, char *buf)
2506{
2507 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2508
0d7dbfce 2509 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
589d30e0
AE
2510}
2511
34b13184
AE
2512/*
2513 * Shows the name of the currently-mapped snapshot (or
2514 * RBD_SNAP_HEAD_NAME for the base image).
2515 */
dfc5606d
YS
2516static ssize_t rbd_snap_show(struct device *dev,
2517 struct device_attribute *attr,
2518 char *buf)
2519{
593a9e7b 2520 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2521
0d7dbfce 2522 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
dfc5606d
YS
2523}
2524
86b00e0d
AE
/*
 * For an rbd v2 image, shows the pool id, image id, and snapshot id
 * for the parent image.  If there is no parent, simply shows
 * "(no parent image)".
 */
static ssize_t rbd_parent_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	struct rbd_spec *spec = rbd_dev->parent_spec;
	int count;
	char *bufp = buf;	/* advances past each appended field */

	if (!spec)
		return sprintf(buf, "(no parent image)\n");

	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
			(unsigned long long) spec->pool_id, spec->pool_name);
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
			spec->image_name ? spec->image_name : "(unknown)");
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
			(unsigned long long) spec->snap_id, spec->snap_name);
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
	if (count < 0)
		return count;
	bufp += count;

	/* total bytes written across all fields */
	return (ssize_t) (bufp - buf);
}
2567
dfc5606d
YS
2568static ssize_t rbd_image_refresh(struct device *dev,
2569 struct device_attribute *attr,
2570 const char *buf,
2571 size_t size)
2572{
593a9e7b 2573 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
b813623a 2574 int ret;
602adf40 2575
117973fb 2576 ret = rbd_dev_refresh(rbd_dev, NULL);
b813623a
AE
2577
2578 return ret < 0 ? ret : size;
dfc5606d 2579}
602adf40 2580
dfc5606d 2581static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
34b13184 2582static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
dfc5606d
YS
2583static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
2584static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
2585static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
9bb2f334 2586static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
dfc5606d 2587static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
589d30e0 2588static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
dfc5606d
YS
2589static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
2590static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
86b00e0d 2591static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
dfc5606d
YS
2592
2593static struct attribute *rbd_attrs[] = {
2594 &dev_attr_size.attr,
34b13184 2595 &dev_attr_features.attr,
dfc5606d
YS
2596 &dev_attr_major.attr,
2597 &dev_attr_client_id.attr,
2598 &dev_attr_pool.attr,
9bb2f334 2599 &dev_attr_pool_id.attr,
dfc5606d 2600 &dev_attr_name.attr,
589d30e0 2601 &dev_attr_image_id.attr,
dfc5606d 2602 &dev_attr_current_snap.attr,
86b00e0d 2603 &dev_attr_parent.attr,
dfc5606d 2604 &dev_attr_refresh.attr,
dfc5606d
YS
2605 NULL
2606};
2607
2608static struct attribute_group rbd_attr_group = {
2609 .attrs = rbd_attrs,
2610};
2611
2612static const struct attribute_group *rbd_attr_groups[] = {
2613 &rbd_attr_group,
2614 NULL
2615};
2616
2617static void rbd_sysfs_dev_release(struct device *dev)
2618{
2619}
2620
2621static struct device_type rbd_device_type = {
2622 .name = "rbd",
2623 .groups = rbd_attr_groups,
2624 .release = rbd_sysfs_dev_release,
2625};
2626
2627
2628/*
2629 sysfs - snapshots
2630*/
2631
2632static ssize_t rbd_snap_size_show(struct device *dev,
2633 struct device_attribute *attr,
2634 char *buf)
2635{
2636 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2637
3591538f 2638 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
dfc5606d
YS
2639}
2640
2641static ssize_t rbd_snap_id_show(struct device *dev,
2642 struct device_attribute *attr,
2643 char *buf)
2644{
2645 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2646
3591538f 2647 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
dfc5606d
YS
2648}
2649
34b13184
AE
2650static ssize_t rbd_snap_features_show(struct device *dev,
2651 struct device_attribute *attr,
2652 char *buf)
2653{
2654 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2655
2656 return sprintf(buf, "0x%016llx\n",
2657 (unsigned long long) snap->features);
2658}
2659
dfc5606d
YS
2660static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2661static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
34b13184 2662static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
dfc5606d
YS
2663
2664static struct attribute *rbd_snap_attrs[] = {
2665 &dev_attr_snap_size.attr,
2666 &dev_attr_snap_id.attr,
34b13184 2667 &dev_attr_snap_features.attr,
dfc5606d
YS
2668 NULL,
2669};
2670
2671static struct attribute_group rbd_snap_attr_group = {
2672 .attrs = rbd_snap_attrs,
2673};
2674
2675static void rbd_snap_dev_release(struct device *dev)
2676{
2677 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2678 kfree(snap->name);
2679 kfree(snap);
2680}
2681
2682static const struct attribute_group *rbd_snap_attr_groups[] = {
2683 &rbd_snap_attr_group,
2684 NULL
2685};
2686
2687static struct device_type rbd_snap_device_type = {
2688 .groups = rbd_snap_attr_groups,
2689 .release = rbd_snap_dev_release,
2690};
2691
8b8fb99c
AE
2692static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
2693{
2694 kref_get(&spec->kref);
2695
2696 return spec;
2697}
2698
2699static void rbd_spec_free(struct kref *kref);
2700static void rbd_spec_put(struct rbd_spec *spec)
2701{
2702 if (spec)
2703 kref_put(&spec->kref, rbd_spec_free);
2704}
2705
2706static struct rbd_spec *rbd_spec_alloc(void)
2707{
2708 struct rbd_spec *spec;
2709
2710 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
2711 if (!spec)
2712 return NULL;
2713 kref_init(&spec->kref);
2714
2715 rbd_spec_put(rbd_spec_get(spec)); /* TEMPORARY */
2716
2717 return spec;
2718}
2719
2720static void rbd_spec_free(struct kref *kref)
2721{
2722 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
2723
2724 kfree(spec->pool_name);
2725 kfree(spec->image_id);
2726 kfree(spec->image_name);
2727 kfree(spec->snap_name);
2728 kfree(spec);
2729}
2730
cc344fa1 2731static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
c53d5893
AE
2732 struct rbd_spec *spec)
2733{
2734 struct rbd_device *rbd_dev;
2735
2736 rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2737 if (!rbd_dev)
2738 return NULL;
2739
2740 spin_lock_init(&rbd_dev->lock);
6d292906 2741 rbd_dev->flags = 0;
c53d5893
AE
2742 INIT_LIST_HEAD(&rbd_dev->node);
2743 INIT_LIST_HEAD(&rbd_dev->snaps);
2744 init_rwsem(&rbd_dev->header_rwsem);
2745
2746 rbd_dev->spec = spec;
2747 rbd_dev->rbd_client = rbdc;
2748
0903e875
AE
2749 /* Initialize the layout used for all rbd requests */
2750
2751 rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2752 rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
2753 rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2754 rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
2755
c53d5893
AE
2756 return rbd_dev;
2757}
2758
2759static void rbd_dev_destroy(struct rbd_device *rbd_dev)
2760{
86b00e0d 2761 rbd_spec_put(rbd_dev->parent_spec);
c53d5893
AE
2762 kfree(rbd_dev->header_name);
2763 rbd_put_client(rbd_dev->rbd_client);
2764 rbd_spec_put(rbd_dev->spec);
2765 kfree(rbd_dev);
2766}
2767
304f6808
AE
2768static bool rbd_snap_registered(struct rbd_snap *snap)
2769{
2770 bool ret = snap->dev.type == &rbd_snap_device_type;
2771 bool reg = device_is_registered(&snap->dev);
2772
2773 rbd_assert(!ret ^ reg);
2774
2775 return ret;
2776}
2777
41f38c2b 2778static void rbd_remove_snap_dev(struct rbd_snap *snap)
dfc5606d
YS
2779{
2780 list_del(&snap->node);
304f6808
AE
2781 if (device_is_registered(&snap->dev))
2782 device_unregister(&snap->dev);
dfc5606d
YS
2783}
2784
14e7085d 2785static int rbd_register_snap_dev(struct rbd_snap *snap,
dfc5606d
YS
2786 struct device *parent)
2787{
2788 struct device *dev = &snap->dev;
2789 int ret;
2790
2791 dev->type = &rbd_snap_device_type;
2792 dev->parent = parent;
2793 dev->release = rbd_snap_dev_release;
d4b125e9 2794 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
304f6808
AE
2795 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2796
dfc5606d
YS
2797 ret = device_register(dev);
2798
2799 return ret;
2800}
2801
/*
 * Allocate and fill in a new rbd_snap with the given name, id, size
 * and feature mask.  The snap is not linked into any list or
 * registered here.  Returns the new snap or a pointer-coded errno.
 */
static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
					   const char *snap_name,
					   u64 snap_id, u64 snap_size,
					   u64 snap_features)
{
	struct rbd_snap *snap;
	int ret;

	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
	if (!snap)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	/* Keep a private copy of the name; freed in rbd_snap_dev_release() */
	snap->name = kstrdup(snap_name, GFP_KERNEL);
	if (!snap->name)
		goto err;

	snap->id = snap_id;
	snap->size = snap_size;
	snap->features = snap_features;

	return snap;

err:
	kfree(snap->name);
	kfree(snap);

	return ERR_PTR(ret);
}
2831
cd892126
AE
2832static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2833 u64 *snap_size, u64 *snap_features)
2834{
2835 char *snap_name;
2836
2837 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2838
2839 *snap_size = rbd_dev->header.snap_sizes[which];
2840 *snap_features = 0; /* No features for v1 */
2841
2842 /* Skip over names until we find the one we are looking for */
2843
2844 snap_name = rbd_dev->header.snap_names;
2845 while (which--)
2846 snap_name += strlen(snap_name) + 1;
2847
2848 return snap_name;
2849}
2850
9d475de5
AE
/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.
 *
 * Issues the "rbd"/"get_size" class method against the header
 * object.  Returns 0 or a negative errno.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	/* Wire format of the get_size reply: order byte + le64 size */
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				(char *) &snapid, sizeof (snapid),
				(char *) &size_buf, sizeof (size_buf), NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	*order = size_buf.order;
	*snap_size = le64_to_cpu(size_buf.size);

	dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
		(unsigned long long) snap_id, (unsigned int) *order,
		(unsigned long long) *snap_size);

	return 0;
}
2883
2884static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
2885{
2886 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
2887 &rbd_dev->header.obj_order,
2888 &rbd_dev->header.image_size);
2889}
2890
1e130199
AE
/*
 * Fetch the format 2 image's object name prefix via the
 * "rbd"/"get_object_prefix" class method and store it (allocated)
 * in rbd_dev->header.object_prefix.  Returns 0 or a negative errno.
 */
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_object_prefix",
				NULL, 0,
				reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	/* Reply is a length-prefixed string; decode a private copy */
	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + RBD_OBJ_PREFIX_LEN_MAX,
						NULL, GFP_NOIO);

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
	}

out:
	kfree(reply_buf);

	return ret;
}
2926
b1b5402a
AE
/*
 * Fetch the feature bits for snapshot @snap_id (or the base image if
 * CEPH_NOSNAP) via the "rbd"/"get_features" class method.  Fails with
 * -ENXIO if the image uses incompatible features we don't support.
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	/* Wire format of the get_features reply */
	struct {
		__le64 features;
		__le64 incompat;
	} features_buf = { 0 };
	u64 incompat;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_features",
				(char *) &snapid, sizeof (snapid),
				(char *) &features_buf, sizeof (features_buf),
				NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	/* Refuse to map images with incompatible features we don't know */
	incompat = le64_to_cpu(features_buf.incompat);
	if (incompat & ~RBD_FEATURES_SUPPORTED)
		return -ENXIO;

	*snap_features = le64_to_cpu(features_buf.features);

	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long) snap_id,
		(unsigned long long) *snap_features,
		(unsigned long long) le64_to_cpu(features_buf.incompat));

	return 0;
}
2960
2961static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2962{
2963 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2964 &rbd_dev->header.features);
2965}
2966
86b00e0d
AE
/*
 * Query the "rbd"/"get_parent" class method for this image's parent
 * (layering).  On success, if a parent exists, rbd_dev->parent_spec
 * and rbd_dev->parent_overlap are filled in; if the reply reports no
 * parent (pool id CEPH_NOPOOL) that is not an error.  Returns 0 or a
 * negative errno.
 */
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
{
	struct rbd_spec *parent_spec;
	size_t size;
	void *reply_buf = NULL;
	__le64 snapid;
	void *p;
	void *end;
	char *image_id;
	u64 overlap;
	int ret;

	parent_spec = rbd_spec_alloc();
	if (!parent_spec)
		return -ENOMEM;

	/* Maximum possible reply: pool id, image id, snap id, overlap */
	size = sizeof (__le64) +			/* pool_id */
		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
		sizeof (__le64) +			/* snap_id */
		sizeof (__le64);			/* overlap */
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf) {
		ret = -ENOMEM;
		goto out_err;
	}

	snapid = cpu_to_le64(CEPH_NOSNAP);
	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_parent",
				(char *) &snapid, sizeof (snapid),
				(char *) reply_buf, size, NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out_err;

	ret = -ERANGE;
	p = reply_buf;
	end = (char *) reply_buf + size;
	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
	if (parent_spec->pool_id == CEPH_NOPOOL)
		goto out;	/* No parent?  No problem. */

	/* The ceph file layout needs to fit pool id in 32 bits */

	ret = -EIO;
	if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
		goto out;

	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(image_id)) {
		ret = PTR_ERR(image_id);
		goto out_err;
	}
	parent_spec->image_id = image_id;
	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
	ceph_decode_64_safe(&p, end, overlap, out_err);

	rbd_dev->parent_overlap = overlap;
	rbd_dev->parent_spec = parent_spec;
	parent_spec = NULL;	/* rbd_dev now owns this */
out:
	ret = 0;
out_err:
	kfree(reply_buf);
	rbd_spec_put(parent_spec);

	return ret;
}
3035
9e15b77d
AE
3036static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
3037{
3038 size_t image_id_size;
3039 char *image_id;
3040 void *p;
3041 void *end;
3042 size_t size;
3043 void *reply_buf = NULL;
3044 size_t len = 0;
3045 char *image_name = NULL;
3046 int ret;
3047
3048 rbd_assert(!rbd_dev->spec->image_name);
3049
69e7a02f
AE
3050 len = strlen(rbd_dev->spec->image_id);
3051 image_id_size = sizeof (__le32) + len;
9e15b77d
AE
3052 image_id = kmalloc(image_id_size, GFP_KERNEL);
3053 if (!image_id)
3054 return NULL;
3055
3056 p = image_id;
3057 end = (char *) image_id + image_id_size;
69e7a02f 3058 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);
9e15b77d
AE
3059
3060 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
3061 reply_buf = kmalloc(size, GFP_KERNEL);
3062 if (!reply_buf)
3063 goto out;
3064
36be9a76 3065 ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
9e15b77d
AE
3066 "rbd", "dir_get_name",
3067 image_id, image_id_size,
07b2391f 3068 (char *) reply_buf, size, NULL);
9e15b77d
AE
3069 if (ret < 0)
3070 goto out;
3071 p = reply_buf;
3072 end = (char *) reply_buf + size;
3073 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
3074 if (IS_ERR(image_name))
3075 image_name = NULL;
3076 else
3077 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
3078out:
3079 kfree(reply_buf);
3080 kfree(image_id);
3081
3082 return image_name;
3083}
3084
3085/*
3086 * When a parent image gets probed, we only have the pool, image,
3087 * and snapshot ids but not the names of any of them. This call
3088 * is made later to fill in those names. It has to be done after
3089 * rbd_dev_snaps_update() has completed because some of the
3090 * information (in particular, snapshot name) is not available
3091 * until then.
3092 */
3093static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
3094{
3095 struct ceph_osd_client *osdc;
3096 const char *name;
3097 void *reply_buf = NULL;
3098 int ret;
3099
3100 if (rbd_dev->spec->pool_name)
3101 return 0; /* Already have the names */
3102
3103 /* Look up the pool name */
3104
3105 osdc = &rbd_dev->rbd_client->client->osdc;
3106 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
935dc89f
AE
3107 if (!name) {
3108 rbd_warn(rbd_dev, "there is no pool with id %llu",
3109 rbd_dev->spec->pool_id); /* Really a BUG() */
3110 return -EIO;
3111 }
9e15b77d
AE
3112
3113 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
3114 if (!rbd_dev->spec->pool_name)
3115 return -ENOMEM;
3116
3117 /* Fetch the image name; tolerate failure here */
3118
3119 name = rbd_dev_image_name(rbd_dev);
69e7a02f 3120 if (name)
9e15b77d 3121 rbd_dev->spec->image_name = (char *) name;
69e7a02f 3122 else
06ecc6cb 3123 rbd_warn(rbd_dev, "unable to get image name");
9e15b77d
AE
3124
3125 /* Look up the snapshot name. */
3126
3127 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
3128 if (!name) {
935dc89f
AE
3129 rbd_warn(rbd_dev, "no snapshot with id %llu",
3130 rbd_dev->spec->snap_id); /* Really a BUG() */
9e15b77d
AE
3131 ret = -EIO;
3132 goto out_err;
3133 }
3134 rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
3135 if(!rbd_dev->spec->snap_name)
3136 goto out_err;
3137
3138 return 0;
3139out_err:
3140 kfree(reply_buf);
3141 kfree(rbd_dev->spec->pool_name);
3142 rbd_dev->spec->pool_name = NULL;
3143
3144 return ret;
3145}
3146
6e14b1a6 3147static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
35d489f9
AE
3148{
3149 size_t size;
3150 int ret;
3151 void *reply_buf;
3152 void *p;
3153 void *end;
3154 u64 seq;
3155 u32 snap_count;
3156 struct ceph_snap_context *snapc;
3157 u32 i;
3158
3159 /*
3160 * We'll need room for the seq value (maximum snapshot id),
3161 * snapshot count, and array of that many snapshot ids.
3162 * For now we have a fixed upper limit on the number we're
3163 * prepared to receive.
3164 */
3165 size = sizeof (__le64) + sizeof (__le32) +
3166 RBD_MAX_SNAP_COUNT * sizeof (__le64);
3167 reply_buf = kzalloc(size, GFP_KERNEL);
3168 if (!reply_buf)
3169 return -ENOMEM;
3170
36be9a76 3171 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
35d489f9
AE
3172 "rbd", "get_snapcontext",
3173 NULL, 0,
07b2391f 3174 reply_buf, size, ver);
36be9a76 3175 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
35d489f9
AE
3176 if (ret < 0)
3177 goto out;
3178
3179 ret = -ERANGE;
3180 p = reply_buf;
3181 end = (char *) reply_buf + size;
3182 ceph_decode_64_safe(&p, end, seq, out);
3183 ceph_decode_32_safe(&p, end, snap_count, out);
3184
3185 /*
3186 * Make sure the reported number of snapshot ids wouldn't go
3187 * beyond the end of our buffer. But before checking that,
3188 * make sure the computed size of the snapshot context we
3189 * allocate is representable in a size_t.
3190 */
3191 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
3192 / sizeof (u64)) {
3193 ret = -EINVAL;
3194 goto out;
3195 }
3196 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
3197 goto out;
3198
3199 size = sizeof (struct ceph_snap_context) +
3200 snap_count * sizeof (snapc->snaps[0]);
3201 snapc = kmalloc(size, GFP_KERNEL);
3202 if (!snapc) {
3203 ret = -ENOMEM;
3204 goto out;
3205 }
3206
3207 atomic_set(&snapc->nref, 1);
3208 snapc->seq = seq;
3209 snapc->num_snaps = snap_count;
3210 for (i = 0; i < snap_count; i++)
3211 snapc->snaps[i] = ceph_decode_64(&p);
3212
3213 rbd_dev->header.snapc = snapc;
3214
3215 dout(" snap context seq = %llu, snap_count = %u\n",
3216 (unsigned long long) seq, (unsigned int) snap_count);
3217
3218out:
3219 kfree(reply_buf);
3220
3221 return 0;
3222}
3223
b8b1e2db
AE
3224static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
3225{
3226 size_t size;
3227 void *reply_buf;
3228 __le64 snap_id;
3229 int ret;
3230 void *p;
3231 void *end;
b8b1e2db
AE
3232 char *snap_name;
3233
3234 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
3235 reply_buf = kmalloc(size, GFP_KERNEL);
3236 if (!reply_buf)
3237 return ERR_PTR(-ENOMEM);
3238
3239 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
36be9a76 3240 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
b8b1e2db
AE
3241 "rbd", "get_snapshot_name",
3242 (char *) &snap_id, sizeof (snap_id),
07b2391f 3243 reply_buf, size, NULL);
36be9a76 3244 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
b8b1e2db
AE
3245 if (ret < 0)
3246 goto out;
3247
3248 p = reply_buf;
3249 end = (char *) reply_buf + size;
e5c35534 3250 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
b8b1e2db
AE
3251 if (IS_ERR(snap_name)) {
3252 ret = PTR_ERR(snap_name);
3253 goto out;
3254 } else {
3255 dout(" snap_id 0x%016llx snap_name = %s\n",
3256 (unsigned long long) le64_to_cpu(snap_id), snap_name);
3257 }
3258 kfree(reply_buf);
3259
3260 return snap_name;
3261out:
3262 kfree(reply_buf);
3263
3264 return ERR_PTR(ret);
3265}
3266
3267static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
3268 u64 *snap_size, u64 *snap_features)
3269{
e0b49868 3270 u64 snap_id;
b8b1e2db
AE
3271 u8 order;
3272 int ret;
3273
3274 snap_id = rbd_dev->header.snapc->snaps[which];
3275 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
3276 if (ret)
3277 return ERR_PTR(ret);
3278 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
3279 if (ret)
3280 return ERR_PTR(ret);
3281
3282 return rbd_dev_v2_snap_name(rbd_dev, which);
3283}
3284
3285static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
3286 u64 *snap_size, u64 *snap_features)
3287{
3288 if (rbd_dev->image_format == 1)
3289 return rbd_dev_v1_snap_info(rbd_dev, which,
3290 snap_size, snap_features);
3291 if (rbd_dev->image_format == 2)
3292 return rbd_dev_v2_snap_info(rbd_dev, which,
3293 snap_size, snap_features);
3294 return ERR_PTR(-EINVAL);
3295}
3296
117973fb
AE
3297static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
3298{
3299 int ret;
3300 __u8 obj_order;
3301
3302 down_write(&rbd_dev->header_rwsem);
3303
3304 /* Grab old order first, to see if it changes */
3305
3306 obj_order = rbd_dev->header.obj_order,
3307 ret = rbd_dev_v2_image_size(rbd_dev);
3308 if (ret)
3309 goto out;
3310 if (rbd_dev->header.obj_order != obj_order) {
3311 ret = -EIO;
3312 goto out;
3313 }
3314 rbd_update_mapping_size(rbd_dev);
3315
3316 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
3317 dout("rbd_dev_v2_snap_context returned %d\n", ret);
3318 if (ret)
3319 goto out;
3320 ret = rbd_dev_snaps_update(rbd_dev);
3321 dout("rbd_dev_snaps_update returned %d\n", ret);
3322 if (ret)
3323 goto out;
3324 ret = rbd_dev_snaps_register(rbd_dev);
3325 dout("rbd_dev_snaps_register returned %d\n", ret);
3326out:
3327 up_write(&rbd_dev->header_rwsem);
3328
3329 return ret;
3330}
3331
dfc5606d 3332/*
 * Scan the rbd device's current snapshot list and compare it to the
 * newly-received snapshot context.  Remove any existing snapshots
 * not present in the new snapshot context.  Add a new snapshot for
 * any snapshots in the snapshot context not in the current list.
 * And verify there are no changes to snapshots we already know
 * about.
 *
 * Assumes the snapshots in the snapshot context are sorted by
 * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
 * are also maintained in that order.)
dfc5606d 3343 */
static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	const u32 snap_count = snapc->num_snaps;
	struct list_head *head = &rbd_dev->snaps;
	struct list_head *links = head->next;
	u32 index = 0;	/* cursor into the new snapshot context */

	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
	/*
	 * Merge-walk two sorted sequences: the new context's snapshot
	 * ids (indexed by "index") and the existing rbd_snap list
	 * (walked by "links").  CEPH_NOSNAP acts as a sentinel when
	 * the context is exhausted.
	 */
	while (index < snap_count || links != head) {
		u64 snap_id;
		struct rbd_snap *snap;
		char *snap_name;
		u64 snap_size = 0;
		u64 snap_features = 0;

		snap_id = index < snap_count ? snapc->snaps[index]
					     : CEPH_NOSNAP;
		snap = links != head ? list_entry(links, struct rbd_snap, node)
				     : NULL;
		rbd_assert(!snap || snap->id != CEPH_NOSNAP);

		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
			struct list_head *next = links->next;

			/*
			 * A previously-existing snapshot is not in
			 * the new snap context.
			 *
			 * If the now missing snapshot is the one the
			 * image is mapped to, clear its exists flag
			 * so we can avoid sending any more requests
			 * to it.
			 */
			if (rbd_dev->spec->snap_id == snap->id)
				clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
			rbd_remove_snap_dev(snap);
			/*
			 * NOTE(review): snap->id is read below after
			 * rbd_remove_snap_dev(snap) — confirm that call
			 * does not free "snap" synchronously.
			 */
			dout("%ssnap id %llu has been removed\n",
				rbd_dev->spec->snap_id == snap->id ?
							"mapped " : "",
				(unsigned long long) snap->id);

			/* Done with this list entry; advance */

			links = next;
			continue;
		}

		/* Fetch name/size/features for the context entry */

		snap_name = rbd_dev_snap_info(rbd_dev, index,
					&snap_size, &snap_features);
		if (IS_ERR(snap_name))
			return PTR_ERR(snap_name);

		dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
			(unsigned long long) snap_id);
		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
			struct rbd_snap *new_snap;

			/* We haven't seen this snapshot before */

			new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
					snap_id, snap_size, snap_features);
			if (IS_ERR(new_snap)) {
				int err = PTR_ERR(new_snap);

				dout(" failed to add dev, error %d\n", err);

				return err;
			}

			/* New goes before existing, or at end of list */

			dout(" added dev%s\n", snap ? "" : " at end\n");
			if (snap)
				list_add_tail(&new_snap->node, &snap->node);
			else
				list_add_tail(&new_snap->node, head);
		} else {
			/* Already have this one */

			dout(" already present\n");

			/*
			 * NOTE(review): snap_name appears not to be
			 * freed on this branch — presumably a leak
			 * unless rbd_dev_snap_info() retains ownership;
			 * verify.
			 */
			rbd_assert(snap->size == snap_size);
			rbd_assert(!strcmp(snap->name, snap_name));
			rbd_assert(snap->features == snap_features);

			/* Done with this list entry; advance */

			links = links->next;
		}

		/* Advance to the next entry in the snapshot context */

		index++;
	}
	dout("%s: done\n", __func__);

	return 0;
}
3443
304f6808
AE
3444/*
3445 * Scan the list of snapshots and register the devices for any that
3446 * have not already been registered.
3447 */
3448static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
3449{
3450 struct rbd_snap *snap;
3451 int ret = 0;
3452
37206ee5 3453 dout("%s:\n", __func__);
86ff77bb
AE
3454 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
3455 return -EIO;
304f6808
AE
3456
3457 list_for_each_entry(snap, &rbd_dev->snaps, node) {
3458 if (!rbd_snap_registered(snap)) {
3459 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
3460 if (ret < 0)
3461 break;
3462 }
3463 }
3464 dout("%s: returning %d\n", __func__, ret);
3465
3466 return ret;
3467}
3468
dfc5606d
YS
3469static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
3470{
dfc5606d 3471 struct device *dev;
cd789ab9 3472 int ret;
dfc5606d
YS
3473
3474 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
dfc5606d 3475
cd789ab9 3476 dev = &rbd_dev->dev;
dfc5606d
YS
3477 dev->bus = &rbd_bus_type;
3478 dev->type = &rbd_device_type;
3479 dev->parent = &rbd_root_dev;
3480 dev->release = rbd_dev_release;
de71a297 3481 dev_set_name(dev, "%d", rbd_dev->dev_id);
dfc5606d 3482 ret = device_register(dev);
dfc5606d 3483
dfc5606d 3484 mutex_unlock(&ctl_mutex);
cd789ab9 3485
dfc5606d 3486 return ret;
602adf40
YS
3487}
3488
/* Tear down the sysfs presence created by rbd_bus_add_dev(). */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
3493
e2839308 3494static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
1ddbe94e
AE
3495
3496/*
499afd5b
AE
3497 * Get a unique rbd identifier for the given new rbd_dev, and add
3498 * the rbd_dev to the global list. The minimum rbd id is 1.
1ddbe94e 3499 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	/* atomic64_inc_return() makes ids start at 1 and never repeat */
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
b7f23c36 3510
1ddbe94e 3511/*
499afd5b
AE
3512 * Remove an rbd_dev from the global list, and record that its
3513 * identifier is no longer in use.
1ddbe94e 3514 */
static void rbd_dev_id_put(struct rbd_device *rbd_dev)
{
	struct list_head *tmp;
	int rbd_id = rbd_dev->dev_id;
	int max_id;

	rbd_assert(rbd_id > 0);

	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);

	/*
	 * If the id being "put" is not the current maximum, there
	 * is nothing special we need to do.
	 */
	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
		spin_unlock(&rbd_dev_list_lock);
		return;
	}

	/*
	 * We need to update the current maximum id.  Search the
	 * list to find out what it is.  We're more likely to find
	 * the maximum at the end, so search the list backward.
	 */
	max_id = 0;
	list_for_each_prev(tmp, &rbd_dev_list) {
		/* NOTE: intentionally shadows the parameter name */
		struct rbd_device *rbd_dev;

		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id > max_id)
			max_id = rbd_dev->dev_id;
	}
	spin_unlock(&rbd_dev_list_lock);

	/*
	 * The max id could have been updated by rbd_dev_id_get(), in
	 * which case it now accurately reflects the new maximum.
	 * Be careful not to overwrite the maximum value in that
	 * case.
	 */
	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
	dout(" max dev id has been reset\n");
}
3561
e28fff26
AE
3562/*
3563 * Skips over white space at *buf, and updates *buf to point to the
3564 * first found non-space character (if any). Returns the length of
593a9e7b
AE
3565 * the token (string of non-white space characters) found. Note
3566 * that *buf must be terminated with '\0'.
e28fff26
AE
3567 */
/*
 * Advance *buf past any leading whitespace and return the length of
 * the token (run of non-whitespace characters) now at *buf; *buf must
 * be NUL-terminated.  Returns 0 if only whitespace remains.
 */
static inline size_t next_token(const char **buf)
{
	/* Whitespace set matching isspace() in the "C"/"POSIX" locales */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start;

	start = *buf + strspn(*buf, whitespace);	/* skip leading space */
	*buf = start;

	return strcspn(start, whitespace);	/* length of the token */
}
3580
3581/*
3582 * Finds the next token in *buf, and if the provided token buffer is
3583 * big enough, copies the found token into it. The result, if
593a9e7b
AE
3584 * copied, is guaranteed to be terminated with '\0'. Note that *buf
3585 * must be terminated with '\0' on entry.
e28fff26
AE
3586 *
3587 * Returns the length of the token found (not including the '\0').
3588 * Return value will be 0 if no token is found, and it will be >=
3589 * token_size if the token would not fit.
3590 *
593a9e7b 3591 * The *buf pointer will be updated to point beyond the end of the
e28fff26
AE
3592 * found token. Note that this occurs even if the token buffer is
3593 * too small to hold it.
3594 */
/*
 * Find the next token in *buf and, if it fits (including the NUL),
 * copy it into "token".  Returns the token length either way; a
 * return >= token_size means the copy was skipped.  *buf always
 * advances past the token.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
3610
ea3352f4
AE
3611/*
3612 * Finds the next token in *buf, dynamically allocates a buffer big
3613 * enough to hold a copy of it, and copies the token into the new
3614 * buffer. The copy is guaranteed to be terminated with '\0'. Note
3615 * that a duplicate buffer is created even for a zero-length token.
3616 *
3617 * Returns a pointer to the newly-allocated duplicate, or a null
3618 * pointer if memory for the duplicate was not available. If
3619 * the lenp argument is a non-null pointer, the length of the token
3620 * (not including the '\0') is returned in *lenp.
3621 *
3622 * If successful, the *buf pointer will be updated to point beyond
3623 * the end of the found token.
3624 *
3625 * Note: uses GFP_KERNEL for allocation.
3626 */
3627static inline char *dup_token(const char **buf, size_t *lenp)
3628{
3629 char *dup;
3630 size_t len;
3631
3632 len = next_token(buf);
4caf35f9 3633 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
ea3352f4
AE
3634 if (!dup)
3635 return NULL;
ea3352f4
AE
3636 *(dup + len) = '\0';
3637 *buf += len;
3638
3639 if (lenp)
3640 *lenp = len;
3641
3642 return dup;
3643}
3644
a725f65e 3645/*
859c31df
AE
3646 * Parse the options provided for an "rbd add" (i.e., rbd image
3647 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
3648 * and the data written is passed here via a NUL-terminated buffer.
3649 * Returns 0 if successful or an error code otherwise.
d22f76e7 3650 *
859c31df
AE
3651 * The information extracted from these options is recorded in
3652 * the other parameters which return dynamically-allocated
3653 * structures:
3654 * ceph_opts
3655 * The address of a pointer that will refer to a ceph options
3656 * structure. Caller must release the returned pointer using
3657 * ceph_destroy_options() when it is no longer needed.
3658 * rbd_opts
3659 * Address of an rbd options pointer. Fully initialized by
3660 * this function; caller must release with kfree().
3661 * spec
3662 * Address of an rbd image specification pointer. Fully
3663 * initialized by this function based on parsed options.
3664 * Caller must release with rbd_spec_put().
3665 *
3666 * The options passed take this form:
3667 * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
3668 * where:
3669 * <mon_addrs>
3670 * A comma-separated list of one or more monitor addresses.
3671 * A monitor address is an ip address, optionally followed
3672 * by a port number (separated by a colon).
3673 * I.e.: ip1[:port1][,ip2[:port2]...]
3674 * <options>
3675 * A comma-separated list of ceph and/or rbd options.
3676 * <pool_name>
3677 * The name of the rados pool containing the rbd image.
3678 * <image_name>
3679 * The name of the image in that pool to map.
3680 * <snap_id>
3681 * An optional snapshot id. If provided, the mapping will
3682 * present data from the image at the time that snapshot was
3683 * created. The image head is used if no snapshot id is
3684 * provided. Snapshot mappings are always read-only.
a725f65e 3685 */
static int rbd_add_parse_args(const char *buf,
				struct ceph_options **ceph_opts,
				struct rbd_options **opts,
				struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;
	size_t mon_addrs_size;
	struct rbd_spec *spec = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len) {
		rbd_warn(NULL, "no monitor address(es) provided");
		return -EINVAL;
	}
	/* Monitor addresses are left in place; only remember the span */
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	/* Default error for the empty-token checks below */
	ret = -EINVAL;
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options) {
		rbd_warn(NULL, "no options provided");
		goto out_err;
	}

	spec = rbd_spec_alloc();
	if (!spec)
		goto out_mem;

	spec->pool_name = dup_token(&buf, NULL);
	if (!spec->pool_name)
		goto out_mem;
	if (!*spec->pool_name) {
		rbd_warn(NULL, "no pool name provided");
		goto out_err;
	}

	spec->image_name = dup_token(&buf, NULL);
	if (!spec->image_name)
		goto out_mem;
	if (!*spec->image_name) {
		rbd_warn(NULL, "no image name provided");
		goto out_err;
	}

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
	if (!spec->snap_name)
		goto out_mem;
	*(spec->snap_name + len) = '\0';

	/* Initialize all rbd options to the defaults */

	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		goto out_mem;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	/* ceph_parse_options() consumes rbd-specific tokens via callback */
	copts = ceph_parse_options(options, mon_addrs,
					mon_addrs + mon_addrs_size - 1,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	kfree(options);

	/* Transfer ownership of all three results to the caller */
	*ceph_opts = copts;
	*opts = rbd_opts;
	*rbd_spec = spec;

	return 0;
out_mem:
	ret = -ENOMEM;
out_err:
	kfree(rbd_opts);
	rbd_spec_put(spec);
	kfree(options);

	return ret;
}
3788
589d30e0
AE
3789/*
3790 * An rbd format 2 image has a unique identifier, distinct from the
3791 * name given to it by the user. Internally, that identifier is
3792 * what's used to specify the names of objects related to the image.
3793 *
3794 * A special "rbd id" object is used to map an rbd image name to its
3795 * id. If that object doesn't exist, then there is no v2 rbd image
3796 * with the supplied name.
3797 *
3798 * This function will record the given rbd_dev's image_id field if
3799 * it can be determined, and in that case will return 0. If any
3800 * errors occur a negative errno will be returned and the rbd_dev's
3801 * image_id field will be unchanged (and should be NULL).
3802 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.
	 */
	if (rbd_dev->spec->image_id)
		return 0;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	ret = rbd_obj_method_sync(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX, NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	/* Decode the returned string; the spec takes ownership of it */

	p = response;
	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						NULL, GFP_NOIO);
	if (IS_ERR(rbd_dev->spec->image_id)) {
		ret = PTR_ERR(rbd_dev->spec->image_id);
		rbd_dev->spec->image_id = NULL;
	} else {
		dout("image_id is %s\n", rbd_dev->spec->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
3863
/*
 * Probe an image as format 1: record an empty image id and the
 * "<image_name>.rbd" header object name, then read the on-disk
 * header.  Format 1 images never have a parent.
 *
 * Returns 0 on success or a negative errno; on failure the fields
 * set here are cleaned up again.
 */
static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;

	/* Version 1 images have no id; empty string is used */

	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
	if (!rbd_dev->spec->image_id)
		return -ENOMEM;

	/* Record the header object name for this rbd image. */

	size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name) {
		ret = -ENOMEM;
		goto out_err;
	}
	sprintf(rbd_dev->header_name, "%s%s",
		rbd_dev->spec->image_name, RBD_SUFFIX);

	/* Populate rbd image metadata */

	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (ret < 0)
		goto out_err;

	/* Version 1 images have no parent (no layering) */

	rbd_dev->parent_spec = NULL;
	rbd_dev->parent_overlap = 0;

	rbd_dev->image_format = 1;

	dout("discovered version 1 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;

out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;

	return ret;
}
3912
/*
 * Probe an image as format 2: the image id (already known) names the
 * "rbd_header.<id>" object, from which we fetch size/order, object
 * prefix, features, optional parent info, and the snapshot context.
 *
 * Returns 0 on success or a negative errno; on failure everything
 * set here is torn down again.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->spec->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* If the image supports layering, get the parent info */

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	rbd_dev->parent_overlap = 0;
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3985
/*
 * Second phase of image probing, run after the format-specific
 * (v1/v2) header probe has populated rbd_dev->header.  Registers the
 * device with the block layer and sysfs and announces the disk.
 *
 * Returns 0 on success; on failure everything acquired here is
 * unwound (in reverse order) and a negative errno is returned.
 */
static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
{
	int ret;

	/* no need to lock here, as rbd_dev is not registered yet */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_probe_update_spec(rbd_dev);
	if (ret)
		goto err_out_snaps;

	ret = rbd_dev_set_mapping(rbd_dev);
	if (ret)
		goto err_out_snaps;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	ret = register_blkdev(0, rbd_dev->name);
	if (ret < 0)
		goto err_out_id;
	rbd_dev->major = ret;	/* register_blkdev(0, ...) returns the major */

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_bus_add_dev(rbd_dev);
	if (ret)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */
	down_write(&rbd_dev->header_rwsem);
	ret = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (ret)
		goto err_out_bus;

	/* Start watching the header object for changes (1 == start) */
	ret = rbd_dev_header_watch_sync(rbd_dev, 1);
	if (ret)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return ret;
err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);

	return ret;
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);

	return ret;
}
4067
a30b71b9
AE
4068/*
4069 * Probe for the existence of the header object for the given rbd
4070 * device. For format 2 images this includes determining the image
4071 * id.
4072 */
4073static int rbd_dev_probe(struct rbd_device *rbd_dev)
4074{
4075 int ret;
4076
4077 /*
4078 * Get the id from the image id object. If it's not a
4079 * format 2 image, we'll get ENOENT back, and we'll assume
4080 * it's a format 1 image.
4081 */
4082 ret = rbd_dev_image_id(rbd_dev);
4083 if (ret)
4084 ret = rbd_dev_v1_probe(rbd_dev);
4085 else
4086 ret = rbd_dev_v2_probe(rbd_dev);
83a06263 4087 if (ret) {
a30b71b9
AE
4088 dout("probe failed, returning %d\n", ret);
4089
83a06263
AE
4090 return ret;
4091 }
4092
4093 ret = rbd_dev_probe_finish(rbd_dev);
4094 if (ret)
4095 rbd_header_free(&rbd_dev->header);
4096
a30b71b9
AE
4097 return ret;
4098}
4099
/*
 * Handle a write to /sys/bus/rbd/add.  Parses the user-supplied
 * specification in buf (monitor addresses, options, pool/image/snap),
 * connects to the cluster, and probes and instantiates the mapped
 * block device.
 *
 * Returns count on success, or a negative errno on failure.
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;

	/* Hold a module reference for the lifetime of the mapping */
	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto err_out_module;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}
	ceph_opts = NULL;	/* rbd_dev client now owns this */

	/* pick the pool */
	osdc = &rbdc->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
	if (rc < 0)
		goto err_out_client;
	spec->pool_id = (u64) rc;

	/* The ceph file layout needs to fit pool id in 32 bits */

	if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
		rc = -EIO;
		goto err_out_client;
	}

	rbd_dev = rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		goto err_out_client;	/* rc is still -ENOMEM here */
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */

	rbd_dev->mapping.read_only = rbd_opts->read_only;
	kfree(rbd_opts);
	rbd_opts = NULL;	/* done with this */

	rc = rbd_dev_probe(rbd_dev);
	if (rc < 0)
		goto err_out_rbd_dev;

	return count;
err_out_rbd_dev:
	/* rbd_dev owns rbdc/spec now, so this releases those too */
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	/* ceph_opts is non-NULL only if ownership never transferred */
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	kfree(rbd_opts);
	rbd_spec_put(spec);
err_out_module:
	module_put(THIS_MODULE);

	dout("Error adding device %s\n", buf);

	return (ssize_t) rc;
}
4172
de71a297 4173static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
602adf40
YS
4174{
4175 struct list_head *tmp;
4176 struct rbd_device *rbd_dev;
4177
e124a82f 4178 spin_lock(&rbd_dev_list_lock);
602adf40
YS
4179 list_for_each(tmp, &rbd_dev_list) {
4180 rbd_dev = list_entry(tmp, struct rbd_device, node);
de71a297 4181 if (rbd_dev->dev_id == dev_id) {
e124a82f 4182 spin_unlock(&rbd_dev_list_lock);
602adf40 4183 return rbd_dev;
e124a82f 4184 }
602adf40 4185 }
e124a82f 4186 spin_unlock(&rbd_dev_list_lock);
602adf40
YS
4187 return NULL;
4188}
4189
/*
 * Device-core release callback, invoked when the last reference to
 * the rbd device is dropped (after rbd_bus_del_dev()).  Tears down
 * what rbd_add() and the probe path set up, in reverse order.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Tear down the header watch if one was established (0 == stop) */
	if (rbd_dev->watch_event)
		rbd_dev_header_watch_sync(rbd_dev, 0);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	rbd_dev_id_put(rbd_dev);
	rbd_assert(rbd_dev->rbd_client != NULL);
	rbd_dev_destroy(rbd_dev);

	/* release module ref (taken in rbd_add()) */
	module_put(THIS_MODULE);
}
4212
dfc5606d
YS
4213static ssize_t rbd_remove(struct bus_type *bus,
4214 const char *buf,
4215 size_t count)
602adf40
YS
4216{
4217 struct rbd_device *rbd_dev = NULL;
4218 int target_id, rc;
4219 unsigned long ul;
4220 int ret = count;
4221
4222 rc = strict_strtoul(buf, 10, &ul);
4223 if (rc)
4224 return rc;
4225
4226 /* convert to int; abort if we lost anything in the conversion */
4227 target_id = (int) ul;
4228 if (target_id != ul)
4229 return -EINVAL;
4230
4231 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4232
4233 rbd_dev = __rbd_get_dev(target_id);
4234 if (!rbd_dev) {
4235 ret = -ENOENT;
4236 goto done;
42382b70
AE
4237 }
4238
a14ea269 4239 spin_lock_irq(&rbd_dev->lock);
b82d167b 4240 if (rbd_dev->open_count)
42382b70 4241 ret = -EBUSY;
b82d167b
AE
4242 else
4243 set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
a14ea269 4244 spin_unlock_irq(&rbd_dev->lock);
b82d167b 4245 if (ret < 0)
42382b70 4246 goto done;
602adf40 4247
41f38c2b 4248 rbd_remove_all_snaps(rbd_dev);
dfc5606d 4249 rbd_bus_del_dev(rbd_dev);
602adf40
YS
4250
4251done:
4252 mutex_unlock(&ctl_mutex);
aafb230e 4253
602adf40
YS
4254 return ret;
4255}
4256
602adf40
YS
4257/*
4258 * create control files in sysfs
dfc5606d 4259 * /sys/bus/rbd/...
602adf40
YS
4260 */
4261static int rbd_sysfs_init(void)
4262{
dfc5606d 4263 int ret;
602adf40 4264
fed4c143 4265 ret = device_register(&rbd_root_dev);
21079786 4266 if (ret < 0)
dfc5606d 4267 return ret;
602adf40 4268
fed4c143
AE
4269 ret = bus_register(&rbd_bus_type);
4270 if (ret < 0)
4271 device_unregister(&rbd_root_dev);
602adf40 4272
602adf40
YS
4273 return ret;
4274}
4275
/*
 * Undo rbd_sysfs_init(): unregister the bus type, then the root
 * device, in reverse order of registration.
 */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
4281
cc344fa1 4282static int __init rbd_init(void)
602adf40
YS
4283{
4284 int rc;
4285
1e32d34c
AE
4286 if (!libceph_compatible(NULL)) {
4287 rbd_warn(NULL, "libceph incompatibility (quitting)");
4288
4289 return -EINVAL;
4290 }
602adf40
YS
4291 rc = rbd_sysfs_init();
4292 if (rc)
4293 return rc;
f0f8cef5 4294 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
602adf40
YS
4295 return 0;
4296}
4297
/* Module exit: remove the sysfs bus/device entries created at init. */
static void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
4302
4303module_init(rbd_init);
4304module_exit(rbd_exit);
4305
4306MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
4307MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
4308MODULE_DESCRIPTION("rados block device");
4309
4310/* following authorship retained from original osdblk.c */
4311MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
4312
4313MODULE_LICENSE("GPL");
This page took 0.385628 seconds and 5 git commands to generate.