rbd: simplify snap_by_name() interface
[deliverable/linux.git] / drivers / block / rbd.c
CommitLineData
602adf40
YS
1/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
dfc5606d 24 For usage instructions, please refer to:
602adf40 25
dfc5606d 26 Documentation/ABI/testing/sysfs-bus-rbd
602adf40
YS
27
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
59c2be1e 34#include <linux/parser.h>
602adf40
YS
35
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
aafb230e
AE
44#define RBD_DEBUG /* Activate rbd_assert() calls */
45
593a9e7b
AE
46/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
df111be6
AE
55/* It might be useful to have this defined elsewhere too */
56
57#define U64_MAX ((u64) (~0ULL))
58
f0f8cef5
AE
59#define RBD_DRV_NAME "rbd"
60#define RBD_DRV_NAME_LONG "rbd (rados block device)"
602adf40
YS
61
62#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
63
602adf40
YS
64#define RBD_MAX_SNAP_NAME_LEN 32
65#define RBD_MAX_OPT_LEN 1024
66
67#define RBD_SNAP_HEAD_NAME "-"
68
81a89793
AE
69/*
70 * An RBD device name will be "rbd#", where the "rbd" comes from
71 * RBD_DRV_NAME above, and # is a unique integer identifier.
72 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
73 * enough to hold all possible device names.
74 */
602adf40 75#define DEV_NAME_LEN 32
81a89793 76#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
602adf40 77
cc0538b6 78#define RBD_READ_ONLY_DEFAULT false
59c2be1e 79
602adf40
YS
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These four fields never change for a given rbd image */
	char *object_prefix;		/* prefix of backing object names, NUL-terminated */
	__u8 obj_order;			/* log2 of the object (segment) size */
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;			/* image size in bytes */
	struct ceph_snap_context *snapc;	/* snapshot ids, newest first */
	char *snap_names;		/* NUL-separated snapshot names, same order as snapc */
	u64 *snap_sizes;		/* per-snapshot image size, same order as snapc */

	u64 obj_version;		/* header object version last read */
};
98
/* User-settable options parsed from the rbd_add() options string */
struct rbd_options {
	bool read_only;		/* map the device read-only */
};
102
/*
 * an instance of the client. multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client *client;
	struct kref kref;		/* shared-use reference count */
	struct list_head node;		/* entry in rbd_client_list */
};
111
/*
 * a request completion status
 */
struct rbd_req_status {
	int done;		/* nonzero once this sub-request completed */
	int rc;			/* completion result code */
	u64 bytes;		/* bytes transferred */
};
120
/*
 * a collection of requests
 */
struct rbd_req_coll {
	int total;			/* number of status[] slots */
	int num_done;			/* slots completed so far */
	struct kref kref;
	struct rbd_req_status status[0];	/* trailing per-request status array */
};
130
f0f8cef5
AE
/*
 * a single io request
 */
struct rbd_request {
	struct request *rq;		/* blk layer request */
	struct bio *bio;		/* cloned bio */
	struct page **pages;		/* list of used pages */
	u64 len;			/* request length in bytes */
	int coll_index;			/* this request's slot in coll->status[] */
	struct rbd_req_coll *coll;	/* owning collection, may be NULL */
};
142
dfc5606d
YS
/* In-memory state for one image snapshot, exposed as a sysfs device */
struct rbd_snap {
	struct device dev;
	const char *name;
	u64 size;		/* image size at the time of the snapshot */
	struct list_head node;	/* entry in rbd_dev->snaps */
	u64 id;			/* snapshot id */
};
150
f84344f3
AE
/* Describes what this device currently maps: the image head or a snapshot */
struct rbd_mapping {
	char *snap_name;	/* RBD_SNAP_HEAD_NAME when mapping the head */
	u64 snap_id;		/* CEPH_NOSNAP when mapping the head */
	u64 size;		/* size of the mapped image or snapshot */
	bool snap_exists;	/* false when mapping the head */
	bool read_only;
};
158
602adf40
YS
/*
 * a single device
 */
struct rbd_device {
	int dev_id;		/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;		/* blkdev's gendisk and rq */

	struct rbd_options rbd_opts;	/* options given at map time */
	struct rbd_client *rbd_client;	/* (possibly shared) ceph client */

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;		/* queue lock */

	struct rbd_image_header header;
	char *image_name;
	size_t image_name_len;
	char *header_name;		/* name of the image's header object */
	char *pool_name;
	int pool_id;

	struct ceph_osd_event   *watch_event;	/* header-change notification */
	struct ceph_osd_request *watch_request;

	/* protects updating the header */
	struct rw_semaphore header_rwsem;

	struct rbd_mapping mapping;	/* what this device maps (head or snap) */

	struct list_head node;		/* entry in rbd_dev_list */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
};
198
602adf40 199static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
e124a82f 200
602adf40 201static LIST_HEAD(rbd_dev_list); /* devices */
e124a82f
AE
202static DEFINE_SPINLOCK(rbd_dev_list_lock);
203
432b8587
AE
204static LIST_HEAD(rbd_client_list); /* clients */
205static DEFINE_SPINLOCK(rbd_client_list_lock);
602adf40 206
9fcbb800 207static int rbd_dev_snap_devs_update(struct rbd_device *rbd_dev);
dfc5606d 208static void rbd_dev_release(struct device *dev);
dfc5606d
YS
209static ssize_t rbd_snap_add(struct device *dev,
210 struct device_attribute *attr,
211 const char *buf,
212 size_t count);
14e7085d 213static void __rbd_remove_snap_dev(struct rbd_snap *snap);
dfc5606d 214
f0f8cef5
AE
215static ssize_t rbd_add(struct bus_type *bus, const char *buf,
216 size_t count);
217static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
218 size_t count);
219
/* /sys/bus/rbd control files: write-only "add" and "remove" */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

/* Empty release: rbd_root_dev is static, nothing to free */
static void rbd_root_dev_release(struct device *dev)
{
}

/* Parent device for all rbd devices in sysfs */
static struct device rbd_root_dev = {
	.init_name = "rbd",
	.release = rbd_root_dev_release,
};
239
aafb230e
AE
240#ifdef RBD_DEBUG
241#define rbd_assert(expr) \
242 if (unlikely(!(expr))) { \
243 printk(KERN_ERR "\nAssertion failure in %s() " \
244 "at line %d:\n\n" \
245 "\trbd_assert(%s);\n\n", \
246 __func__, __LINE__, #expr); \
247 BUG(); \
248 }
249#else /* !RBD_DEBUG */
250# define rbd_assert(expr) ((void) 0)
251#endif /* !RBD_DEBUG */
dfc5606d 252
dfc5606d
YS
/* Take a reference on the device via its embedded sysfs device */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}
257
/* Drop a reference taken with rbd_get_dev() */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}
602adf40 262
1fe5e993 263static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
59c2be1e 264
602adf40
YS
265static int rbd_open(struct block_device *bdev, fmode_t mode)
266{
f0f8cef5 267 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
602adf40 268
f84344f3 269 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
602adf40
YS
270 return -EROFS;
271
340c7a2b 272 rbd_get_dev(rbd_dev);
f84344f3 273 set_device_ro(bdev, rbd_dev->mapping.read_only);
340c7a2b 274
602adf40
YS
275 return 0;
276}
277
dfc5606d
YS
278static int rbd_release(struct gendisk *disk, fmode_t mode)
279{
280 struct rbd_device *rbd_dev = disk->private_data;
281
282 rbd_put_dev(rbd_dev);
283
284 return 0;
285}
286
602adf40
YS
/* Block device operations for /dev/rbd<N> nodes */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
292
/*
 * Initialize an rbd client instance.
 * We own *ceph_opts.
 *
 * Creates a ceph client from ceph_opts, opens a session, and adds the
 * new rbd_client to the global list.  Ownership of ceph_opts passes to
 * the ceph client on successful creation; on any earlier failure it is
 * destroyed here.  Returns the new client or an ERR_PTR.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("rbd_client_create\n");
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	/* nested: rbd_add() may already hold ctl_mutex when we get here */
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	/*
	 * NOTE(review): if ceph_create_client() fails we return -ENOMEM
	 * rather than PTR_ERR(rbdc->client) — the real error code is
	 * discarded; confirm whether that is intentional.
	 */
	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
	if (IS_ERR(rbdc->client))
		goto out_mutex;
	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_err;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	mutex_unlock(&ctl_mutex);

	dout("rbd_client_create created %p\n", rbdc);
	return rbdc;

out_err:
	ceph_destroy_client(rbdc->client);
out_mutex:
	mutex_unlock(&ctl_mutex);
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	return ERR_PTR(ret);
}
340
341/*
1f7ba331
AE
342 * Find a ceph client with specific addr and configuration. If
343 * found, bump its reference count.
602adf40 344 */
1f7ba331 345static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
602adf40
YS
346{
347 struct rbd_client *client_node;
1f7ba331 348 bool found = false;
602adf40 349
43ae4701 350 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
602adf40
YS
351 return NULL;
352
1f7ba331
AE
353 spin_lock(&rbd_client_list_lock);
354 list_for_each_entry(client_node, &rbd_client_list, node) {
355 if (!ceph_compare_options(ceph_opts, client_node->client)) {
356 kref_get(&client_node->kref);
357 found = true;
358 break;
359 }
360 }
361 spin_unlock(&rbd_client_list_lock);
362
363 return found ? client_node : NULL;
602adf40
YS
364}
365
59c2be1e
YS
/*
 * mount options
 *
 * Token values are grouped by argument type; parse_rbd_opts_token()
 * uses the Opt_last_* sentinels to classify a token before acting on it.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};
379
43ae4701 380static match_table_t rbd_opts_tokens = {
59c2be1e
YS
381 /* int args above */
382 /* string args above */
f84344f3 383 {Opt_read_only, "mapping.read_only"},
cc0538b6
AE
384 {Opt_read_only, "ro"}, /* Alternate spelling */
385 {Opt_read_write, "read_write"},
386 {Opt_read_write, "rw"}, /* Alternate spelling */
387 /* Boolean args above */
59c2be1e
YS
388 {-1, NULL}
389};
390
/*
 * Parse one token from the rbd options string.  @private is the
 * struct rbd_options being filled in.  Returns 0 on success, -EINVAL
 * for an unrecognized token, or the match_int() error for a malformed
 * integer argument.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* Classify the token by the Opt_last_* sentinels to decode its arg */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* every token in rbd_opts_tokens must be handled above */
		rbd_assert(false);
		break;
	}
	return 0;
}
431
602adf40
YS
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.
 *
 * Parses @options (also filling in rbd_dev->rbd_opts via the token
 * callback), then either shares an existing matching client or creates
 * a new one.  On success rbd_dev->rbd_client holds a referenced client
 * and 0 is returned; otherwise a negative errno.
 */
static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
			  size_t mon_addr_len, char *options)
{
	struct rbd_options *rbd_opts = &rbd_dev->rbd_opts;
	struct ceph_options *ceph_opts;
	struct rbd_client *rbdc;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	ceph_opts = ceph_parse_options(options, mon_addr,
					mon_addr + mon_addr_len,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(ceph_opts))
		return PTR_ERR(ceph_opts);

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		/* using an existing client; it took no ownership of ceph_opts */
		ceph_destroy_options(ceph_opts);
	} else {
		/* rbd_client_create() owns ceph_opts from here on */
		rbdc = rbd_client_create(ceph_opts);
		if (IS_ERR(rbdc))
			return PTR_ERR(rbdc);
	}
	rbd_dev->rbd_client = rbdc;

	return 0;
}
464
/*
 * Destroy ceph client
 *
 * Takes rbd_client_list_lock itself to unlink the client, so the
 * caller must NOT hold that lock.  (An earlier comment here claimed
 * the caller must hold it; the locking has since moved inside.)
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
482
483/*
484 * Drop reference to ceph client node. If it's not referenced anymore, release
485 * it.
486 */
487static void rbd_put_client(struct rbd_device *rbd_dev)
488{
489 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
490 rbd_dev->rbd_client = NULL;
602adf40
YS
491}
492
1fec7093
YS
493/*
494 * Destroy requests collection
495 */
496static void rbd_coll_release(struct kref *kref)
497{
498 struct rbd_req_coll *coll =
499 container_of(kref, struct rbd_req_coll, kref);
500
501 dout("rbd_coll_release %p\n", coll);
502 kfree(coll);
503}
602adf40 504
8e94af8e
AE
505static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
506{
103a150f
AE
507 size_t size;
508 u32 snap_count;
509
510 /* The header has to start with the magic rbd header text */
511 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
512 return false;
513
514 /*
515 * The size of a snapshot header has to fit in a size_t, and
516 * that limits the number of snapshots.
517 */
518 snap_count = le32_to_cpu(ondisk->snap_count);
519 size = SIZE_MAX - sizeof (struct ceph_snap_context);
520 if (snap_count > size / sizeof (__le64))
521 return false;
522
523 /*
524 * Not only that, but the size of the entire the snapshot
525 * header must also be representable in a size_t.
526 */
527 size -= snap_count * sizeof (__le64);
528 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
529 return false;
530
531 return true;
8e94af8e
AE
532}
533
602adf40
YS
534/*
535 * Create a new header structure, translate header format from the on-disk
536 * header.
537 */
538static int rbd_header_from_disk(struct rbd_image_header *header,
4156d998 539 struct rbd_image_header_ondisk *ondisk)
602adf40 540{
ccece235 541 u32 snap_count;
58c17b0e 542 size_t len;
d2bb24e5 543 size_t size;
621901d6 544 u32 i;
602adf40 545
6a52325f
AE
546 memset(header, 0, sizeof (*header));
547
103a150f
AE
548 snap_count = le32_to_cpu(ondisk->snap_count);
549
58c17b0e
AE
550 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
551 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
6a52325f 552 if (!header->object_prefix)
602adf40 553 return -ENOMEM;
58c17b0e
AE
554 memcpy(header->object_prefix, ondisk->object_prefix, len);
555 header->object_prefix[len] = '\0';
00f1f36f 556
602adf40 557 if (snap_count) {
f785cc1d
AE
558 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
559
621901d6
AE
560 /* Save a copy of the snapshot names */
561
f785cc1d
AE
562 if (snap_names_len > (u64) SIZE_MAX)
563 return -EIO;
564 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
602adf40 565 if (!header->snap_names)
6a52325f 566 goto out_err;
f785cc1d
AE
567 /*
568 * Note that rbd_dev_v1_header_read() guarantees
569 * the ondisk buffer we're working with has
570 * snap_names_len bytes beyond the end of the
571 * snapshot id array, this memcpy() is safe.
572 */
573 memcpy(header->snap_names, &ondisk->snaps[snap_count],
574 snap_names_len);
6a52325f 575
621901d6
AE
576 /* Record each snapshot's size */
577
d2bb24e5
AE
578 size = snap_count * sizeof (*header->snap_sizes);
579 header->snap_sizes = kmalloc(size, GFP_KERNEL);
602adf40 580 if (!header->snap_sizes)
6a52325f 581 goto out_err;
621901d6
AE
582 for (i = 0; i < snap_count; i++)
583 header->snap_sizes[i] =
584 le64_to_cpu(ondisk->snaps[i].image_size);
602adf40 585 } else {
ccece235 586 WARN_ON(ondisk->snap_names_len);
602adf40
YS
587 header->snap_names = NULL;
588 header->snap_sizes = NULL;
589 }
849b4260 590
602adf40
YS
591 header->obj_order = ondisk->options.order;
592 header->crypt_type = ondisk->options.crypt_type;
593 header->comp_type = ondisk->options.comp_type;
6a52325f 594
621901d6
AE
595 /* Allocate and fill in the snapshot context */
596
f84344f3 597 header->image_size = le64_to_cpu(ondisk->image_size);
6a52325f
AE
598 size = sizeof (struct ceph_snap_context);
599 size += snap_count * sizeof (header->snapc->snaps[0]);
600 header->snapc = kzalloc(size, GFP_KERNEL);
601 if (!header->snapc)
602 goto out_err;
602adf40
YS
603
604 atomic_set(&header->snapc->nref, 1);
505cbb9b 605 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
602adf40 606 header->snapc->num_snaps = snap_count;
621901d6
AE
607 for (i = 0; i < snap_count; i++)
608 header->snapc->snaps[i] =
609 le64_to_cpu(ondisk->snaps[i].id);
602adf40
YS
610
611 return 0;
612
6a52325f 613out_err:
849b4260 614 kfree(header->snap_sizes);
ccece235 615 header->snap_sizes = NULL;
602adf40 616 kfree(header->snap_names);
ccece235 617 header->snap_names = NULL;
6a52325f
AE
618 kfree(header->object_prefix);
619 header->object_prefix = NULL;
ccece235 620
00f1f36f 621 return -ENOMEM;
602adf40
YS
622}
623
8836b995 624static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
602adf40
YS
625{
626 int i;
8836b995 627 struct rbd_image_header *header = &rbd_dev->header;
602adf40
YS
628 char *p = header->snap_names;
629
c9aadfe7
AE
630 rbd_assert(header->snapc != NULL);
631 for (i = 0; i < header->snapc->num_snaps; i++) {
00f1f36f 632 if (!strcmp(snap_name, p)) {
602adf40 633
00f1f36f 634 /* Found it. Pass back its id and/or size */
602adf40 635
8836b995
AE
636 rbd_dev->mapping.snap_id = header->snapc->snaps[i];
637 rbd_dev->mapping.size = header->snap_sizes[i];
638
00f1f36f
AE
639 return i;
640 }
641 p += strlen(p) + 1; /* Skip ahead to the next name */
642 }
643 return -ENOENT;
602adf40
YS
644}
645
4e1105a2 646static int rbd_header_set_snap(struct rbd_device *rbd_dev, char *snap_name)
602adf40 647{
78dc447d 648 int ret;
602adf40 649
0ce1a794 650 down_write(&rbd_dev->header_rwsem);
602adf40 651
4e1105a2 652 if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME,
cc9d734c 653 sizeof (RBD_SNAP_HEAD_NAME))) {
f84344f3 654 rbd_dev->mapping.snap_id = CEPH_NOSNAP;
99c1f08f 655 rbd_dev->mapping.size = rbd_dev->header.image_size;
f84344f3
AE
656 rbd_dev->mapping.snap_exists = false;
657 rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only;
602adf40 658 } else {
8836b995 659 ret = snap_by_name(rbd_dev, snap_name);
602adf40
YS
660 if (ret < 0)
661 goto done;
f84344f3
AE
662 rbd_dev->mapping.snap_exists = true;
663 rbd_dev->mapping.read_only = true;
602adf40 664 }
4e1105a2 665 rbd_dev->mapping.snap_name = snap_name;
602adf40
YS
666
667 ret = 0;
668done:
0ce1a794 669 up_write(&rbd_dev->header_rwsem);
602adf40
YS
670 return ret;
671}
672
/*
 * Release everything an rbd_image_header owns and NULL the pointers
 * so a repeated call (or a later free of the header) is safe.
 */
static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	header->object_prefix = NULL;
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	ceph_put_snap_context(header->snapc);
	header->snapc = NULL;
}
684
65ccfe21 685static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
602adf40 686{
65ccfe21
AE
687 char *name;
688 u64 segment;
689 int ret;
602adf40 690
65ccfe21
AE
691 name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
692 if (!name)
693 return NULL;
694 segment = offset >> rbd_dev->header.obj_order;
695 ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
696 rbd_dev->header.object_prefix, segment);
697 if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
698 pr_err("error formatting segment name for #%llu (%d)\n",
699 segment, ret);
700 kfree(name);
701 name = NULL;
702 }
602adf40 703
65ccfe21
AE
704 return name;
705}
602adf40 706
65ccfe21
AE
707static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
708{
709 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
602adf40 710
65ccfe21
AE
711 return offset & (segment_size - 1);
712}
713
714static u64 rbd_segment_length(struct rbd_device *rbd_dev,
715 u64 offset, u64 length)
716{
717 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
718
719 offset &= segment_size - 1;
720
aafb230e 721 rbd_assert(length <= U64_MAX - offset);
65ccfe21
AE
722 if (offset + length > segment_size)
723 length = segment_size - offset;
724
725 return length;
602adf40
YS
726}
727
1fec7093
YS
728static int rbd_get_num_segments(struct rbd_image_header *header,
729 u64 ofs, u64 len)
730{
df111be6
AE
731 u64 start_seg;
732 u64 end_seg;
733
734 if (!len)
735 return 0;
736 if (len - 1 > U64_MAX - ofs)
737 return -ERANGE;
738
739 start_seg = ofs >> header->obj_order;
740 end_seg = (ofs + len - 1) >> header->obj_order;
741
1fec7093
YS
742 return end_seg - start_seg + 1;
743}
744
029bcbd8
JD
745/*
746 * returns the size of an object in the image
747 */
748static u64 rbd_obj_bytes(struct rbd_image_header *header)
749{
750 return 1 << header->obj_order;
751}
752
602adf40
YS
753/*
754 * bio helpers
755 */
756
757static void bio_chain_put(struct bio *chain)
758{
759 struct bio *tmp;
760
761 while (chain) {
762 tmp = chain;
763 chain = chain->bi_next;
764 bio_put(tmp);
765 }
766}
767
/*
 * zeros a bio chain, starting at specific offset
 *
 * Walks every segment of every bio in the chain, zero-filling data
 * from byte offset start_ofs (relative to the start of the chain) to
 * the end.  Segments wholly before start_ofs are left untouched.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* byte position within the whole chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* zero from start_ofs (or the segment start) */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
794
/*
 * bio_chain_clone - clone a chain of bios up to a certain length.
 * might return a bio_pair that will need to be released.
 *
 * Clones bios from *old until exactly len bytes are covered, splitting
 * the final bio when it straddles the boundary.  On return *old points
 * at the first unconsumed bio, *next at where the remainder of the
 * request continues, and the cloned chain is returned (NULL on
 * allocation failure, with any partial clone released).
 */
static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
				   struct bio_pair **bp,
				   int len, gfp_t gfpmask)
{
	struct bio *old_chain = *old;
	struct bio *new_chain = NULL;
	struct bio *tail;	/* last bio appended to new_chain */
	int total = 0;

	/* release the pair left over from a previous call, if any */
	if (*bp) {
		bio_pair_release(*bp);
		*bp = NULL;
	}

	while (old_chain && (total < len)) {
		struct bio *tmp;

		tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
		if (!tmp)
			goto err_out;
		gfpmask &= ~__GFP_WAIT;	/* can't wait after the first */

		if (total + old_chain->bi_size > len) {
			/*
			 * NOTE(review): this local bp shadows the bp
			 * parameter — the split pair is only handed back
			 * via *next (as &bp->bio2), never stored in *bp.
			 * Confirm the caller's release path covers it.
			 */
			struct bio_pair *bp;

			/*
			 * this split can only happen with a single paged bio,
			 * split_bio will BUG_ON if this is not the case
			 */
			dout("bio_chain_clone split! total=%d remaining=%d"
			     "bi_size=%u\n",
			     total, len - total, old_chain->bi_size);

			/* split the bio. We'll release it either in the next
			   call, or it will have to be released outside */
			bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
			if (!bp)
				goto err_out;

			__bio_clone(tmp, &bp->bio1);

			*next = &bp->bio2;
		} else {
			__bio_clone(tmp, old_chain);
			*next = old_chain->bi_next;
		}

		tmp->bi_bdev = NULL;
		tmp->bi_next = NULL;
		if (new_chain)
			tail->bi_next = tmp;
		else
			new_chain = tmp;
		tail = tmp;
		old_chain = old_chain->bi_next;

		total += tmp->bi_size;
	}

	rbd_assert(total == len);

	*old = old_chain;

	return new_chain;

err_out:
	dout("bio_chain_clone with err\n");
	bio_chain_put(new_chain);
	return NULL;
}
869
870/*
871 * helpers for osd request op vectors.
872 */
57cfc106
AE
873static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
874 int opcode, u32 payload_len)
602adf40 875{
57cfc106
AE
876 struct ceph_osd_req_op *ops;
877
878 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
879 if (!ops)
880 return NULL;
881
882 ops[0].op = opcode;
883
602adf40
YS
884 /*
885 * op extent offset and length will be set later on
886 * in calc_raw_layout()
887 */
57cfc106
AE
888 ops[0].payload_len = payload_len;
889
890 return ops;
602adf40
YS
891}
892
/* Free an op array allocated by rbd_create_rw_ops() */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
897
1fec7093
YS
/*
 * Record completion of one sub-request of a collection and complete,
 * in order, the maximal run of consecutive finished sub-requests
 * toward the block layer.  With no collection the whole request is
 * completed at once.  The queue lock serializes updates to the
 * collection's status array.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, ret, (unsigned long long) len);

	if (!rq)
		return;

	if (!coll) {
		/* not part of a collection: complete directly */
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	/* advance past every consecutively-finished sub-request */
	max = min = coll->num_done;
	while (max < coll->total && coll->status[max].done)
		max++;

	for (i = min; i < max; i++) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		/* each completed sub-request drops one collection ref */
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
935
/* Complete the sub-request described by req with the given result */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
941
602adf40
YS
/*
 * Send ceph osd request
 *
 * Builds and submits an osd request for object_name covering
 * [ofs, ofs + len), carrying either a bio chain or a page vector.
 * When rbd_cb is NULL the call is synchronous: it waits for the
 * request, optionally reports the reassert version via *ver, and
 * drops the request reference itself.  When rbd_cb is set, completion
 * (including freeing req_data) happens in the callback.  On early
 * failure the associated collection slot is completed with the error.
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *ops,
			  struct rbd_req_coll *coll,
			  int coll_index,
			  void (*rbd_cb)(struct ceph_osd_request *req,
					 struct ceph_msg *msg),
			  struct ceph_osd_request **linger_req,
			  u64 *ver)
{
	struct ceph_osd_request *req;
	struct ceph_file_layout *layout;
	int ret;
	u64 bno;
	struct timespec mtime = CURRENT_TIME;
	struct rbd_request *req_data;
	struct ceph_osd_request_head *reqhead;
	struct ceph_osd_client *osdc;

	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
	if (!req_data) {
		if (coll)
			rbd_coll_end_req_index(rq, coll, coll_index,
					       -ENOMEM, len);
		return -ENOMEM;
	}

	if (coll) {
		req_data->coll = coll;
		req_data->coll_index = coll_index;
	}

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
	     (unsigned long long) ofs, (unsigned long long) len);

	osdc = &rbd_dev->rbd_client->client->osdc;
	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
				      false, GFP_NOIO, pages, bio);
	if (!req) {
		ret = -ENOMEM;
		goto done_pages;
	}

	req->r_callback = rbd_cb;

	req_data->rq = rq;
	req_data->bio = bio;
	req_data->pages = pages;
	req_data->len = len;

	req->r_priv = req_data;

	reqhead = req->r_request->front.iov_base;
	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);

	/*
	 * NOTE(review): strncpy does not guarantee NUL-termination if
	 * object_name fills r_oid exactly; the following strlen assumes
	 * it is terminated — confirm object names are always shorter
	 * than sizeof(req->r_oid).
	 */
	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
	req->r_oid_len = strlen(req->r_oid);

	/* single-object "striping": one object holds the whole extent */
	layout = &req->r_file_layout;
	memset(layout, 0, sizeof(*layout));
	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_stripe_count = cpu_to_le32(1);
	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
	ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
			     req, ops);

	ceph_osdc_build_request(req, ofs, &len,
				ops,
				snapc,
				&mtime,
				req->r_oid, req->r_oid_len);

	if (linger_req) {
		/* keep the request resubmitted across osd map changes */
		ceph_osdc_set_request_linger(osdc, req);
		*linger_req = req;
	}

	ret = ceph_osdc_start_request(osdc, req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* synchronous: wait here and drop our request reference */
		ret = ceph_osdc_wait_request(osdc, req);
		if (ver)
			*ver = le64_to_cpu(req->r_reassert_version.version);
		dout("reassert_ver=%llu\n",
		     (unsigned long long)
			le64_to_cpu(req->r_reassert_version.version));
		ceph_osdc_put_request(req);
	}
	return ret;

done_err:
	bio_chain_put(req_data->bio);
	ceph_osdc_put_request(req);
done_pages:
	rbd_coll_end_req(req_data, ret, len);
	kfree(req_data);
	return ret;
}
1053
/*
 * Ceph osd op callback
 *
 * Completion handler for asynchronous requests submitted by
 * rbd_do_request().  Decodes the reply, papers over two read cases
 * (a missing object reads as zeros; a short read is zero-padded),
 * completes the collection slot, and frees the per-request state.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
	     (unsigned long long) bytes, read_op, (int) rc);

	if (rc == -ENOENT && read_op) {
		/* nonexistent object: a read of it is all zeros */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* short read: zero-fill the rest of the request */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1093
59c2be1e
YS
/* Minimal completion callback: just drop the request reference */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1098
602adf40
YS
/*
 * Do a synchronous ceph osd operation
 *
 * Allocates a page vector covering [ofs, ofs + len), submits the ops
 * synchronously via rbd_do_request(), and for reads copies the result
 * into buf (when supplied).  Returns the number of bytes read, 0, or
 * a negative errno.  The page vector is always released before return.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int flags,
			   struct ceph_osd_req_op *ops,
			   const char *object_name,
			   u64 ofs, u64 len,
			   char *buf,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;

	rbd_assert(ops != NULL);

	num_pages = calc_pages_for(ofs , len);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	/* rbd_cb == NULL makes rbd_do_request() wait for completion */
	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
			     object_name, ofs, len, NULL,
			     pages, num_pages,
			     flags,
			     ops,
			     NULL, 0,
			     NULL,
			     linger_req, ver);
	if (ret < 0)
		goto done;

	/* on a read, ret is the byte count to copy out */
	if ((flags & CEPH_OSD_FLAG_READ) && buf)
		ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);

done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1142
1143/*
1144 * Do an asynchronous ceph osd operation
1145 */
1146static int rbd_do_op(struct request *rq,
0ce1a794 1147 struct rbd_device *rbd_dev,
602adf40
YS
1148 struct ceph_snap_context *snapc,
1149 u64 snapid,
d1f57ea6 1150 int opcode, int flags,
602adf40 1151 u64 ofs, u64 len,
1fec7093
YS
1152 struct bio *bio,
1153 struct rbd_req_coll *coll,
1154 int coll_index)
602adf40
YS
1155{
1156 char *seg_name;
1157 u64 seg_ofs;
1158 u64 seg_len;
1159 int ret;
1160 struct ceph_osd_req_op *ops;
1161 u32 payload_len;
1162
65ccfe21 1163 seg_name = rbd_segment_name(rbd_dev, ofs);
602adf40
YS
1164 if (!seg_name)
1165 return -ENOMEM;
65ccfe21
AE
1166 seg_len = rbd_segment_length(rbd_dev, ofs, len);
1167 seg_ofs = rbd_segment_offset(rbd_dev, ofs);
602adf40
YS
1168
1169 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
1170
57cfc106
AE
1171 ret = -ENOMEM;
1172 ops = rbd_create_rw_ops(1, opcode, payload_len);
1173 if (!ops)
602adf40
YS
1174 goto done;
1175
1176 /* we've taken care of segment sizes earlier when we
1177 cloned the bios. We should never have a segment
1178 truncated at this point */
aafb230e 1179 rbd_assert(seg_len == len);
602adf40
YS
1180
1181 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1182 seg_name, seg_ofs, seg_len,
1183 bio,
1184 NULL, 0,
1185 flags,
1186 ops,
1fec7093 1187 coll, coll_index,
59c2be1e 1188 rbd_req_cb, 0, NULL);
11f77002
SW
1189
1190 rbd_destroy_ops(ops);
602adf40
YS
1191done:
1192 kfree(seg_name);
1193 return ret;
1194}
1195
1196/*
1197 * Request async osd write
1198 */
1199static int rbd_req_write(struct request *rq,
1200 struct rbd_device *rbd_dev,
1201 struct ceph_snap_context *snapc,
1202 u64 ofs, u64 len,
1fec7093
YS
1203 struct bio *bio,
1204 struct rbd_req_coll *coll,
1205 int coll_index)
602adf40
YS
1206{
1207 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1208 CEPH_OSD_OP_WRITE,
1209 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1fec7093 1210 ofs, len, bio, coll, coll_index);
602adf40
YS
1211}
1212
1213/*
1214 * Request async osd read
1215 */
1216static int rbd_req_read(struct request *rq,
1217 struct rbd_device *rbd_dev,
1218 u64 snapid,
1219 u64 ofs, u64 len,
1fec7093
YS
1220 struct bio *bio,
1221 struct rbd_req_coll *coll,
1222 int coll_index)
602adf40
YS
1223{
1224 return rbd_do_op(rq, rbd_dev, NULL,
b06e6a6b 1225 snapid,
602adf40
YS
1226 CEPH_OSD_OP_READ,
1227 CEPH_OSD_FLAG_READ,
1fec7093 1228 ofs, len, bio, coll, coll_index);
602adf40
YS
1229}
1230
1231/*
1232 * Request sync osd read
1233 */
0ce1a794 1234static int rbd_req_sync_read(struct rbd_device *rbd_dev,
602adf40 1235 u64 snapid,
aded07ea 1236 const char *object_name,
602adf40 1237 u64 ofs, u64 len,
59c2be1e
YS
1238 char *buf,
1239 u64 *ver)
602adf40 1240{
913d2fdc
AE
1241 struct ceph_osd_req_op *ops;
1242 int ret;
1243
1244 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1245 if (!ops)
1246 return -ENOMEM;
1247
1248 ret = rbd_req_sync_op(rbd_dev, NULL,
b06e6a6b 1249 snapid,
602adf40 1250 CEPH_OSD_FLAG_READ,
913d2fdc
AE
1251 ops, object_name, ofs, len, buf, NULL, ver);
1252 rbd_destroy_ops(ops);
1253
1254 return ret;
602adf40
YS
1255}
1256
1257/*
59c2be1e
YS
1258 * Request sync osd watch
1259 */
0ce1a794 1260static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
59c2be1e 1261 u64 ver,
7f0a24d8 1262 u64 notify_id)
59c2be1e
YS
1263{
1264 struct ceph_osd_req_op *ops;
11f77002
SW
1265 int ret;
1266
57cfc106
AE
1267 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
1268 if (!ops)
1269 return -ENOMEM;
59c2be1e 1270
a71b891b 1271 ops[0].watch.ver = cpu_to_le64(ver);
59c2be1e
YS
1272 ops[0].watch.cookie = notify_id;
1273 ops[0].watch.flag = 0;
1274
0ce1a794 1275 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
7f0a24d8 1276 rbd_dev->header_name, 0, 0, NULL,
ad4f232f 1277 NULL, 0,
59c2be1e
YS
1278 CEPH_OSD_FLAG_READ,
1279 ops,
1fec7093 1280 NULL, 0,
59c2be1e
YS
1281 rbd_simple_req_cb, 0, NULL);
1282
1283 rbd_destroy_ops(ops);
1284 return ret;
1285}
1286
1287static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1288{
0ce1a794 1289 struct rbd_device *rbd_dev = (struct rbd_device *)data;
a71b891b 1290 u64 hver;
13143d2d
SW
1291 int rc;
1292
0ce1a794 1293 if (!rbd_dev)
59c2be1e
YS
1294 return;
1295
bd919d45
AE
1296 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1297 rbd_dev->header_name, (unsigned long long) notify_id,
1298 (unsigned int) opcode);
1fe5e993 1299 rc = rbd_refresh_header(rbd_dev, &hver);
13143d2d 1300 if (rc)
f0f8cef5 1301 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
0ce1a794 1302 " update snaps: %d\n", rbd_dev->major, rc);
59c2be1e 1303
7f0a24d8 1304 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
59c2be1e
YS
1305}
1306
1307/*
1308 * Request sync osd watch
1309 */
0e6f322d 1310static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
59c2be1e
YS
1311{
1312 struct ceph_osd_req_op *ops;
0ce1a794 1313 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
57cfc106 1314 int ret;
59c2be1e 1315
57cfc106
AE
1316 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1317 if (!ops)
1318 return -ENOMEM;
59c2be1e
YS
1319
1320 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
0ce1a794 1321 (void *)rbd_dev, &rbd_dev->watch_event);
59c2be1e
YS
1322 if (ret < 0)
1323 goto fail;
1324
0e6f322d 1325 ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
0ce1a794 1326 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
59c2be1e
YS
1327 ops[0].watch.flag = 1;
1328
0ce1a794 1329 ret = rbd_req_sync_op(rbd_dev, NULL,
59c2be1e 1330 CEPH_NOSNAP,
59c2be1e
YS
1331 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1332 ops,
0e6f322d
AE
1333 rbd_dev->header_name,
1334 0, 0, NULL,
0ce1a794 1335 &rbd_dev->watch_request, NULL);
59c2be1e
YS
1336
1337 if (ret < 0)
1338 goto fail_event;
1339
1340 rbd_destroy_ops(ops);
1341 return 0;
1342
1343fail_event:
0ce1a794
AE
1344 ceph_osdc_cancel_event(rbd_dev->watch_event);
1345 rbd_dev->watch_event = NULL;
59c2be1e
YS
1346fail:
1347 rbd_destroy_ops(ops);
1348 return ret;
1349}
1350
79e3057c
YS
1351/*
1352 * Request sync osd unwatch
1353 */
070c633f 1354static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
79e3057c
YS
1355{
1356 struct ceph_osd_req_op *ops;
57cfc106 1357 int ret;
79e3057c 1358
57cfc106
AE
1359 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1360 if (!ops)
1361 return -ENOMEM;
79e3057c
YS
1362
1363 ops[0].watch.ver = 0;
0ce1a794 1364 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
79e3057c
YS
1365 ops[0].watch.flag = 0;
1366
0ce1a794 1367 ret = rbd_req_sync_op(rbd_dev, NULL,
79e3057c 1368 CEPH_NOSNAP,
79e3057c
YS
1369 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1370 ops,
070c633f
AE
1371 rbd_dev->header_name,
1372 0, 0, NULL, NULL, NULL);
1373
79e3057c
YS
1374
1375 rbd_destroy_ops(ops);
0ce1a794
AE
1376 ceph_osdc_cancel_event(rbd_dev->watch_event);
1377 rbd_dev->watch_event = NULL;
79e3057c
YS
1378 return ret;
1379}
1380
/* Context handed to rbd_notify_cb() for a sync notify. */
struct rbd_notify_info {
	struct rbd_device *rbd_dev;
};
1384
1385static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1386{
0ce1a794
AE
1387 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1388 if (!rbd_dev)
59c2be1e
YS
1389 return;
1390
bd919d45
AE
1391 dout("rbd_notify_cb %s notify_id=%llu opcode=%u\n",
1392 rbd_dev->header_name, (unsigned long long) notify_id,
1393 (unsigned int) opcode);
59c2be1e
YS
1394}
1395
1396/*
1397 * Request sync osd notify
1398 */
4cb16250 1399static int rbd_req_sync_notify(struct rbd_device *rbd_dev)
59c2be1e
YS
1400{
1401 struct ceph_osd_req_op *ops;
0ce1a794 1402 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
59c2be1e
YS
1403 struct ceph_osd_event *event;
1404 struct rbd_notify_info info;
1405 int payload_len = sizeof(u32) + sizeof(u32);
1406 int ret;
1407
57cfc106
AE
1408 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY, payload_len);
1409 if (!ops)
1410 return -ENOMEM;
59c2be1e 1411
0ce1a794 1412 info.rbd_dev = rbd_dev;
59c2be1e
YS
1413
1414 ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
1415 (void *)&info, &event);
1416 if (ret < 0)
1417 goto fail;
1418
1419 ops[0].watch.ver = 1;
1420 ops[0].watch.flag = 1;
1421 ops[0].watch.cookie = event->cookie;
1422 ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
1423 ops[0].watch.timeout = 12;
1424
0ce1a794 1425 ret = rbd_req_sync_op(rbd_dev, NULL,
59c2be1e 1426 CEPH_NOSNAP,
59c2be1e
YS
1427 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1428 ops,
4cb16250
AE
1429 rbd_dev->header_name,
1430 0, 0, NULL, NULL, NULL);
59c2be1e
YS
1431 if (ret < 0)
1432 goto fail_event;
1433
1434 ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
1435 dout("ceph_osdc_wait_event returned %d\n", ret);
1436 rbd_destroy_ops(ops);
1437 return 0;
1438
1439fail_event:
1440 ceph_osdc_cancel_event(event);
1441fail:
1442 rbd_destroy_ops(ops);
1443 return ret;
1444}
1445
602adf40
YS
1446/*
1447 * Request sync osd read
1448 */
0ce1a794 1449static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
aded07ea
AE
1450 const char *object_name,
1451 const char *class_name,
1452 const char *method_name,
602adf40 1453 const char *data,
59c2be1e
YS
1454 int len,
1455 u64 *ver)
602adf40
YS
1456{
1457 struct ceph_osd_req_op *ops;
aded07ea
AE
1458 int class_name_len = strlen(class_name);
1459 int method_name_len = strlen(method_name);
57cfc106
AE
1460 int ret;
1461
1462 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL,
aded07ea 1463 class_name_len + method_name_len + len);
57cfc106
AE
1464 if (!ops)
1465 return -ENOMEM;
602adf40 1466
aded07ea
AE
1467 ops[0].cls.class_name = class_name;
1468 ops[0].cls.class_len = (__u8) class_name_len;
1469 ops[0].cls.method_name = method_name;
1470 ops[0].cls.method_len = (__u8) method_name_len;
602adf40
YS
1471 ops[0].cls.argc = 0;
1472 ops[0].cls.indata = data;
1473 ops[0].cls.indata_len = len;
1474
0ce1a794 1475 ret = rbd_req_sync_op(rbd_dev, NULL,
602adf40 1476 CEPH_NOSNAP,
602adf40
YS
1477 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1478 ops,
d1f57ea6 1479 object_name, 0, 0, NULL, NULL, ver);
602adf40
YS
1480
1481 rbd_destroy_ops(ops);
1482
1483 dout("cls_exec returned %d\n", ret);
1484 return ret;
1485}
1486
1fec7093
YS
1487static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1488{
1489 struct rbd_req_coll *coll =
1490 kzalloc(sizeof(struct rbd_req_coll) +
1491 sizeof(struct rbd_req_status) * num_reqs,
1492 GFP_ATOMIC);
1493
1494 if (!coll)
1495 return NULL;
1496 coll->total = num_reqs;
1497 kref_init(&coll->kref);
1498 return coll;
1499}
1500
602adf40
YS
1501/*
1502 * block device queue callback
1503 */
1504static void rbd_rq_fn(struct request_queue *q)
1505{
1506 struct rbd_device *rbd_dev = q->queuedata;
1507 struct request *rq;
1508 struct bio_pair *bp = NULL;
1509
00f1f36f 1510 while ((rq = blk_fetch_request(q))) {
602adf40
YS
1511 struct bio *bio;
1512 struct bio *rq_bio, *next_bio = NULL;
1513 bool do_write;
bd919d45
AE
1514 unsigned int size;
1515 u64 op_size = 0;
602adf40 1516 u64 ofs;
1fec7093
YS
1517 int num_segs, cur_seg = 0;
1518 struct rbd_req_coll *coll;
d1d25646 1519 struct ceph_snap_context *snapc;
602adf40 1520
602adf40
YS
1521 dout("fetched request\n");
1522
1523 /* filter out block requests we don't understand */
1524 if ((rq->cmd_type != REQ_TYPE_FS)) {
1525 __blk_end_request_all(rq, 0);
00f1f36f 1526 continue;
602adf40
YS
1527 }
1528
1529 /* deduce our operation (read, write) */
1530 do_write = (rq_data_dir(rq) == WRITE);
1531
1532 size = blk_rq_bytes(rq);
593a9e7b 1533 ofs = blk_rq_pos(rq) * SECTOR_SIZE;
602adf40 1534 rq_bio = rq->bio;
f84344f3 1535 if (do_write && rbd_dev->mapping.read_only) {
602adf40 1536 __blk_end_request_all(rq, -EROFS);
00f1f36f 1537 continue;
602adf40
YS
1538 }
1539
1540 spin_unlock_irq(q->queue_lock);
1541
d1d25646 1542 down_read(&rbd_dev->header_rwsem);
e88a36ec 1543
f84344f3
AE
1544 if (rbd_dev->mapping.snap_id != CEPH_NOSNAP &&
1545 !rbd_dev->mapping.snap_exists) {
e88a36ec 1546 up_read(&rbd_dev->header_rwsem);
d1d25646
JD
1547 dout("request for non-existent snapshot");
1548 spin_lock_irq(q->queue_lock);
1549 __blk_end_request_all(rq, -ENXIO);
1550 continue;
e88a36ec
JD
1551 }
1552
d1d25646
JD
1553 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1554
1555 up_read(&rbd_dev->header_rwsem);
1556
602adf40
YS
1557 dout("%s 0x%x bytes at 0x%llx\n",
1558 do_write ? "write" : "read",
bd919d45 1559 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
602adf40 1560
1fec7093 1561 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
df111be6
AE
1562 if (num_segs <= 0) {
1563 spin_lock_irq(q->queue_lock);
1564 __blk_end_request_all(rq, num_segs);
1565 ceph_put_snap_context(snapc);
1566 continue;
1567 }
1fec7093
YS
1568 coll = rbd_alloc_coll(num_segs);
1569 if (!coll) {
1570 spin_lock_irq(q->queue_lock);
1571 __blk_end_request_all(rq, -ENOMEM);
d1d25646 1572 ceph_put_snap_context(snapc);
00f1f36f 1573 continue;
1fec7093
YS
1574 }
1575
602adf40
YS
1576 do {
1577 /* a bio clone to be passed down to OSD req */
bd919d45 1578 dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
65ccfe21 1579 op_size = rbd_segment_length(rbd_dev, ofs, size);
1fec7093 1580 kref_get(&coll->kref);
602adf40
YS
1581 bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1582 op_size, GFP_ATOMIC);
1583 if (!bio) {
1fec7093
YS
1584 rbd_coll_end_req_index(rq, coll, cur_seg,
1585 -ENOMEM, op_size);
1586 goto next_seg;
602adf40
YS
1587 }
1588
1fec7093 1589
602adf40
YS
1590 /* init OSD command: write or read */
1591 if (do_write)
1592 rbd_req_write(rq, rbd_dev,
d1d25646 1593 snapc,
602adf40 1594 ofs,
1fec7093
YS
1595 op_size, bio,
1596 coll, cur_seg);
602adf40
YS
1597 else
1598 rbd_req_read(rq, rbd_dev,
f84344f3 1599 rbd_dev->mapping.snap_id,
602adf40 1600 ofs,
1fec7093
YS
1601 op_size, bio,
1602 coll, cur_seg);
602adf40 1603
1fec7093 1604next_seg:
602adf40
YS
1605 size -= op_size;
1606 ofs += op_size;
1607
1fec7093 1608 cur_seg++;
602adf40
YS
1609 rq_bio = next_bio;
1610 } while (size > 0);
1fec7093 1611 kref_put(&coll->kref, rbd_coll_release);
602adf40
YS
1612
1613 if (bp)
1614 bio_pair_release(bp);
602adf40 1615 spin_lock_irq(q->queue_lock);
d1d25646
JD
1616
1617 ceph_put_snap_context(snapc);
602adf40
YS
1618 }
1619}
1620
1621/*
1622 * a queue callback. Makes sure that we don't create a bio that spans across
1623 * multiple osd objects. One exception would be with a single page bios,
1624 * which we handle later at bio_chain_clone
1625 */
1626static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1627 struct bio_vec *bvec)
1628{
1629 struct rbd_device *rbd_dev = q->queuedata;
593a9e7b
AE
1630 unsigned int chunk_sectors;
1631 sector_t sector;
1632 unsigned int bio_sectors;
602adf40
YS
1633 int max;
1634
593a9e7b
AE
1635 chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1636 sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1637 bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
1638
602adf40 1639 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
593a9e7b 1640 + bio_sectors)) << SECTOR_SHIFT;
602adf40
YS
1641 if (max < 0)
1642 max = 0; /* bio_add cannot handle a negative return */
1643 if (max <= bvec->bv_len && bio_sectors == 0)
1644 return bvec->bv_len;
1645 return max;
1646}
1647
1648static void rbd_free_disk(struct rbd_device *rbd_dev)
1649{
1650 struct gendisk *disk = rbd_dev->disk;
1651
1652 if (!disk)
1653 return;
1654
1655 rbd_header_free(&rbd_dev->header);
1656
1657 if (disk->flags & GENHD_FL_UP)
1658 del_gendisk(disk);
1659 if (disk->queue)
1660 blk_cleanup_queue(disk->queue);
1661 put_disk(disk);
1662}
1663
1664/*
4156d998
AE
1665 * Read the complete header for the given rbd device.
1666 *
1667 * Returns a pointer to a dynamically-allocated buffer containing
1668 * the complete and validated header. Caller can pass the address
1669 * of a variable that will be filled in with the version of the
1670 * header object at the time it was read.
1671 *
1672 * Returns a pointer-coded errno if a failure occurs.
602adf40 1673 */
4156d998
AE
1674static struct rbd_image_header_ondisk *
1675rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
602adf40 1676{
4156d998 1677 struct rbd_image_header_ondisk *ondisk = NULL;
50f7c4c9 1678 u32 snap_count = 0;
4156d998
AE
1679 u64 names_size = 0;
1680 u32 want_count;
1681 int ret;
602adf40 1682
00f1f36f 1683 /*
4156d998
AE
1684 * The complete header will include an array of its 64-bit
1685 * snapshot ids, followed by the names of those snapshots as
1686 * a contiguous block of NUL-terminated strings. Note that
1687 * the number of snapshots could change by the time we read
1688 * it in, in which case we re-read it.
00f1f36f 1689 */
4156d998
AE
1690 do {
1691 size_t size;
1692
1693 kfree(ondisk);
1694
1695 size = sizeof (*ondisk);
1696 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1697 size += names_size;
1698 ondisk = kmalloc(size, GFP_KERNEL);
1699 if (!ondisk)
1700 return ERR_PTR(-ENOMEM);
1701
1702 ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
0bed54dc 1703 rbd_dev->header_name,
4156d998
AE
1704 0, size,
1705 (char *) ondisk, version);
1706
1707 if (ret < 0)
1708 goto out_err;
1709 if (WARN_ON((size_t) ret < size)) {
1710 ret = -ENXIO;
1711 pr_warning("short header read for image %s"
1712 " (want %zd got %d)\n",
1713 rbd_dev->image_name, size, ret);
1714 goto out_err;
1715 }
1716 if (!rbd_dev_ondisk_valid(ondisk)) {
1717 ret = -ENXIO;
1718 pr_warning("invalid header for image %s\n",
1719 rbd_dev->image_name);
1720 goto out_err;
81e759fb 1721 }
602adf40 1722
4156d998
AE
1723 names_size = le64_to_cpu(ondisk->snap_names_len);
1724 want_count = snap_count;
1725 snap_count = le32_to_cpu(ondisk->snap_count);
1726 } while (snap_count != want_count);
00f1f36f 1727
4156d998 1728 return ondisk;
00f1f36f 1729
4156d998
AE
1730out_err:
1731 kfree(ondisk);
1732
1733 return ERR_PTR(ret);
1734}
1735
1736/*
1737 * reload the ondisk the header
1738 */
1739static int rbd_read_header(struct rbd_device *rbd_dev,
1740 struct rbd_image_header *header)
1741{
1742 struct rbd_image_header_ondisk *ondisk;
1743 u64 ver = 0;
1744 int ret;
602adf40 1745
4156d998
AE
1746 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1747 if (IS_ERR(ondisk))
1748 return PTR_ERR(ondisk);
1749 ret = rbd_header_from_disk(header, ondisk);
1750 if (ret >= 0)
1751 header->obj_version = ver;
1752 kfree(ondisk);
1753
1754 return ret;
602adf40
YS
1755}
1756
1757/*
1758 * create a snapshot
1759 */
0ce1a794 1760static int rbd_header_add_snap(struct rbd_device *rbd_dev,
602adf40
YS
1761 const char *snap_name,
1762 gfp_t gfp_flags)
1763{
1764 int name_len = strlen(snap_name);
1765 u64 new_snapid;
1766 int ret;
916d4d67 1767 void *data, *p, *e;
1dbb4399 1768 struct ceph_mon_client *monc;
602adf40
YS
1769
1770 /* we should create a snapshot only if we're pointing at the head */
f84344f3 1771 if (rbd_dev->mapping.snap_id != CEPH_NOSNAP)
602adf40
YS
1772 return -EINVAL;
1773
0ce1a794
AE
1774 monc = &rbd_dev->rbd_client->client->monc;
1775 ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
bd919d45 1776 dout("created snapid=%llu\n", (unsigned long long) new_snapid);
602adf40
YS
1777 if (ret < 0)
1778 return ret;
1779
1780 data = kmalloc(name_len + 16, gfp_flags);
1781 if (!data)
1782 return -ENOMEM;
1783
916d4d67
SW
1784 p = data;
1785 e = data + name_len + 16;
602adf40 1786
916d4d67
SW
1787 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1788 ceph_encode_64_safe(&p, e, new_snapid, bad);
602adf40 1789
0bed54dc 1790 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
0ce1a794 1791 "rbd", "snap_add",
d67d4be5 1792 data, p - data, NULL);
602adf40 1793
916d4d67 1794 kfree(data);
602adf40 1795
505cbb9b 1796 return ret < 0 ? ret : 0;
602adf40
YS
1797bad:
1798 return -ERANGE;
1799}
1800
dfc5606d
YS
1801static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1802{
1803 struct rbd_snap *snap;
a0593290 1804 struct rbd_snap *next;
dfc5606d 1805
a0593290 1806 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
14e7085d 1807 __rbd_remove_snap_dev(snap);
dfc5606d
YS
1808}
1809
602adf40
YS
1810/*
1811 * only read the first part of the ondisk header, without the snaps info
1812 */
b813623a 1813static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
602adf40
YS
1814{
1815 int ret;
1816 struct rbd_image_header h;
602adf40
YS
1817
1818 ret = rbd_read_header(rbd_dev, &h);
1819 if (ret < 0)
1820 return ret;
1821
a51aa0c0
JD
1822 down_write(&rbd_dev->header_rwsem);
1823
9db4b3e3 1824 /* resized? */
f84344f3 1825 if (rbd_dev->mapping.snap_id == CEPH_NOSNAP) {
474ef7ce
JD
1826 sector_t size = (sector_t) h.image_size / SECTOR_SIZE;
1827
99c1f08f
AE
1828 if (size != (sector_t) rbd_dev->mapping.size) {
1829 dout("setting size to %llu sectors",
1830 (unsigned long long) size);
1831 rbd_dev->mapping.size = (u64) size;
1832 set_capacity(rbd_dev->disk, size);
1833 }
474ef7ce 1834 }
9db4b3e3 1835
849b4260 1836 /* rbd_dev->header.object_prefix shouldn't change */
602adf40 1837 kfree(rbd_dev->header.snap_sizes);
849b4260 1838 kfree(rbd_dev->header.snap_names);
d1d25646
JD
1839 /* osd requests may still refer to snapc */
1840 ceph_put_snap_context(rbd_dev->header.snapc);
602adf40 1841
b813623a
AE
1842 if (hver)
1843 *hver = h.obj_version;
a71b891b 1844 rbd_dev->header.obj_version = h.obj_version;
93a24e08 1845 rbd_dev->header.image_size = h.image_size;
602adf40
YS
1846 rbd_dev->header.snapc = h.snapc;
1847 rbd_dev->header.snap_names = h.snap_names;
1848 rbd_dev->header.snap_sizes = h.snap_sizes;
849b4260
AE
1849 /* Free the extra copy of the object prefix */
1850 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1851 kfree(h.object_prefix);
1852
9fcbb800 1853 ret = rbd_dev_snap_devs_update(rbd_dev);
dfc5606d 1854
c666601a 1855 up_write(&rbd_dev->header_rwsem);
602adf40 1856
dfc5606d 1857 return ret;
602adf40
YS
1858}
1859
1fe5e993
AE
1860static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
1861{
1862 int ret;
1863
1864 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1865 ret = __rbd_refresh_header(rbd_dev, hver);
1866 mutex_unlock(&ctl_mutex);
1867
1868 return ret;
1869}
1870
602adf40
YS
1871static int rbd_init_disk(struct rbd_device *rbd_dev)
1872{
1873 struct gendisk *disk;
1874 struct request_queue *q;
1875 int rc;
593a9e7b 1876 u64 segment_size;
602adf40
YS
1877
1878 /* contact OSD, request size info about the object being mapped */
1879 rc = rbd_read_header(rbd_dev, &rbd_dev->header);
1880 if (rc)
1881 return rc;
1882
dfc5606d 1883 /* no need to lock here, as rbd_dev is not registered yet */
9fcbb800 1884 rc = rbd_dev_snap_devs_update(rbd_dev);
dfc5606d
YS
1885 if (rc)
1886 return rc;
1887
4e1105a2 1888 rc = rbd_header_set_snap(rbd_dev, snap_name);
602adf40
YS
1889 if (rc)
1890 return rc;
1891
1892 /* create gendisk info */
1893 rc = -ENOMEM;
1894 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1895 if (!disk)
1896 goto out;
1897
f0f8cef5 1898 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
de71a297 1899 rbd_dev->dev_id);
602adf40
YS
1900 disk->major = rbd_dev->major;
1901 disk->first_minor = 0;
1902 disk->fops = &rbd_bd_ops;
1903 disk->private_data = rbd_dev;
1904
1905 /* init rq */
1906 rc = -ENOMEM;
1907 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1908 if (!q)
1909 goto out_disk;
029bcbd8 1910
593a9e7b
AE
1911 /* We use the default size, but let's be explicit about it. */
1912 blk_queue_physical_block_size(q, SECTOR_SIZE);
1913
029bcbd8 1914 /* set io sizes to object size */
593a9e7b
AE
1915 segment_size = rbd_obj_bytes(&rbd_dev->header);
1916 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1917 blk_queue_max_segment_size(q, segment_size);
1918 blk_queue_io_min(q, segment_size);
1919 blk_queue_io_opt(q, segment_size);
029bcbd8 1920
602adf40
YS
1921 blk_queue_merge_bvec(q, rbd_merge_bvec);
1922 disk->queue = q;
1923
1924 q->queuedata = rbd_dev;
1925
1926 rbd_dev->disk = disk;
602adf40
YS
1927
1928 /* finally, announce the disk to the world */
99c1f08f 1929 set_capacity(disk, (sector_t) rbd_dev->mapping.size / SECTOR_SIZE);
602adf40
YS
1930 add_disk(disk);
1931
1932 pr_info("%s: added with size 0x%llx\n",
99c1f08f 1933 disk->disk_name, (unsigned long long) rbd_dev->mapping.size);
602adf40
YS
1934 return 0;
1935
1936out_disk:
1937 put_disk(disk);
1938out:
1939 return rc;
1940}
1941
dfc5606d
YS
1942/*
1943 sysfs
1944*/
1945
593a9e7b
AE
1946static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1947{
1948 return container_of(dev, struct rbd_device, dev);
1949}
1950
dfc5606d
YS
1951static ssize_t rbd_size_show(struct device *dev,
1952 struct device_attribute *attr, char *buf)
1953{
593a9e7b 1954 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
a51aa0c0
JD
1955 sector_t size;
1956
1957 down_read(&rbd_dev->header_rwsem);
1958 size = get_capacity(rbd_dev->disk);
1959 up_read(&rbd_dev->header_rwsem);
dfc5606d 1960
a51aa0c0 1961 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
dfc5606d
YS
1962}
1963
1964static ssize_t rbd_major_show(struct device *dev,
1965 struct device_attribute *attr, char *buf)
1966{
593a9e7b 1967 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
602adf40 1968
dfc5606d
YS
1969 return sprintf(buf, "%d\n", rbd_dev->major);
1970}
1971
1972static ssize_t rbd_client_id_show(struct device *dev,
1973 struct device_attribute *attr, char *buf)
602adf40 1974{
593a9e7b 1975 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 1976
1dbb4399
AE
1977 return sprintf(buf, "client%lld\n",
1978 ceph_client_id(rbd_dev->rbd_client->client));
602adf40
YS
1979}
1980
dfc5606d
YS
1981static ssize_t rbd_pool_show(struct device *dev,
1982 struct device_attribute *attr, char *buf)
602adf40 1983{
593a9e7b 1984 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d
YS
1985
1986 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1987}
1988
9bb2f334
AE
1989static ssize_t rbd_pool_id_show(struct device *dev,
1990 struct device_attribute *attr, char *buf)
1991{
1992 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1993
1994 return sprintf(buf, "%d\n", rbd_dev->pool_id);
1995}
1996
dfc5606d
YS
1997static ssize_t rbd_name_show(struct device *dev,
1998 struct device_attribute *attr, char *buf)
1999{
593a9e7b 2000 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2001
0bed54dc 2002 return sprintf(buf, "%s\n", rbd_dev->image_name);
dfc5606d
YS
2003}
2004
2005static ssize_t rbd_snap_show(struct device *dev,
2006 struct device_attribute *attr,
2007 char *buf)
2008{
593a9e7b 2009 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2010
f84344f3 2011 return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name);
dfc5606d
YS
2012}
2013
2014static ssize_t rbd_image_refresh(struct device *dev,
2015 struct device_attribute *attr,
2016 const char *buf,
2017 size_t size)
2018{
593a9e7b 2019 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
b813623a 2020 int ret;
602adf40 2021
1fe5e993 2022 ret = rbd_refresh_header(rbd_dev, NULL);
b813623a
AE
2023
2024 return ret < 0 ? ret : size;
dfc5606d 2025}
602adf40 2026
dfc5606d
YS
2027static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
2028static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
2029static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
2030static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
9bb2f334 2031static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
dfc5606d
YS
2032static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
2033static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
2034static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
2035static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);
dfc5606d
YS
2036
2037static struct attribute *rbd_attrs[] = {
2038 &dev_attr_size.attr,
2039 &dev_attr_major.attr,
2040 &dev_attr_client_id.attr,
2041 &dev_attr_pool.attr,
9bb2f334 2042 &dev_attr_pool_id.attr,
dfc5606d
YS
2043 &dev_attr_name.attr,
2044 &dev_attr_current_snap.attr,
2045 &dev_attr_refresh.attr,
2046 &dev_attr_create_snap.attr,
dfc5606d
YS
2047 NULL
2048};
2049
2050static struct attribute_group rbd_attr_group = {
2051 .attrs = rbd_attrs,
2052};
2053
2054static const struct attribute_group *rbd_attr_groups[] = {
2055 &rbd_attr_group,
2056 NULL
2057};
2058
2059static void rbd_sysfs_dev_release(struct device *dev)
2060{
2061}
2062
2063static struct device_type rbd_device_type = {
2064 .name = "rbd",
2065 .groups = rbd_attr_groups,
2066 .release = rbd_sysfs_dev_release,
2067};
2068
2069
2070/*
2071 sysfs - snapshots
2072*/
2073
2074static ssize_t rbd_snap_size_show(struct device *dev,
2075 struct device_attribute *attr,
2076 char *buf)
2077{
2078 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2079
3591538f 2080 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
dfc5606d
YS
2081}
2082
2083static ssize_t rbd_snap_id_show(struct device *dev,
2084 struct device_attribute *attr,
2085 char *buf)
2086{
2087 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2088
3591538f 2089 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
dfc5606d
YS
2090}
2091
2092static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2093static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
2094
2095static struct attribute *rbd_snap_attrs[] = {
2096 &dev_attr_snap_size.attr,
2097 &dev_attr_snap_id.attr,
2098 NULL,
2099};
2100
2101static struct attribute_group rbd_snap_attr_group = {
2102 .attrs = rbd_snap_attrs,
2103};
2104
2105static void rbd_snap_dev_release(struct device *dev)
2106{
2107 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2108 kfree(snap->name);
2109 kfree(snap);
2110}
2111
2112static const struct attribute_group *rbd_snap_attr_groups[] = {
2113 &rbd_snap_attr_group,
2114 NULL
2115};
2116
2117static struct device_type rbd_snap_device_type = {
2118 .groups = rbd_snap_attr_groups,
2119 .release = rbd_snap_dev_release,
2120};
2121
14e7085d 2122static void __rbd_remove_snap_dev(struct rbd_snap *snap)
dfc5606d
YS
2123{
2124 list_del(&snap->node);
2125 device_unregister(&snap->dev);
2126}
2127
14e7085d 2128static int rbd_register_snap_dev(struct rbd_snap *snap,
dfc5606d
YS
2129 struct device *parent)
2130{
2131 struct device *dev = &snap->dev;
2132 int ret;
2133
2134 dev->type = &rbd_snap_device_type;
2135 dev->parent = parent;
2136 dev->release = rbd_snap_dev_release;
2137 dev_set_name(dev, "snap_%s", snap->name);
2138 ret = device_register(dev);
2139
2140 return ret;
2141}
2142
/*
 * Allocate and initialize an rbd_snap for the i'th entry in the
 * rbd device's current snapshot context, giving it the supplied
 * name.  If the rbd device itself is already registered with the
 * device core, register the snapshot's device under it as well.
 *
 * Returns the new rbd_snap on success, or a pointer-coded error
 * (ERR_PTR) on failure.
 */
static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
					   int i, const char *name)
{
	struct rbd_snap *snap;
	int ret;

	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
	if (!snap)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	snap->name = kstrdup(name, GFP_KERNEL);
	if (!snap->name)
		goto err;

	/* Size and id come from the matching snapshot context entry */
	snap->size = rbd_dev->header.snap_sizes[i];
	snap->id = rbd_dev->header.snapc->snaps[i];
	if (device_is_registered(&rbd_dev->dev)) {
		ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
		if (ret < 0)
			goto err;
	}

	return snap;

err:
	/* snap was kzalloc'ed, so name is NULL or valid here */
	kfree(snap->name);
	kfree(snap);

	return ERR_PTR(ret);
}
2174
2175/*
35938150
AE
2176 * Scan the rbd device's current snapshot list and compare it to the
2177 * newly-received snapshot context. Remove any existing snapshots
2178 * not present in the new snapshot context. Add a new snapshot for
2179 * any snaphots in the snapshot context not in the current list.
2180 * And verify there are no changes to snapshots we already know
2181 * about.
2182 *
2183 * Assumes the snapshots in the snapshot context are sorted by
2184 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2185 * are also maintained in that order.)
dfc5606d 2186 */
9fcbb800 2187static int rbd_dev_snap_devs_update(struct rbd_device *rbd_dev)
dfc5606d 2188{
35938150
AE
2189 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2190 const u32 snap_count = snapc->num_snaps;
2191 char *snap_name = rbd_dev->header.snap_names;
2192 struct list_head *head = &rbd_dev->snaps;
2193 struct list_head *links = head->next;
2194 u32 index = 0;
dfc5606d 2195
9fcbb800 2196 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
35938150
AE
2197 while (index < snap_count || links != head) {
2198 u64 snap_id;
2199 struct rbd_snap *snap;
dfc5606d 2200
35938150
AE
2201 snap_id = index < snap_count ? snapc->snaps[index]
2202 : CEPH_NOSNAP;
2203 snap = links != head ? list_entry(links, struct rbd_snap, node)
2204 : NULL;
aafb230e 2205 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
dfc5606d 2206
35938150
AE
2207 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2208 struct list_head *next = links->next;
dfc5606d 2209
35938150 2210 /* Existing snapshot not in the new snap context */
dfc5606d 2211
f84344f3
AE
2212 if (rbd_dev->mapping.snap_id == snap->id)
2213 rbd_dev->mapping.snap_exists = false;
35938150 2214 __rbd_remove_snap_dev(snap);
9fcbb800 2215 dout("%ssnap id %llu has been removed\n",
f84344f3
AE
2216 rbd_dev->mapping.snap_id == snap->id ?
2217 "mapped " : "",
9fcbb800 2218 (unsigned long long) snap->id);
35938150
AE
2219
2220 /* Done with this list entry; advance */
2221
2222 links = next;
dfc5606d
YS
2223 continue;
2224 }
35938150 2225
9fcbb800
AE
2226 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2227 (unsigned long long) snap_id);
35938150
AE
2228 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2229 struct rbd_snap *new_snap;
2230
2231 /* We haven't seen this snapshot before */
2232
2233 new_snap = __rbd_add_snap_dev(rbd_dev, index,
2234 snap_name);
9fcbb800
AE
2235 if (IS_ERR(new_snap)) {
2236 int err = PTR_ERR(new_snap);
2237
2238 dout(" failed to add dev, error %d\n", err);
2239
2240 return err;
2241 }
35938150
AE
2242
2243 /* New goes before existing, or at end of list */
2244
9fcbb800 2245 dout(" added dev%s\n", snap ? "" : " at end\n");
35938150
AE
2246 if (snap)
2247 list_add_tail(&new_snap->node, &snap->node);
2248 else
523f3258 2249 list_add_tail(&new_snap->node, head);
35938150
AE
2250 } else {
2251 /* Already have this one */
2252
9fcbb800
AE
2253 dout(" already present\n");
2254
aafb230e
AE
2255 rbd_assert(snap->size ==
2256 rbd_dev->header.snap_sizes[index]);
2257 rbd_assert(!strcmp(snap->name, snap_name));
35938150
AE
2258
2259 /* Done with this list entry; advance */
2260
2261 links = links->next;
dfc5606d 2262 }
35938150
AE
2263
2264 /* Advance to the next entry in the snapshot context */
2265
2266 index++;
2267 snap_name += strlen(snap_name) + 1;
dfc5606d 2268 }
9fcbb800 2269 dout("%s: done\n", __func__);
dfc5606d
YS
2270
2271 return 0;
2272}
2273
dfc5606d
YS
/*
 * Register the rbd device (and a device for each of its known
 * snapshots) with the device core, making it visible in sysfs.
 * Runs under ctl_mutex.
 *
 * NOTE(review): if a snapshot registration fails partway through
 * the loop, earlier registrations are not undone here -- confirm
 * cleanup is handled by the caller / release paths.
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	int ret;
	struct device *dev;
	struct rbd_snap *snap;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	dev = &rbd_dev->dev;

	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;
	dev_set_name(dev, "%d", rbd_dev->dev_id);
	ret = device_register(dev);
	if (ret < 0)
		goto out;

	/* Register a child device for each snapshot */
	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
		if (ret < 0)
			break;
	}
out:
	mutex_unlock(&ctl_mutex);
	return ret;
}
2301
/*
 * Unregister the rbd device from sysfs; final teardown happens in
 * rbd_dev_release() when the last device reference is dropped.
 */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2306
/*
 * Establish the watch for this rbd device so we are notified of
 * changes made by other clients.  On -ERANGE the header is
 * refreshed and the watch retried; presumably the header must be
 * re-read before the watch can succeed -- see rbd_req_sync_watch()
 * to confirm.  Returns 0, or the first non--ERANGE error.
 */
static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
{
	int ret, rc;

	do {
		ret = rbd_req_sync_watch(rbd_dev);
		if (ret == -ERANGE) {
			rc = rbd_refresh_header(rbd_dev, NULL);
			if (rc < 0)
				return rc;
		}
	} while (ret == -ERANGE);

	return ret;
}
2322
/* Highest rbd device id handed out so far; ids start at 1 */
static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);

/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list. The minimum rbd id is 1.
 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
b7f23c36 2339
1ddbe94e 2340/*
499afd5b
AE
2341 * Remove an rbd_dev from the global list, and record that its
2342 * identifier is no longer in use.
1ddbe94e 2343 */
e2839308 2344static void rbd_dev_id_put(struct rbd_device *rbd_dev)
1ddbe94e 2345{
d184f6bf 2346 struct list_head *tmp;
de71a297 2347 int rbd_id = rbd_dev->dev_id;
d184f6bf
AE
2348 int max_id;
2349
aafb230e 2350 rbd_assert(rbd_id > 0);
499afd5b 2351
e2839308
AE
2352 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
2353 (unsigned long long) rbd_dev->dev_id);
499afd5b
AE
2354 spin_lock(&rbd_dev_list_lock);
2355 list_del_init(&rbd_dev->node);
d184f6bf
AE
2356
2357 /*
2358 * If the id being "put" is not the current maximum, there
2359 * is nothing special we need to do.
2360 */
e2839308 2361 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
d184f6bf
AE
2362 spin_unlock(&rbd_dev_list_lock);
2363 return;
2364 }
2365
2366 /*
2367 * We need to update the current maximum id. Search the
2368 * list to find out what it is. We're more likely to find
2369 * the maximum at the end, so search the list backward.
2370 */
2371 max_id = 0;
2372 list_for_each_prev(tmp, &rbd_dev_list) {
2373 struct rbd_device *rbd_dev;
2374
2375 rbd_dev = list_entry(tmp, struct rbd_device, node);
2376 if (rbd_id > max_id)
2377 max_id = rbd_id;
2378 }
499afd5b 2379 spin_unlock(&rbd_dev_list_lock);
b7f23c36 2380
1ddbe94e 2381 /*
e2839308 2382 * The max id could have been updated by rbd_dev_id_get(), in
d184f6bf
AE
2383 * which case it now accurately reflects the new maximum.
2384 * Be careful not to overwrite the maximum value in that
2385 * case.
1ddbe94e 2386 */
e2839308
AE
2387 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
2388 dout(" max dev id has been reset\n");
b7f23c36
AE
2389}
2390
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/* Characters for which isspace() is nonzero in "C"/"POSIX" locales */
	static const char spaces[] = " \f\n\r\t\v";
	const char *p = *buf;

	p += strspn(p, spaces);		/* skip leading white space */
	*buf = p;

	return strcspn(p, spaces);	/* length of token that starts here */
}
2409
/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit.
 *
 * The *buf pointer will be updated to point beyond the end of the
 * found token.  Note that this occurs even if the token buffer is
 * too small to hold it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
2439
ea3352f4
AE
2440/*
2441 * Finds the next token in *buf, dynamically allocates a buffer big
2442 * enough to hold a copy of it, and copies the token into the new
2443 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2444 * that a duplicate buffer is created even for a zero-length token.
2445 *
2446 * Returns a pointer to the newly-allocated duplicate, or a null
2447 * pointer if memory for the duplicate was not available. If
2448 * the lenp argument is a non-null pointer, the length of the token
2449 * (not including the '\0') is returned in *lenp.
2450 *
2451 * If successful, the *buf pointer will be updated to point beyond
2452 * the end of the found token.
2453 *
2454 * Note: uses GFP_KERNEL for allocation.
2455 */
2456static inline char *dup_token(const char **buf, size_t *lenp)
2457{
2458 char *dup;
2459 size_t len;
2460
2461 len = next_token(buf);
2462 dup = kmalloc(len + 1, GFP_KERNEL);
2463 if (!dup)
2464 return NULL;
2465
2466 memcpy(dup, *buf, len);
2467 *(dup + len) = '\0';
2468 *buf += len;
2469
2470 if (lenp)
2471 *lenp = len;
2472
2473 return dup;
2474}
2475
/*
 * This fills in the pool_name, image_name, image_name_len, rbd_dev,
 * rbd_md_name, and name fields of the given rbd_dev, based on the
 * list of monitor addresses and other options provided via
 * /sys/bus/rbd/add.  Returns a pointer to a dynamically-allocated
 * copy of the snapshot name to map if successful, or a
 * pointer-coded error otherwise.
 *
 * Note: rbd_dev is assumed to have been initially zero-filled.
 */
static char *rbd_add_parse_args(struct rbd_device *rbd_dev,
				const char *buf,
				const char **mon_addrs,
				size_t *mon_addrs_size,
				char *options,
				size_t options_size)
{
	size_t len;
	char *err_ptr = ERR_PTR(-EINVAL);
	char *snap_name;

	/* The first four tokens are required */

	/* Monitor address list: returned by reference into buf */
	len = next_token(&buf);
	if (!len)
		return err_ptr;
	*mon_addrs_size = len + 1;
	*mon_addrs = buf;

	buf += len;

	/* Options string is copied into the caller-provided buffer */
	len = copy_token(&buf, options, options_size);
	if (!len || len >= options_size)
		return err_ptr;

	/* Remaining failures below are allocation failures */
	err_ptr = ERR_PTR(-ENOMEM);
	rbd_dev->pool_name = dup_token(&buf, NULL);
	if (!rbd_dev->pool_name)
		goto out_err;

	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
	if (!rbd_dev->image_name)
		goto out_err;

	/* Create the name of the header object */

	rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
					+ sizeof (RBD_SUFFIX),
					GFP_KERNEL);
	if (!rbd_dev->header_name)
		goto out_err;
	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);

	/* Snapshot name is optional */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	}
	/* Caller owns (and must free) the returned snapshot name */
	snap_name = kmalloc(len + 1, GFP_KERNEL);
	if (!snap_name)
		goto out_err;
	memcpy(snap_name, buf, len);
	*(snap_name + len) = '\0';

dout(" SNAP_NAME is <%s>, len is %zd\n", snap_name, len);

	return snap_name;

out_err:
	/* Undo everything allocated so far, in reverse order */
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->image_name);
	rbd_dev->image_name = NULL;
	rbd_dev->image_name_len = 0;
	kfree(rbd_dev->pool_name);
	rbd_dev->pool_name = NULL;

	return err_ptr;
}
2556
/*
 * Handle a "create mapping" request written to /sys/bus/rbd/add:
 * parse the arguments, connect to the cluster, and register a new
 * rbd block device along with its sysfs representation.
 *
 * NOTE(review): the snap_name returned by rbd_add_parse_args()
 * does not appear to be stored or freed within this function --
 * confirm against the rest of the file that it is not leaked.
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	char *options;
	struct rbd_device *rbd_dev = NULL;
	const char *mon_addrs = NULL;
	size_t mon_addrs_size = 0;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;
	char *snap_name;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	options = kmalloc(count, GFP_KERNEL);
	if (!options)
		goto err_nomem;
	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		goto err_nomem;

	/* static rbd_device initialization */
	spin_lock_init(&rbd_dev->lock);
	INIT_LIST_HEAD(&rbd_dev->node);
	INIT_LIST_HEAD(&rbd_dev->snaps);
	init_rwsem(&rbd_dev->header_rwsem);

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* parse add command */
	snap_name = rbd_add_parse_args(rbd_dev, buf,
				&mon_addrs, &mon_addrs_size, options, count);
	if (IS_ERR(snap_name)) {
		rc = PTR_ERR(snap_name);
		goto err_put_id;
	}

	rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
	if (rc < 0)
		goto err_put_id;

	/* pick the pool */
	osdc = &rbd_dev->rbd_client->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
	if (rc < 0)
		goto err_out_client;
	rbd_dev->pool_id = rc;

	/* register our block device */
	rc = register_blkdev(0, rbd_dev->name);
	if (rc < 0)
		goto err_out_client;
	rbd_dev->major = rc;

	rc = rbd_bus_add_dev(rbd_dev);
	if (rc)
		goto err_out_blkdev;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 *
	 * Set up and announce blkdev mapping.
	 */
	rc = rbd_init_disk(rbd_dev);
	if (rc)
		goto err_out_bus;

	rc = rbd_init_watch_dev(rbd_dev);
	if (rc)
		goto err_out_bus;

	return count;

err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);
	kfree(options);
	return rc;

err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_client:
	rbd_put_client(rbd_dev);
err_put_id:
	/* pool_name set implies the parse succeeded; free its strings */
	if (rbd_dev->pool_name) {
		kfree(rbd_dev->mapping.snap_name);
		kfree(rbd_dev->header_name);
		kfree(rbd_dev->image_name);
		kfree(rbd_dev->pool_name);
	}
	rbd_dev_id_put(rbd_dev);
err_nomem:
	kfree(rbd_dev);
	kfree(options);

	dout("Error adding device %s\n", buf);
	module_put(THIS_MODULE);

	return (ssize_t) rc;
}
2666
de71a297 2667static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
602adf40
YS
2668{
2669 struct list_head *tmp;
2670 struct rbd_device *rbd_dev;
2671
e124a82f 2672 spin_lock(&rbd_dev_list_lock);
602adf40
YS
2673 list_for_each(tmp, &rbd_dev_list) {
2674 rbd_dev = list_entry(tmp, struct rbd_device, node);
de71a297 2675 if (rbd_dev->dev_id == dev_id) {
e124a82f 2676 spin_unlock(&rbd_dev_list_lock);
602adf40 2677 return rbd_dev;
e124a82f 2678 }
602adf40 2679 }
e124a82f 2680 spin_unlock(&rbd_dev_list_lock);
602adf40
YS
2681 return NULL;
2682}
2683
/*
 * Device-core release callback for the rbd device: tears down the
 * watch, releases the ceph client, disk and block device resources,
 * and finally frees the rbd_dev itself.  Invoked when the last
 * reference to the device is dropped.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->mapping.snap_name);
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->pool_name);
	kfree(rbd_dev->image_name);
	rbd_dev_id_put(rbd_dev);
	kfree(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
2714
/*
 * Handle a "remove" request written to /sys/bus/rbd/remove.  The
 * buffer holds the decimal id of the rbd device to tear down.
 * Returns count on success, -ENOENT if no such device exists, or
 * -EINVAL for an unparsable/out-of-range id.
 */
static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	int target_id, rc;
	unsigned long ul;
	int ret = count;

	rc = strict_strtoul(buf, 10, &ul);
	if (rc)
		return rc;

	/* convert to int; abort if we lost anything in the conversion */
	target_id = (int) ul;
	if (target_id != ul)
		return -EINVAL;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbd_dev = __rbd_get_dev(target_id);
	if (!rbd_dev) {
		ret = -ENOENT;
		goto done;
	}

	__rbd_remove_all_snaps(rbd_dev);
	rbd_bus_del_dev(rbd_dev);

done:
	mutex_unlock(&ctl_mutex);

	return ret;
}
2749
/*
 * Handle a snapshot-creation request written to the device's
 * "snap_add" sysfs attribute; the buffer holds the snapshot name.
 *
 * NOTE(review): snprintf(name, count, ...) writes at most count-1
 * characters plus a terminator, so the final byte of the input
 * (normally the trailing newline from sysfs writes) is dropped --
 * confirm callers always terminate the name with a newline.
 */
static ssize_t rbd_snap_add(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf,
			    size_t count)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	int ret;
	char *name = kmalloc(count + 1, GFP_KERNEL);
	if (!name)
		return -ENOMEM;

	snprintf(name, count, "%s", buf);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	ret = rbd_header_add_snap(rbd_dev,
				  name, GFP_KERNEL);
	if (ret < 0)
		goto err_unlock;

	/* Re-read the header so the new snapshot shows up locally */
	ret = __rbd_refresh_header(rbd_dev, NULL);
	if (ret < 0)
		goto err_unlock;

	/* shouldn't hold ctl_mutex when notifying.. notify might
	   trigger a watch callback that would need to get that mutex */
	mutex_unlock(&ctl_mutex);

	/* make a best effort, don't error if failed */
	rbd_req_sync_notify(rbd_dev);

	ret = count;
	kfree(name);
	return ret;

err_unlock:
	mutex_unlock(&ctl_mutex);
	kfree(name);
	return ret;
}
2790
/*
 * create control files in sysfs
 * /sys/bus/rbd/...
 */
static int rbd_sysfs_init(void)
{
	int ret;

	ret = device_register(&rbd_root_dev);
	if (ret < 0)
		return ret;

	/* Undo the root device registration if bus setup fails */
	ret = bus_register(&rbd_bus_type);
	if (ret < 0)
		device_unregister(&rbd_root_dev);

	return ret;
}
2809
/* Tear down the sysfs control files created by rbd_sysfs_init() */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
2815
2816int __init rbd_init(void)
2817{
2818 int rc;
2819
2820 rc = rbd_sysfs_init();
2821 if (rc)
2822 return rc;
f0f8cef5 2823 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
602adf40
YS
2824 return 0;
2825}
2826
/* Module teardown: remove the sysfs bus and root device */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
2831
/* Module entry/exit points and metadata */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");
This page took 0.269036 seconds and 5 git commands to generate.