2 * linux/fs/nfs/blocklayout/blocklayout.c
4 * Module for the NFSv4.1 pNFS block layout driver.
6 * Copyright (c) 2006 The Regents of the University of Michigan.
9 * Andy Adamson <andros@citi.umich.edu>
10 * Fred Isaman <iisaman@umich.edu>
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
33 #include <linux/module.h>
34 #include <linux/init.h>
35 #include <linux/mount.h>
36 #include <linux/namei.h>
37 #include <linux/bio.h> /* struct bio */
39 #include "blocklayout.h"
41 #define NFSDBG_FACILITY NFSDBG_PNFS_LD
43 MODULE_LICENSE("GPL");
44 MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
45 MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
/* Module-wide state shared with the rpc_pipefs upcall machinery:
 * bl_device_pipe is the pipe dentry created in nfs4blocklayout_init()
 * (see rpc_mkpipe below) and bl_wq is the wait queue initialized there.
 * NOTE(review): extraction has split declarations across lines and dropped
 * braces/statements throughout this file; code is kept byte-identical. */
47 struct dentry
*bl_device_pipe
;
48 wait_queue_head_t bl_wq
;
/* Debug helper: dump the interesting page flag bits of @page via dprintk.
 * Used from the read path when zero-filling holes (see bl_read_pagelist). */
50 static void print_page(struct page
*page
)
52 dprintk("PRINTPAGE page %p\n", page
);
53 dprintk(" PagePrivate %d\n", PagePrivate(page
));
54 dprintk(" PageUptodate %d\n", PageUptodate(page
));
55 dprintk(" PageError %d\n", PageError(page
));
56 dprintk(" PageDirty %d\n", PageDirty(page
));
57 dprintk(" PageReferenced %d\n", PageReferenced(page
));
58 dprintk(" PageLocked %d\n", PageLocked(page
));
59 dprintk(" PageWriteback %d\n", PageWriteback(page
));
60 dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page
));
64 /* Given the be associated with isect, determine if page data needs to be
/* Decide whether the sector @isect inside extent @be is a hole (no backing
 * data on the device).  NONE_DATA extents are always holes; for
 * INVALID_DATA extents the per-sector init bitmap is consulted via
 * bl_is_sector_init().
 * NOTE(review): the return statements for the first two branches were lost
 * in extraction — presumably `return 1;` / `return 0;`; confirm upstream. */
67 static int is_hole(struct pnfs_block_extent
*be
, sector_t isect
)
69 if (be
->be_state
== PNFS_BLOCK_NONE_DATA
)
71 else if (be
->be_state
!= PNFS_BLOCK_INVALID_DATA
)
74 return !bl_is_sector_init(be
->be_inval
, isect
);
77 /* The data we are handed might be spread across several bios. We need
78 * to track when the last one is finished.
/* NOTE(review): the `struct parallel_io {` opening line and its other
 * members (kref refcnt, void *data — both referenced below) were lost in
 * extraction; only these two members survived.
 * call_ops: copy of the MDS rpc_call_ops, with rpc_call_done neutralized
 *           (see bl_read_pagelist).
 * pnfs_callback: invoked by destroy_parallel() when the last reference
 *                (i.e. the last in-flight bio) is dropped. */
82 struct rpc_call_ops call_ops
;
83 void (*pnfs_callback
) (void *data
);
/* Allocate and initialize a parallel_io tracker for @data, with its kref
 * set to 1 (the caller's reference).
 * NOTE(review): the NULL check after kmalloc, the `rv->data = data;`
 * assignment and the `return rv;` line are missing from this extraction —
 * confirm against upstream. */
87 static inline struct parallel_io
*alloc_parallel(void *data
)
89 struct parallel_io
*rv
;
91 rv
= kmalloc(sizeof(*rv
), GFP_NOFS
);
94 kref_init(&rv
->refcnt
);
/* Take an additional reference on @p; one is taken per submitted bio
 * (see bl_submit_bio). */
99 static inline void get_parallel(struct parallel_io
*p
)
101 kref_get(&p
->refcnt
);
/* kref release function: runs when the last bio referencing @kref's
 * parallel_io completes.  Fires the deferred pnfs completion callback.
 * NOTE(review): the trailing kfree(p) is not visible in this extraction —
 * presumably present upstream; confirm. */
104 static void destroy_parallel(struct kref
*kref
)
106 struct parallel_io
*p
= container_of(kref
, struct parallel_io
, refcnt
);
108 dprintk("%s enter\n", __func__
);
109 p
->pnfs_callback(p
->data
);
/* Drop one reference on @p; destroy_parallel() runs on the last put. */
113 static inline void put_parallel(struct parallel_io
*p
)
115 kref_put(&p
->refcnt
, destroy_parallel
);
/* Submit @bio (direction @rw, READ or WRITE) after taking a parallel_io
 * reference for it — bio->bi_private holds the parallel_io (set in
 * bl_alloc_init_bio), and the end_io handler drops the reference.
 * NOTE(review): the return type line, the `if (bio)` guard, the actual
 * submit_bio() call and the `return NULL;` are missing from this
 * extraction — callers below use the NULL return to reset their bio
 * pointer; confirm upstream. */
119 bl_submit_bio(int rw
, struct bio
*bio
)
122 get_parallel(bio
->bi_private
);
123 dprintk("%s submitting %s bio %u@%llu\n", __func__
,
124 rw
== READ
? "read" : "write",
125 bio
->bi_size
, (unsigned long long)bio
->bi_sector
);
/* Allocate a bio sized for up to @npg pages and point it at the device
 * sector backing logical sector @isect within extent @be: the bio sector
 * is isect translated from file offset space (be_f_offset) to volume
 * offset space (be_v_offset).  @end_io / @par wire completion back into
 * the parallel_io tracking.
 * NOTE(review): the NULL-check after bio_alloc and the `return bio;` are
 * not visible in this extraction. */
131 static struct bio
*bl_alloc_init_bio(int npg
, sector_t isect
,
132 struct pnfs_block_extent
*be
,
133 void (*end_io
)(struct bio
*, int err
),
134 struct parallel_io
*par
)
138 bio
= bio_alloc(GFP_NOIO
, npg
);
142 bio
->bi_sector
= isect
- be
->be_f_offset
+ be
->be_v_offset
;
143 bio
->bi_bdev
= be
->be_mdev
;
144 bio
->bi_end_io
= end_io
;
145 bio
->bi_private
= par
;
/* Add one full page @page at sector @isect to @bio, allocating a new bio
 * via bl_alloc_init_bio() when @bio is NULL.  If bio_add_page() cannot
 * take the whole PAGE_CACHE_SIZE (device boundary reached), the current
 * bio is submitted and the add is retried with a fresh bio.
 * Returns the (possibly new) bio, or ERR_PTR(-ENOMEM) on allocation
 * failure.  NOTE(review): the NULL-check before the ERR_PTR return and
 * the retry/return tail of the function are missing from this
 * extraction. */
149 static struct bio
*bl_add_page_to_bio(struct bio
*bio
, int npg
, int rw
,
150 sector_t isect
, struct page
*page
,
151 struct pnfs_block_extent
*be
,
152 void (*end_io
)(struct bio
*, int err
),
153 struct parallel_io
*par
)
157 bio
= bl_alloc_init_bio(npg
, isect
, be
, end_io
, par
);
159 return ERR_PTR(-ENOMEM
);
161 if (bio_add_page(bio
, page
, PAGE_CACHE_SIZE
, 0) < PAGE_CACHE_SIZE
) {
162 bio
= bl_submit_bio(rw
, bio
);
/* Mark the layout containing @lseg as failed for its iomode, so future
 * I/O for that mode falls back to the MDS: sets the IOMODE_RW fail bit
 * for RW segments, otherwise the IOMODE_READ fail bit, in
 * pls_layout->plh_flags.
 * NOTE(review): the `} else {` separating the two branches was lost in
 * extraction. */
168 static void bl_set_lo_fail(struct pnfs_layout_segment
*lseg
)
170 if (lseg
->pls_range
.iomode
== IOMODE_RW
) {
171 dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__
);
172 set_bit(lo_fail_bit(IOMODE_RW
), &lseg
->pls_layout
->plh_flags
);
174 dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__
);
175 set_bit(lo_fail_bit(IOMODE_READ
), &lseg
->pls_layout
->plh_flags
);
179 /* This is basically copied from mpage_end_io_read */
/* bio end_io handler for reads: walk the bio_vec array backwards, marking
 * each page Uptodate (on success), then on error record -EIO in
 * rdata->pnfs_error and fail the layout segment.  The parallel_io
 * reference taken at submit time is dropped at the end (put_parallel —
 * not visible in this extraction), which may fire bl_end_par_io_read.
 * NOTE(review): the `do {`, the uptodate/error conditionals, bio_put()
 * and put_parallel() lines are missing from this extraction — the
 * surviving lines show only the loop interior and the error branch. */
180 static void bl_end_io_read(struct bio
*bio
, int err
)
182 struct parallel_io
*par
= bio
->bi_private
;
183 const int uptodate
= test_bit(BIO_UPTODATE
, &bio
->bi_flags
);
184 struct bio_vec
*bvec
= bio
->bi_io_vec
+ bio
->bi_vcnt
- 1;
185 struct nfs_read_data
*rdata
= (struct nfs_read_data
*)par
->data
;
188 struct page
*page
= bvec
->bv_page
;
190 if (--bvec
>= bio
->bi_io_vec
)
191 prefetchw(&bvec
->bv_page
->flags
);
193 SetPageUptodate(page
);
194 } while (bvec
>= bio
->bi_io_vec
);
196 if (!rdata
->pnfs_error
)
197 rdata
->pnfs_error
= -EIO
;
198 bl_set_lo_fail(rdata
->lseg
);
/* Workqueue function that finishes a pNFS read: recover the nfs_read_data
 * from the embedded rpc_task's work_struct (two container_of hops) and
 * hand it to the generic pnfs_ld_read_done().  Scheduled from
 * bl_end_par_io_read() so completion runs in process context rather than
 * bio end_io context. */
204 static void bl_read_cleanup(struct work_struct
*work
)
206 struct rpc_task
*task
;
207 struct nfs_read_data
*rdata
;
208 dprintk("%s enter\n", __func__
);
209 task
= container_of(work
, struct rpc_task
, u
.tk_work
);
210 rdata
= container_of(task
, struct nfs_read_data
, task
);
211 pnfs_ld_read_done(rdata
);
/* parallel_io pnfs_callback for reads: runs when the last bio of the
 * request completes.  Defers the remaining completion work to
 * bl_read_cleanup() via the rpc_task's embedded work item.
 * NOTE(review): the `static void` return-type line was lost in
 * extraction. */
215 bl_end_par_io_read(void *data
)
217 struct nfs_read_data
*rdata
= data
;
219 INIT_WORK(&rdata
->task
.u
.tk_work
, bl_read_cleanup
);
220 schedule_work(&rdata
->task
.u
.tk_work
);
223 /* We don't want normal .rpc_call_done callback used, so we replace it
/* Intentionally empty rpc_call_done replacement — installed into the
 * copied call_ops in bl_read_pagelist() so the MDS completion logic does
 * not run for layout-driver I/O. */
226 static void bl_rpc_do_nothing(struct rpc_task
*task
, void *calldata
)
/* Read path entry point for the block layout driver.
 *
 * Walks rdata->args.pages from pg_index, page by page.  For each page it
 * finds the covering extent (bl_find_get_extent), handles three cases:
 *   - hole with no copy-on-write read extent: zero-fill the page locally
 *     without touching the device, mark it Uptodate;
 *   - hole with a cow_read extent: read from the cow extent instead;
 *   - normal data: append the page to the current bio
 *     (bl_add_page_to_bio), which submits and reallocs on boundaries.
 * Bookkeeping: isect advances by PAGE_CACHE_SECTORS per page and
 * extent_length counts down the sectors left in the current extent;
 * hitting zero forces lookup of the next extent.  res.count is clamped
 * to i_size at the end.  Completion is tracked by the parallel_io
 * allocated up front; its callback (bl_end_par_io_read) fires after the
 * final bl_submit_bio's references drain.
 *
 * Returns PNFS_ATTEMPTED once I/O has been handed off (errors after that
 * point are reported via rdata->pnfs_error), or PNFS_NOT_ATTEMPTED to
 * fall back to normal NFS through the MDS.
 *
 * NOTE(review): numerous lines are missing from this extraction —
 * declarations of i/hole, the alloc_parallel NULL check, the second
 * bl_find_get_extent argument and its error handling, `goto out`/`use_mds`
 * labels, several braces — the surviving text is not the complete
 * function; consult upstream before modifying. */
231 static enum pnfs_try_status
232 bl_read_pagelist(struct nfs_read_data
*rdata
)
235 struct bio
*bio
= NULL
;
236 struct pnfs_block_extent
*be
= NULL
, *cow_read
= NULL
;
237 sector_t isect
, extent_length
= 0;
238 struct parallel_io
*par
;
239 loff_t f_offset
= rdata
->args
.offset
;
240 size_t count
= rdata
->args
.count
;
241 struct page
**pages
= rdata
->args
.pages
;
242 int pg_index
= rdata
->args
.pgbase
>> PAGE_CACHE_SHIFT
;
244 dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__
,
245 rdata
->npages
, f_offset
, count
);
247 par
= alloc_parallel(rdata
);
250 par
->call_ops
= *rdata
->mds_ops
;
251 par
->call_ops
.rpc_call_done
= bl_rpc_do_nothing
;
252 par
->pnfs_callback
= bl_end_par_io_read
;
253 /* At this point, we can no longer jump to use_mds */
255 isect
= (sector_t
) (f_offset
>> SECTOR_SHIFT
);
256 /* Code assumes extents are page-aligned */
257 for (i
= pg_index
; i
< rdata
->npages
; i
++) {
258 if (!extent_length
) {
259 /* We've used up the previous extent */
261 bl_put_extent(cow_read
);
262 bio
= bl_submit_bio(READ
, bio
);
263 /* Get the next one */
264 be
= bl_find_get_extent(BLK_LSEG2EXT(rdata
->lseg
),
267 rdata
->pnfs_error
= -EIO
;
270 extent_length
= be
->be_length
-
271 (isect
- be
->be_f_offset
);
273 sector_t cow_length
= cow_read
->be_length
-
274 (isect
- cow_read
->be_f_offset
);
275 extent_length
= min(extent_length
, cow_length
);
278 hole
= is_hole(be
, isect
);
279 if (hole
&& !cow_read
) {
280 bio
= bl_submit_bio(READ
, bio
);
281 /* Fill hole w/ zeroes w/o accessing device */
282 dprintk("%s Zeroing page for hole\n", __func__
);
283 zero_user_segment(pages
[i
], 0, PAGE_CACHE_SIZE
);
284 print_page(pages
[i
]);
285 SetPageUptodate(pages
[i
]);
287 struct pnfs_block_extent
*be_read
;
289 be_read
= (hole
&& cow_read
) ? cow_read
: be
;
290 bio
= bl_add_page_to_bio(bio
, rdata
->npages
- i
, READ
,
291 isect
, pages
[i
], be_read
,
292 bl_end_io_read
, par
);
294 rdata
->pnfs_error
= PTR_ERR(bio
);
298 isect
+= PAGE_CACHE_SECTORS
;
299 extent_length
-= PAGE_CACHE_SECTORS
;
301 if ((isect
<< SECTOR_SHIFT
) >= rdata
->inode
->i_size
) {
303 rdata
->res
.count
= rdata
->inode
->i_size
- f_offset
;
305 rdata
->res
.count
= (isect
<< SECTOR_SHIFT
) - f_offset
;
309 bl_put_extent(cow_read
);
310 bl_submit_bio(READ
, bio
);
312 return PNFS_ATTEMPTED
;
315 dprintk("Giving up and using normal NFS\n");
316 return PNFS_NOT_ATTEMPTED
;
/* Write path entry point — not implemented in this version: always
 * returns PNFS_NOT_ATTEMPTED so writes go through the MDS via normal
 * NFS. */
319 static enum pnfs_try_status
320 bl_write_pagelist(struct nfs_write_data
*wdata
,
323 return PNFS_NOT_ATTEMPTED
;
326 /* FIXME - range ignored */
/* Free every extent on all of @bl's extent lists, under bl_ext_lock.
 * The @range parameter is accepted but ignored (see FIXME above); the
 * only caller passes NULL.
 * NOTE(review): the `static void` line, the declaration of i, the
 * list_first_entry member argument (be_node) and the bl_put_extent(be)
 * call inside the loop are missing from this extraction. */
328 release_extents(struct pnfs_block_layout
*bl
, struct pnfs_layout_range
*range
)
331 struct pnfs_block_extent
*be
;
333 spin_lock(&bl
->bl_ext_lock
);
334 for (i
= 0; i
< EXTENT_LISTS
; i
++) {
335 while (!list_empty(&bl
->bl_extents
[i
])) {
336 be
= list_first_entry(&bl
->bl_extents
[i
],
337 struct pnfs_block_extent
,
339 list_del(&be
->be_node
);
343 spin_unlock(&bl
->bl_ext_lock
);
/* Free all invalid-sector tracking entries hanging off @marks->im_tree.
 * Uses the _safe list iterator since entries are deleted while walking.
 * NOTE(review): the return-type line and the kfree(pos) inside the loop
 * are missing from this extraction. */
347 release_inval_marks(struct pnfs_inval_markings
*marks
)
349 struct pnfs_inval_tracking
*pos
, *temp
;
351 list_for_each_entry_safe(pos
, temp
, &marks
->im_tree
.mtt_stub
, it_link
) {
352 list_del(&pos
->it_link
);
/* Tear down a block layout header: release all extents (range NULL =
 * everything) and all invalid-sector marks, then free the containing
 * pnfs_block_layout (kfree(bl) — not visible in this extraction).
 * Counterpart of bl_alloc_layout_hdr(). */
358 static void bl_free_layout_hdr(struct pnfs_layout_hdr
*lo
)
360 struct pnfs_block_layout
*bl
= BLK_LO2EXT(lo
);
362 dprintk("%s enter\n", __func__
);
363 release_extents(bl
, NULL
);
364 release_inval_marks(&bl
->bl_inval
);
/* Allocate and initialize a pnfs_block_layout for @inode and return its
 * embedded generic pnfs_layout_hdr.  Sets up the extent lock, the two
 * extent lists, the commit/committing lists, the block size in sectors
 * (from the server's pnfs_blksize), and the invalid-sector markings.
 * NOTE(review): the gfp_flags parameter line and the kzalloc NULL-check
 * are missing from this extraction. */
368 static struct pnfs_layout_hdr
*bl_alloc_layout_hdr(struct inode
*inode
,
371 struct pnfs_block_layout
*bl
;
373 dprintk("%s enter\n", __func__
);
374 bl
= kzalloc(sizeof(*bl
), gfp_flags
);
377 spin_lock_init(&bl
->bl_ext_lock
);
378 INIT_LIST_HEAD(&bl
->bl_extents
[0]);
379 INIT_LIST_HEAD(&bl
->bl_extents
[1]);
380 INIT_LIST_HEAD(&bl
->bl_commit
);
381 INIT_LIST_HEAD(&bl
->bl_committing
);
383 bl
->bl_blocksize
= NFS_SERVER(inode
)->pnfs_blksize
>> SECTOR_SHIFT
;
384 BL_INIT_INVAL_MARKS(&bl
->bl_inval
, bl
->bl_blocksize
);
385 return &bl
->bl_layout
;
/* Free a layout segment.  All real state lives layout-wide (see comment
 * on bl_alloc_lseg below), so this only logs and frees the lseg itself
 * (kfree — not visible in this extraction). */
388 static void bl_free_lseg(struct pnfs_layout_segment
*lseg
)
390 dprintk("%s enter\n", __func__
);
394 /* We pretty much ignore lseg, and store all data layout wide, so we
395 * can correctly merge.
/* Allocate a (mostly empty) layout segment and process the LAYOUTGET
 * result into the layout-wide extent state via
 * nfs4_blk_process_layoutget().  Returns the lseg, ERR_PTR(-ENOMEM) on
 * allocation failure, or ERR_PTR(status) if layoutget processing fails —
 * in which case the lseg is freed directly (plain kfree, not
 * bl_free_lseg, since no extents were installed; see comment at 410).
 * NOTE(review): the gfp_flags parameter line, the status declaration,
 * the NULL-check and the kfree on the error path are missing from this
 * extraction. */
397 static struct pnfs_layout_segment
*bl_alloc_lseg(struct pnfs_layout_hdr
*lo
,
398 struct nfs4_layoutget_res
*lgr
,
401 struct pnfs_layout_segment
*lseg
;
404 dprintk("%s enter\n", __func__
);
405 lseg
= kzalloc(sizeof(*lseg
), gfp_flags
);
407 return ERR_PTR(-ENOMEM
);
408 status
= nfs4_blk_process_layoutget(lo
, lgr
, gfp_flags
);
410 /* We don't want to call the full-blown bl_free_lseg,
411 * since on error extents were not touched.
414 return ERR_PTR(status
);
/* XDR-encode the block-layout-specific part of a LAYOUTCOMMIT by
 * delegating to encode_pnfs_block_layoutupdate() with the layout's
 * block-layout container.
 * NOTE(review): the `static void` return-type line was lost in
 * extraction. */
420 bl_encode_layoutcommit(struct pnfs_layout_hdr
*lo
, struct xdr_stream
*xdr
,
421 const struct nfs4_layoutcommit_args
*arg
)
423 dprintk("%s enter\n", __func__
);
424 encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo
), xdr
, arg
);
/* Post-LAYOUTCOMMIT cleanup: pass the commit result status to
 * clean_pnfs_block_layoutupdate() so committed extents can be retired
 * (or restored on failure).
 * NOTE(review): the `static void` return-type line was lost in
 * extraction. */
428 bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data
*lcdata
)
430 struct pnfs_layout_hdr
*lo
= NFS_I(lcdata
->args
.inode
)->layout
;
432 dprintk("%s enter\n", __func__
);
433 clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo
), &lcdata
->args
, lcdata
->res
.status
);
/* Free a block mount id: pop each pnfs_block_dev off the device list
 * (under bm_lock) and release it via bl_free_block_dev(), then free
 * @mid itself (kfree — not visible in this extraction).
 * NOTE(review): the `if (mid)` guard and the list_first_entry member
 * argument (bm_node) are missing from this extraction. */
436 static void free_blk_mountid(struct block_mount_id
*mid
)
439 struct pnfs_block_dev
*dev
;
440 spin_lock(&mid
->bm_lock
);
441 while (!list_empty(&mid
->bm_devlist
)) {
442 dev
= list_first_entry(&mid
->bm_devlist
,
443 struct pnfs_block_dev
,
445 list_del(&dev
->bm_node
);
446 bl_free_block_dev(dev
);
448 spin_unlock(&mid
->bm_lock
);
453 /* This is mostly copied from the filelayout's get_device_info function.
454 * It seems much of this should be at the generic pnfs level.
/* Issue GETDEVICEINFO for device @d_id on @server and decode the result
 * into a pnfs_block_dev.
 *
 * Sizing: maxcount is derived from the session's max response size;
 * that many pages are allocated to receive the reply.  dev->pages /
 * dev->pglen carry the receive buffer (the pages assignment itself is
 * not visible in this extraction).  On success the reply is decoded via
 * nfs4_blk_decode_device(); the page array and pages are always freed
 * before return.  Returns the new pnfs_block_dev, or NULL on failure.
 *
 * NOTE(review): declarations of rc/i/max_resp_sz/max_pages, the error
 * labels (out_free etc.), several NULL checks and the kfree of dev/pages
 * are missing from this extraction — consult upstream before modifying.
 * Also note: printing dev_id.data with "%s" at line 500 assumes it is
 * NUL-terminated — it is a fixed-size opaque id; verify upstream. */
456 static struct pnfs_block_dev
*
457 nfs4_blk_get_deviceinfo(struct nfs_server
*server
, const struct nfs_fh
*fh
,
458 struct nfs4_deviceid
*d_id
)
460 struct pnfs_device
*dev
;
461 struct pnfs_block_dev
*rv
= NULL
;
464 struct page
**pages
= NULL
;
468 * Use the session max response size as the basis for setting
469 * GETDEVICEINFO's maxcount
471 max_resp_sz
= server
->nfs_client
->cl_session
->fc_attrs
.max_resp_sz
;
472 max_pages
= max_resp_sz
>> PAGE_SHIFT
;
473 dprintk("%s max_resp_sz %u max_pages %d\n",
474 __func__
, max_resp_sz
, max_pages
);
476 dev
= kmalloc(sizeof(*dev
), GFP_NOFS
);
478 dprintk("%s kmalloc failed\n", __func__
);
482 pages
= kzalloc(max_pages
* sizeof(struct page
*), GFP_NOFS
);
487 for (i
= 0; i
< max_pages
; i
++) {
488 pages
[i
] = alloc_page(GFP_NOFS
);
493 memcpy(&dev
->dev_id
, d_id
, sizeof(*d_id
));
494 dev
->layout_type
= LAYOUT_BLOCK_VOLUME
;
497 dev
->pglen
= PAGE_SIZE
* max_pages
;
500 dprintk("%s: dev_id: %s\n", __func__
, dev
->dev_id
.data
);
501 rc
= nfs4_proc_getdeviceinfo(server
, dev
);
502 dprintk("%s getdevice info returns %d\n", __func__
, rc
);
506 rv
= nfs4_blk_decode_device(server
, dev
);
508 for (i
= 0; i
< max_pages
; i
++)
509 __free_page(pages
[i
]);
/* Mount-time setup for the block layout driver on @server.
 *
 * Rejects servers that did not advertise a block size.  Allocates the
 * block_mount_id, then loops GETDEVICELIST until eof, fetching each
 * device's info via nfs4_blk_get_deviceinfo() and adding it to the
 * mount's device list under bm_lock.  On success the mount id is stored
 * in server->pnfs_ld_data; on error everything is unwound via
 * free_blk_mountid().
 *
 * NOTE(review): the return-type line, declarations of status/i, the
 * allocation NULL-checks, the dlist eof/verf initialization, the
 * deviceinfo failure handling and the out_error label are missing from
 * this extraction — consult upstream before modifying. */
516 bl_set_layoutdriver(struct nfs_server
*server
, const struct nfs_fh
*fh
)
518 struct block_mount_id
*b_mt_id
= NULL
;
519 struct pnfs_devicelist
*dlist
= NULL
;
520 struct pnfs_block_dev
*bdev
;
521 LIST_HEAD(block_disklist
);
524 dprintk("%s enter\n", __func__
);
526 if (server
->pnfs_blksize
== 0) {
527 dprintk("%s Server did not return blksize\n", __func__
);
530 b_mt_id
= kzalloc(sizeof(struct block_mount_id
), GFP_NOFS
);
535 /* Initialize nfs4 block layout mount id */
536 spin_lock_init(&b_mt_id
->bm_lock
);
537 INIT_LIST_HEAD(&b_mt_id
->bm_devlist
);
539 dlist
= kmalloc(sizeof(struct pnfs_devicelist
), GFP_NOFS
);
545 while (!dlist
->eof
) {
546 status
= nfs4_proc_getdevicelist(server
, fh
, dlist
);
549 dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n",
550 __func__
, dlist
->num_devs
, dlist
->eof
);
551 for (i
= 0; i
< dlist
->num_devs
; i
++) {
552 bdev
= nfs4_blk_get_deviceinfo(server
, fh
,
558 spin_lock(&b_mt_id
->bm_lock
);
559 list_add(&bdev
->bm_node
, &b_mt_id
->bm_devlist
);
560 spin_unlock(&b_mt_id
->bm_lock
);
563 dprintk("%s SUCCESS\n", __func__
);
564 server
->pnfs_ld_data
= b_mt_id
;
571 free_blk_mountid(b_mt_id
);
/* Unmount-time teardown: free the block_mount_id stashed in
 * server->pnfs_ld_data by bl_set_layoutdriver().
 * NOTE(review): the return-type line and the `return 0;` are missing
 * from this extraction. */
576 bl_clear_layoutdriver(struct nfs_server
*server
)
578 struct block_mount_id
*b_mt_id
= server
->pnfs_ld_data
;
580 dprintk("%s enter\n", __func__
);
581 free_blk_mountid(b_mt_id
);
582 dprintk("%s RETURNS\n", __func__
);
/* Page I/O ops for reads — all delegated to the generic pnfs helpers. */
586 static const struct nfs_pageio_ops bl_pg_read_ops
= {
587 .pg_init
= pnfs_generic_pg_init_read
,
588 .pg_test
= pnfs_generic_pg_test
,
589 .pg_doio
= pnfs_generic_pg_readpages
,
/* Page I/O ops for writes — all delegated to the generic pnfs helpers. */
592 static const struct nfs_pageio_ops bl_pg_write_ops
= {
593 .pg_init
= pnfs_generic_pg_init_write
,
594 .pg_test
= pnfs_generic_pg_test
,
595 .pg_doio
= pnfs_generic_pg_writepages
,
/* Layout driver registration table: binds the LAYOUT_BLOCK_VOLUME id to
 * this file's implementation.  Registered/unregistered in
 * nfs4blocklayout_init()/_exit(). */
598 static struct pnfs_layoutdriver_type blocklayout_type
= {
599 .id
= LAYOUT_BLOCK_VOLUME
,
600 .name
= "LAYOUT_BLOCK_VOLUME",
601 .read_pagelist
= bl_read_pagelist
,
602 .write_pagelist
= bl_write_pagelist
,
603 .alloc_layout_hdr
= bl_alloc_layout_hdr
,
604 .free_layout_hdr
= bl_free_layout_hdr
,
605 .alloc_lseg
= bl_alloc_lseg
,
606 .free_lseg
= bl_free_lseg
,
607 .encode_layoutcommit
= bl_encode_layoutcommit
,
608 .cleanup_layoutcommit
= bl_cleanup_layoutcommit
,
609 .set_layoutdriver
= bl_set_layoutdriver
,
610 .clear_layoutdriver
= bl_clear_layoutdriver
,
611 .pg_read_ops
= &bl_pg_read_ops
,
612 .pg_write_ops
= &bl_pg_write_ops
,
/* rpc_pipefs ops for the "blocklayout" pipe created at module init —
 * the upcall/downcall handlers are presumably implemented in a sibling
 * file of this driver (not visible here). */
615 static const struct rpc_pipe_ops bl_upcall_ops
= {
616 .upcall
= bl_pipe_upcall
,
617 .downcall
= bl_pipe_downcall
,
618 .destroy_msg
= bl_pipe_destroy_msg
,
/* Module init: register the layout driver, initialize bl_wq, then create
 * the rpc_pipefs "blocklayout" pipe (used for device resolution upcalls
 * to userspace) under the NFS pipe directory.  On pipe setup failure the
 * layout driver is unregistered again (line 657) before returning the
 * error.
 * NOTE(review): declarations of ret/path, several error checks/labels,
 * the rpc_mkpipe flags argument and the final `return ret;` are missing
 * from this extraction. */
621 static int __init
nfs4blocklayout_init(void)
623 struct vfsmount
*mnt
;
627 dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__
);
629 ret
= pnfs_register_layoutdriver(&blocklayout_type
);
633 init_waitqueue_head(&bl_wq
);
635 mnt
= rpc_get_mount();
641 ret
= vfs_path_lookup(mnt
->mnt_root
,
643 NFS_PIPE_DIRNAME
, 0, &path
);
647 bl_device_pipe
= rpc_mkpipe(path
.dentry
, "blocklayout", NULL
,
649 if (IS_ERR(bl_device_pipe
)) {
650 ret
= PTR_ERR(bl_device_pipe
);
657 pnfs_unregister_layoutdriver(&blocklayout_type
);
/* Module exit: unregister the layout driver and remove the
 * "blocklayout" rpc pipe — mirror of nfs4blocklayout_init(). */
661 static void __exit
nfs4blocklayout_exit(void)
663 dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
666 pnfs_unregister_layoutdriver(&blocklayout_type
);
667 rpc_unlink(bl_device_pipe
);
/* Module alias for auto-loading by layout type number — presumably "3"
 * matches LAYOUT_BLOCK_VOLUME's wire value; confirm against the pnfs
 * layouttype4 enum. */
670 MODULE_ALIAS("nfs-layouttype4-3");
672 module_init(nfs4blocklayout_init
);
673 module_exit(nfs4blocklayout_exit
);