/*
 * Partial copy of Linus' read cache modifications to fs/nfs/file.c
 * modified for async RPC by okir@monad.swb.de
 *
 * We do an ugly hack here in order to return proper error codes to the
 * user program when a read request failed: since generic_file_read
 * only checks the return value of inode->i_op->readpage() which is always 0
 * for async RPC, we set the error bit of the page to 1 when an error occurs,
 * and make nfs_readpage transmit requests synchronously when encountering this.
 * This is only a small problem, though, since we now retry all operations
 * within the RPC code when root squashing is suspected.
 */

#include <linux/time.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/sunrpc/clnt.h>
#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/smp_lock.h>

#include <asm/system.h>

#include "iostat.h"

#define NFSDBG_FACILITY		NFSDBG_PAGECACHE

static int nfs_pagein_one(struct list_head *, struct inode *);
static const struct rpc_call_ops nfs_read_partial_ops;
static const struct rpc_call_ops nfs_read_full_ops;

static kmem_cache_t *nfs_rdata_cachep;
static mempool_t *nfs_rdata_mempool;

#define MIN_POOL_READ	(32)

struct nfs_read_data *nfs_readdata_alloc(size_t len)
{
	unsigned int pagecount = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, SLAB_NOFS);

	if (p) {
		memset(p, 0, sizeof(*p));
		INIT_LIST_HEAD(&p->pages);
		p->npages = pagecount;
		if (pagecount <= ARRAY_SIZE(p->page_array))
			p->pagevec = p->page_array;
		else {
			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
			if (!p->pagevec) {
				mempool_free(p, nfs_rdata_mempool);
				p = NULL;
			}
		}
	}
	return p;
}

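/*
 * Illustrative sketch (editor's addition, not in the original source):
 * (len + PAGE_SIZE - 1) >> PAGE_SHIFT is the usual round-up-to-pages
 * idiom. Assuming 4096-byte pages (PAGE_SHIFT == 12):
 *
 *	len = 1    -> (1 + 4095) >> 12    = 1 page
 *	len = 4096 -> (4096 + 4095) >> 12 = 1 page
 *	len = 4097 -> (4097 + 4095) >> 12 = 2 pages
 *
 * The same checks in code form, kept under #if 0 so they are never built:
 */
#if 0
	BUILD_BUG_ON(((1 + 4096 - 1) >> 12) != 1);
	BUILD_BUG_ON(((4096 + 4096 - 1) >> 12) != 1);
	BUILD_BUG_ON(((4097 + 4096 - 1) >> 12) != 2);
#endif
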
static void nfs_readdata_rcu_free(struct rcu_head *head)
{
	struct nfs_read_data *p = container_of(head, struct nfs_read_data, task.u.tk_rcu);
	if (p && (p->pagevec != &p->page_array[0]))
		kfree(p->pagevec);
	mempool_free(p, nfs_rdata_mempool);
}

static void nfs_readdata_free(struct nfs_read_data *rdata)
{
	call_rcu_bh(&rdata->task.u.tk_rcu, nfs_readdata_rcu_free);
}

void nfs_readdata_release(void *data)
{
	nfs_readdata_free(data);
}

static
unsigned int nfs_page_length(struct inode *inode, struct page *page)
{
	loff_t i_size = i_size_read(inode);
	unsigned long idx;

	if (i_size <= 0)
		return 0;
	idx = (i_size - 1) >> PAGE_CACHE_SHIFT;
	if (page->index > idx)
		return 0;
	if (page->index != idx)
		return PAGE_CACHE_SIZE;
	return 1 + ((i_size - 1) & (PAGE_CACHE_SIZE - 1));
}

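/*
 * Worked example (editor's addition, not in the original source): with
 * PAGE_CACHE_SIZE == 4096 and i_size == 10000 the file spans pages 0..2
 * and idx == (10000 - 1) >> 12 == 2, so:
 *
 *	page->index 0 or 1 -> 4096 (a full page of valid data)
 *	page->index 2      -> 1 + (9999 & 4095) == 1808
 *	page->index >= 3   -> 0   (entirely past end-of-file)
 *
 * 1808 == 10000 - 2*4096: exactly the valid bytes in the last page.
 */
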
static int nfs_return_empty_page(struct page *page)
{
	memclear_highpage_flush(page, 0, PAGE_CACHE_SIZE);
	SetPageUptodate(page);
	unlock_page(page);
	return 0;
}

static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
{
	unsigned int remainder = data->args.count - data->res.count;
	unsigned int base = data->args.pgbase + data->res.count;
	unsigned int pglen;
	struct page **pages;

	if (data->res.eof == 0 || remainder == 0)
		return;
	/*
	 * Note: "remainder" can never be negative, since we check for
	 * this in the XDR code.
	 */
	pages = &data->args.pages[base >> PAGE_CACHE_SHIFT];
	base &= ~PAGE_CACHE_MASK;
	pglen = PAGE_CACHE_SIZE - base;
	for (;;) {
		if (remainder <= pglen) {
			memclear_highpage_flush(*pages, base, remainder);
			break;
		}
		memclear_highpage_flush(*pages, base, pglen);
		pages++;
		remainder -= pglen;
		pglen = PAGE_CACHE_SIZE;
		base = 0;
	}
}

/*
 * Read a page synchronously.
 */
static int nfs_readpage_sync(struct nfs_open_context *ctx, struct inode *inode,
		struct page *page)
{
	unsigned int	rsize = NFS_SERVER(inode)->rsize;
	unsigned int	count = PAGE_CACHE_SIZE;
	int		result;
	struct nfs_read_data *rdata;

	rdata = nfs_readdata_alloc(count);
	result = -ENOMEM;
	if (!rdata)
		goto out_unlock;

	memset(rdata, 0, sizeof(*rdata));
	rdata->flags = (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0);
	rdata->cred = ctx->cred;
	rdata->inode = inode;
	INIT_LIST_HEAD(&rdata->pages);
	rdata->args.fh = NFS_FH(inode);
	rdata->args.context = ctx;
	rdata->args.pages = &page;
	rdata->args.pgbase = 0UL;
	rdata->args.count = rsize;
	rdata->res.fattr = &rdata->fattr;

	dprintk("NFS: nfs_readpage_sync(%p)\n", page);

	/*
	 * This works now because the socket layer never tries to DMA
	 * into this buffer directly.
	 */
	do {
		if (count < rsize)
			rdata->args.count = count;
		rdata->res.count = rdata->args.count;
		rdata->args.offset = page_offset(page) + rdata->args.pgbase;

		dprintk("NFS: nfs_proc_read(%s, (%s/%Ld), %Lu, %u)\n",
			NFS_SERVER(inode)->nfs_client->cl_hostname,
			inode->i_sb->s_id,
			(long long)NFS_FILEID(inode),
			(unsigned long long)rdata->args.pgbase,
			rdata->args.count);

		lock_kernel();
		result = NFS_PROTO(inode)->read(rdata);
		unlock_kernel();

		/*
		 * Even if we had a partial success we can't mark the page
		 * cache valid.
		 */
		if (result < 0) {
			if (result == -EISDIR)
				result = -EINVAL;
			goto io_error;
		}
		count -= result;
		rdata->args.pgbase += result;
		nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, result);

		/* Note: result == 0 should only happen if we're caching
		 * a write that extends the file and punches a hole.
		 */
		if (rdata->res.eof != 0 || result == 0)
			break;
	} while (count);

	spin_lock(&inode->i_lock);
	NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME;
	spin_unlock(&inode->i_lock);

	if (rdata->res.eof || rdata->res.count == rdata->args.count) {
		SetPageUptodate(page);
		if (rdata->res.eof && count != 0)
			memclear_highpage_flush(page, rdata->args.pgbase, count);
	}
	result = 0;

io_error:
	nfs_readdata_free(rdata);
out_unlock:
	unlock_page(page);
	return result;
}

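/*
 * Editor's sketch (not in the original source): with PAGE_CACHE_SIZE ==
 * 4096, rsize == 1024 and a server that always returns full replies,
 * the do-while above issues four READs for one page:
 *
 *	pgbase 0, 1024, 2048, 3072	(args.count == 1024 each time)
 *
 * after which count reaches 0. A short reply only advances pgbase by
 * the bytes actually returned, so the next pass resumes at the first
 * unread byte.
 */
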
static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
		struct page *page)
{
	LIST_HEAD(one_request);
	struct nfs_page	*new;
	unsigned int len;

	len = nfs_page_length(inode, page);
	if (len == 0)
		return nfs_return_empty_page(page);
	new = nfs_create_request(ctx, inode, page, 0, len);
	if (IS_ERR(new)) {
		unlock_page(page);
		return PTR_ERR(new);
	}
	if (len < PAGE_CACHE_SIZE)
		memclear_highpage_flush(page, len, PAGE_CACHE_SIZE - len);

	nfs_list_add_request(new, &one_request);
	nfs_pagein_one(&one_request, inode);
	return 0;
}

static void nfs_readpage_release(struct nfs_page *req)
{
	unlock_page(req->wb_page);

	dprintk("NFS: read done (%s/%Ld %d@%Ld)\n",
			req->wb_context->dentry->d_inode->i_sb->s_id,
			(long long)NFS_FILEID(req->wb_context->dentry->d_inode),
			req->wb_bytes,
			(long long)req_offset(req));
	nfs_clear_request(req);
	nfs_release_request(req);
}

/*
 * Set up the NFS read request struct
 */
static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
		const struct rpc_call_ops *call_ops,
		unsigned int count, unsigned int offset)
{
	struct inode *inode;
	int flags;

	data->req	  = req;
	data->inode	  = inode = req->wb_context->dentry->d_inode;
	data->cred	  = req->wb_context->cred;

	data->args.fh     = NFS_FH(inode);
	data->args.offset = req_offset(req) + offset;
	data->args.pgbase = req->wb_pgbase + offset;
	data->args.pages  = data->pagevec;
	data->args.count  = count;
	data->args.context = req->wb_context;

	data->res.fattr   = &data->fattr;
	data->res.count   = count;
	data->res.eof     = 0;
	nfs_fattr_init(&data->fattr);

	/* Set up the initial task struct. */
	flags = RPC_TASK_ASYNC | (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0);
	rpc_init_task(&data->task, NFS_CLIENT(inode), flags, call_ops, data);
	NFS_PROTO(inode)->read_setup(data);

	data->task.tk_cookie = (unsigned long)inode;

	dprintk("NFS: %4d initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
			data->task.tk_pid,
			inode->i_sb->s_id,
			(long long)NFS_FILEID(inode),
			count,
			(unsigned long long)data->args.offset);
}

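/*
 * Editor's note (not in the original source): the (count, offset) pair
 * is what lets one nfs_page be split across several RPCs. For example,
 * for a request with wb_pgbase == 0 covering a whole page, the second
 * 1024-byte chunk is set up with count == 1024, offset == 1024, giving
 * args.offset == req_offset(req) + 1024 and args.pgbase == 1024.
 */
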
static void
nfs_async_read_error(struct list_head *head)
{
	struct nfs_page	*req;

	while (!list_empty(head)) {
		req = nfs_list_entry(head->next);
		nfs_list_remove_request(req);
		SetPageError(req->wb_page);
		nfs_readpage_release(req);
	}
}

/*
 * Start an async read operation
 */
static void nfs_execute_read(struct nfs_read_data *data)
{
	struct rpc_clnt *clnt = NFS_CLIENT(data->inode);
	sigset_t oldset;

	rpc_clnt_sigmask(clnt, &oldset);
	rpc_execute(&data->task);
	rpc_clnt_sigunmask(clnt, &oldset);
}

/*
 * Generate multiple requests to fill a single page.
 *
 * We optimize to reduce the number of read operations on the wire. If we
 * detect that we're reading a page, or an area of a page, that is past the
 * end of file, we do not generate NFS read operations but just clear the
 * parts of the page that would have come back zero from the server anyway.
 *
 * We rely on the cached value of i_size to make this determination; another
 * client can fill pages on the server past our cached end-of-file, but we
 * won't see the new data until our attribute cache is updated. This is more
 * or less conventional NFS client behavior.
 */
static int nfs_pagein_multi(struct list_head *head, struct inode *inode)
{
	struct nfs_page *req = nfs_list_entry(head->next);
	struct page *page = req->wb_page;
	struct nfs_read_data *data;
	size_t rsize = NFS_SERVER(inode)->rsize, nbytes;
	unsigned int offset;
	int requests = 0;
	LIST_HEAD(list);

	nfs_list_remove_request(req);

	nbytes = req->wb_bytes;
	do {
		size_t len = min(nbytes, rsize);

		data = nfs_readdata_alloc(len);
		if (!data)
			goto out_bad;
		INIT_LIST_HEAD(&data->pages);
		list_add(&data->pages, &list);
		requests++;
		nbytes -= len;
	} while (nbytes != 0);
	atomic_set(&req->wb_complete, requests);

	ClearPageError(page);
	offset = 0;
	nbytes = req->wb_bytes;
	do {
		data = list_entry(list.next, struct nfs_read_data, pages);
		list_del_init(&data->pages);

		data->pagevec[0] = page;

		if (nbytes > rsize) {
			nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
					rsize, offset);
			offset += rsize;
			nbytes -= rsize;
		} else {
			nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
					nbytes, offset);
			nbytes = 0;
		}
		nfs_execute_read(data);
	} while (nbytes != 0);

	return 0;

out_bad:
	while (!list_empty(&list)) {
		data = list_entry(list.next, struct nfs_read_data, pages);
		list_del(&data->pages);
		nfs_readdata_free(data);
	}
	SetPageError(page);
	nfs_readpage_release(req);
	return -ENOMEM;
}

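/*
 * Editor's sketch (not in the original source): for a 4096-byte page
 * with rsize == 1024, the first loop allocates four nfs_read_data and
 * sets wb_complete to 4; the second loop then issues
 *
 *	nfs_read_rpcsetup(req, data, &nfs_read_partial_ops, 1024, 0);
 *	nfs_read_rpcsetup(req, data, &nfs_read_partial_ops, 1024, 1024);
 *	nfs_read_rpcsetup(req, data, &nfs_read_partial_ops, 1024, 2048);
 *	nfs_read_rpcsetup(req, data, &nfs_read_partial_ops, 1024, 3072);
 *
 * (each with its own data). wb_complete is counted down by the partial
 * completion handler before the page is finally unlocked.
 */
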
static int nfs_pagein_one(struct list_head *head, struct inode *inode)
{
	struct nfs_page		*req;
	struct page		**pages;
	struct nfs_read_data	*data;
	unsigned int		count = 0;

	if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
		return nfs_pagein_multi(head, inode);

	data = nfs_readdata_alloc(NFS_SERVER(inode)->rsize);
	if (!data)
		goto out_bad;

	INIT_LIST_HEAD(&data->pages);
	pages = data->pagevec;
	while (!list_empty(head)) {
		req = nfs_list_entry(head->next);
		nfs_list_remove_request(req);
		nfs_list_add_request(req, &data->pages);
		ClearPageError(req->wb_page);
		*pages++ = req->wb_page;
		count += req->wb_bytes;
	}
	req = nfs_list_entry(data->pages.next);

	nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0);

	nfs_execute_read(data);
	return 0;
out_bad:
	nfs_async_read_error(head);
	return -ENOMEM;
}

static int
nfs_pagein_list(struct list_head *head, int rpages)
{
	LIST_HEAD(one_request);
	struct nfs_page		*req;
	int			error = 0;
	unsigned int		pages = 0;

	while (!list_empty(head)) {
		pages += nfs_coalesce_requests(head, &one_request, rpages);
		req = nfs_list_entry(one_request.next);
		error = nfs_pagein_one(&one_request, req->wb_context->dentry->d_inode);
		if (error < 0)
			break;
	}
	if (error >= 0)
		return pages;

	nfs_async_read_error(head);
	return error;
}

/*
 * This is the callback from RPC telling us whether a reply was
 * received or some error occurred (timeout or socket shutdown).
 */
int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data)
{
	int status;

	dprintk("%s: %4d, (status %d)\n", __FUNCTION__, task->tk_pid,
			task->tk_status);

	status = NFS_PROTO(data->inode)->read_done(task, data);
	if (status != 0)
		return status;

	nfs_add_stats(data->inode, NFSIOS_SERVERREADBYTES, data->res.count);

	if (task->tk_status == -ESTALE) {
		set_bit(NFS_INO_STALE, &NFS_FLAGS(data->inode));
		nfs_mark_for_revalidate(data->inode);
	}
	spin_lock(&data->inode->i_lock);
	NFS_I(data->inode)->cache_validity |= NFS_INO_INVALID_ATIME;
	spin_unlock(&data->inode->i_lock);
	return 0;
}

static int nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data)
{
	struct nfs_readargs *argp = &data->args;
	struct nfs_readres *resp = &data->res;

	if (resp->eof || resp->count == argp->count)
		return 0;

	/* This is a short read! */
	nfs_inc_stats(data->inode, NFSIOS_SHORTREAD);
	/* Has the server at least made some progress? */
	if (resp->count == 0)
		return 0;

	/* Yes, so retry the read at the end of the data */
	argp->offset += resp->count;
	argp->pgbase += resp->count;
	argp->count -= resp->count;
	rpc_restart_call(task);
	return -EAGAIN;
}

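/*
 * Worked example (editor's addition, not in the original source): a
 * READ for args.count == 8192 answered with res.count == 4096 and no
 * eof is a short read. The adjustments above become
 *
 *	argp->offset += 4096;	continue at the first unread byte
 *	argp->pgbase += 4096;	i.e. the next page in the pagevec
 *	argp->count  -= 4096;	4096 bytes still outstanding
 *
 * and rpc_restart_call() resends; -EAGAIN tells the caller completion
 * is deferred to the restarted RPC.
 */
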
/*
 * Handle a read reply that fills part of a page.
 */
static void nfs_readpage_result_partial(struct rpc_task *task, void *calldata)
{
	struct nfs_read_data *data = calldata;
	struct nfs_page *req = data->req;
	struct page *page = req->wb_page;

	if (nfs_readpage_result(task, data) != 0)
		return;

	if (likely(task->tk_status >= 0)) {
		nfs_readpage_truncate_uninitialised_page(data);
		if (nfs_readpage_retry(task, data) != 0)
			return;
	}
	if (unlikely(task->tk_status < 0))
		SetPageError(page);

	if (atomic_dec_and_test(&req->wb_complete)) {
		if (!PageError(page))
			SetPageUptodate(page);
		nfs_readpage_release(req);
	}
}

static const struct rpc_call_ops nfs_read_partial_ops = {
	.rpc_call_done = nfs_readpage_result_partial,
	.rpc_release = nfs_readdata_release,
};

static void nfs_readpage_set_pages_uptodate(struct nfs_read_data *data)
{
	unsigned int count = data->res.count;
	unsigned int base = data->args.pgbase;
	struct page **pages;

	if (data->res.eof)
		count = data->args.count;
	if (unlikely(count == 0))
		return;
	pages = &data->args.pages[base >> PAGE_CACHE_SHIFT];
	base &= ~PAGE_CACHE_MASK;
	count += base;
	for (;count >= PAGE_CACHE_SIZE; count -= PAGE_CACHE_SIZE, pages++)
		SetPageUptodate(*pages);
	if (count == 0)
		return;
	/* Was this a short read? */
	if (data->res.eof || data->res.count == data->args.count)
		SetPageUptodate(*pages);
}

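/*
 * Worked example (editor's addition, not in the original source): with
 * res.count == 6144, args.pgbase == 0, args.count == 8192 and no eof:
 * count starts at 6144, the loop marks args.pages[0] uptodate and
 * leaves count == 2048. Neither eof nor a full reply is seen, so
 * args.pages[1] stays !uptodate; nfs_readpage_retry() refills it.
 */
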
/*
 * This is the callback from RPC telling us whether a reply was
 * received or some error occurred (timeout or socket shutdown).
 */
static void nfs_readpage_result_full(struct rpc_task *task, void *calldata)
{
	struct nfs_read_data *data = calldata;

	if (nfs_readpage_result(task, data) != 0)
		return;
	/*
	 * Note: nfs_readpage_retry may change the values of
	 * data->args. In the multi-page case, we therefore need
	 * to ensure that we call nfs_readpage_set_pages_uptodate()
	 * first!
	 */
	if (likely(task->tk_status >= 0)) {
		nfs_readpage_truncate_uninitialised_page(data);
		nfs_readpage_set_pages_uptodate(data);
		if (nfs_readpage_retry(task, data) != 0)
			return;
	}
	while (!list_empty(&data->pages)) {
		struct nfs_page *req = nfs_list_entry(data->pages.next);

		nfs_list_remove_request(req);
		nfs_readpage_release(req);
	}
}

static const struct rpc_call_ops nfs_read_full_ops = {
	.rpc_call_done = nfs_readpage_result_full,
	.rpc_release = nfs_readdata_release,
};

/*
 * Read a page over NFS.
 * We read the page synchronously in the following case:
 *  -	The error flag is set for this page. This happens only when a
 *	previous async read operation failed.
 */
int nfs_readpage(struct file *file, struct page *page)
{
	struct nfs_open_context *ctx;
	struct inode *inode = page->mapping->host;
	int		error;

	dprintk("NFS: nfs_readpage (%p %ld@%lu)\n",
		page, PAGE_CACHE_SIZE, page->index);
	nfs_inc_stats(inode, NFSIOS_VFSREADPAGE);
	nfs_add_stats(inode, NFSIOS_READPAGES, 1);

	/*
	 * Try to flush any pending writes to the file..
	 *
	 * NOTE! Because we own the page lock, there cannot
	 * be any new pending writes generated at this point
	 * for this page (other pages can be written to).
	 */
	error = nfs_wb_page(inode, page);
	if (error)
		goto out_error;

	error = -ESTALE;
	if (NFS_STALE(inode))
		goto out_error;

	if (file == NULL) {
		error = -EBADF;
		ctx = nfs_find_open_context(inode, NULL, FMODE_READ);
		if (ctx == NULL)
			goto out_error;
	} else
		ctx = get_nfs_open_context((struct nfs_open_context *)
				file->private_data);
	if (!IS_SYNC(inode)) {
		error = nfs_readpage_async(ctx, inode, page);
		goto out;
	}

	error = nfs_readpage_sync(ctx, inode, page);
	if (error < 0 && IS_SWAPFILE(inode))
		printk("Aiee.. nfs swap-in of page failed!\n");
out:
	put_nfs_open_context(ctx);
	return error;
out_error:
	unlock_page(page);
	return error;
}

struct nfs_readdesc {
	struct list_head *head;
	struct nfs_open_context *ctx;
};

static int
readpage_async_filler(void *data, struct page *page)
{
	struct nfs_readdesc *desc = (struct nfs_readdesc *)data;
	struct inode *inode = page->mapping->host;
	struct nfs_page *new;
	unsigned int len;

	nfs_wb_page(inode, page);
	len = nfs_page_length(inode, page);
	if (len == 0)
		return nfs_return_empty_page(page);
	new = nfs_create_request(desc->ctx, inode, page, 0, len);
	if (IS_ERR(new)) {
		SetPageError(page);
		unlock_page(page);
		return PTR_ERR(new);
	}
	if (len < PAGE_CACHE_SIZE)
		memclear_highpage_flush(page, len, PAGE_CACHE_SIZE - len);
	nfs_list_add_request(new, desc->head);
	return 0;
}

int nfs_readpages(struct file *filp, struct address_space *mapping,
		struct list_head *pages, unsigned nr_pages)
{
	LIST_HEAD(head);
	struct nfs_readdesc desc = {
		.head		= &head,
	};
	struct inode *inode = mapping->host;
	struct nfs_server *server = NFS_SERVER(inode);
	int ret;

	dprintk("NFS: nfs_readpages (%s/%Ld %d)\n",
			inode->i_sb->s_id,
			(long long)NFS_FILEID(inode),
			nr_pages);
	nfs_inc_stats(inode, NFSIOS_VFSREADPAGES);

	if (NFS_STALE(inode))
		return -ESTALE;

	if (filp == NULL) {
		desc.ctx = nfs_find_open_context(inode, NULL, FMODE_READ);
		if (desc.ctx == NULL)
			return -EBADF;
	} else
		desc.ctx = get_nfs_open_context((struct nfs_open_context *)
				filp->private_data);
	ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
	if (!list_empty(&head)) {
		int err = nfs_pagein_list(&head, server->rpages);

		if (!ret)
			nfs_add_stats(inode, NFSIOS_READPAGES, err);
		ret = err;
	}
	put_nfs_open_context(desc.ctx);
	return ret;
}

int __init nfs_init_readpagecache(void)
{
	nfs_rdata_cachep = kmem_cache_create("nfs_read_data",
					     sizeof(struct nfs_read_data),
					     0, SLAB_HWCACHE_ALIGN,
					     NULL, NULL);
	if (nfs_rdata_cachep == NULL)
		return -ENOMEM;

	nfs_rdata_mempool = mempool_create_slab_pool(MIN_POOL_READ,
						     nfs_rdata_cachep);
	if (nfs_rdata_mempool == NULL)
		return -ENOMEM;

	return 0;
}

void nfs_destroy_readpagecache(void)
{
	mempool_destroy(nfs_rdata_mempool);
	kmem_cache_destroy(nfs_rdata_cachep);
}