/*
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.gnu.org/licenses/gpl-2.0.html
 */
/*
 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2015, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * Lustre Lite I/O page cache routines shared by different kernel revs
 */
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/unistd.h>
#include <linux/writeback.h>
#include <linux/uaccess.h>

#include <linux/pagemap.h>
/* current_is_kswapd() */
#include <linux/swap.h>

#define DEBUG_SUBSYSTEM S_LLITE

#include "../include/lustre_lite.h"
#include "../include/obd_cksum.h"
#include "llite_internal.h"
#include "../include/linux/lustre_compat25.h"
static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which);
/**
 * Get readahead pages from the filesystem readahead pool of the client for a
 * particular file.
 *
 * \param sbi   superblock for filesystem readahead state ll_ra_info
 * \param ria   per-thread readahead state
 * \param pages number of pages requested for readahead for the thread.
 *
 * WARNING: This algorithm is used to reduce contention on sbi->ll_lock.
 * It should work well if the ra_max_pages is much greater than the single
 * file's read-ahead window, and not too many threads are contending for
 * these readahead pages.
 *
 * TODO: There may be a 'global sync problem' if many threads are trying
 * to get an ra budget that is larger than the remaining readahead pages
 * and reach here at exactly the same time. They will compute \a ret to
 * consume the remaining pages, but will fail at atomic_add_return() and
 * get a zero ra window, although there is still ra space remaining. - Jay
 */
static unsigned long ll_ra_count_get(struct ll_sb_info *sbi,
				     struct ra_io_arg *ria,
				     unsigned long pages, unsigned long min)
{
	struct ll_ra_info *ra = &sbi->ll_ra_info;
	long ret;

	/* If read-ahead pages left are less than 1M, do not do read-ahead,
	 * otherwise it will form small read RPC (< 1M), which hurts server
	 * performance a lot.
	 */
	ret = min(ra->ra_max_pages - atomic_read(&ra->ra_cur_pages), pages);
	if (ret < 0 || ret < min_t(long, PTLRPC_MAX_BRW_PAGES, pages)) {
		ret = 0;
		goto out;
	}

	/* If the non-strided (ria_pages == 0) readahead window
	 * (ria_start + ret) has grown across an RPC boundary, then trim
	 * readahead size by the amount beyond the RPC so it ends on an
	 * RPC boundary. If the readahead window is already ending on
	 * an RPC boundary (beyond_rpc == 0), or smaller than a full
	 * RPC (beyond_rpc < ret) the readahead size is unchanged.
	 * The (beyond_rpc != 0) check is skipped since the conditional
	 * branch is more expensive than subtracting zero from the result.
	 *
	 * Strided read is left unaligned to avoid small fragments beyond
	 * the RPC boundary from needing an extra read RPC.
	 */
	if (ria->ria_pages == 0) {
		long beyond_rpc = (ria->ria_start + ret) % PTLRPC_MAX_BRW_PAGES;

		if (/* beyond_rpc != 0 && */ beyond_rpc < ret)
			ret -= beyond_rpc;
	}

	if (atomic_add_return(ret, &ra->ra_cur_pages) > ra->ra_max_pages) {
		atomic_sub(ret, &ra->ra_cur_pages);
		ret = 0;
	}

out:
	if (ret < min) {
		/* override ra limit for maximum performance */
		atomic_add(min - ret, &ra->ra_cur_pages);
		ret = min;
	}
	return ret;
}
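/*
 * Illustrative sketch (not from the original source): a worked example of
 * the budget computation above, assuming PTLRPC_MAX_BRW_PAGES is 256
 * (1 MiB of 4 KiB pages). With ra_max_pages = 8192 and ra_cur_pages = 8000,
 * only 192 pool pages remain, so ret = min(192, pages) falls below the
 * 1 MiB floor and the request gets a zero readahead window. At "out:" the
 * caller-supplied minimum still wins: if min = 64, the pool is charged
 * 64 - 0 = 64 pages and 64 is returned, deliberately overshooting the
 * ra_max_pages limit for maximum performance.
 */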
void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len)
{
	struct ll_ra_info *ra = &sbi->ll_ra_info;

	atomic_sub(len, &ra->ra_cur_pages);
}
static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which)
{
	LASSERTF(which >= 0 && which < _NR_RA_STAT, "which: %u\n", which);
	lprocfs_counter_incr(sbi->ll_ra_stats, which);
}
void ll_ra_stats_inc(struct inode *inode, enum ra_stat which)
{
	struct ll_sb_info *sbi = ll_i2sbi(inode);

	ll_ra_stats_inc_sbi(sbi, which);
}
#define RAS_CDEBUG(ras) \
	CDEBUG(D_READA, \
	       "lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu r %lu ri %lu " \
	       "csr %lu sf %lu sp %lu sl %lu\n", \
	       ras->ras_last_readpage, ras->ras_consecutive_requests, \
	       ras->ras_consecutive_pages, ras->ras_window_start, \
	       ras->ras_window_len, ras->ras_next_readahead, \
	       ras->ras_requests, ras->ras_request_index, \
	       ras->ras_consecutive_stride_requests, ras->ras_stride_offset, \
	       ras->ras_stride_pages, ras->ras_stride_length)
static int index_in_window(unsigned long index, unsigned long point,
			   unsigned long before, unsigned long after)
{
	unsigned long start = point - before, end = point + after;

	if (start > point)
		start = 0;
	if (end < point)
		end = ~0;

	return start <= index && index <= end;
}
void ll_ras_enter(struct file *f)
{
	struct ll_file_data *fd = LUSTRE_FPRIVATE(f);
	struct ll_readahead_state *ras = &fd->fd_ras;

	spin_lock(&ras->ras_lock);
	ras->ras_requests++;
	ras->ras_request_index = 0;
	ras->ras_consecutive_requests++;
	spin_unlock(&ras->ras_lock);
}
static int cl_read_ahead_page(const struct lu_env *env, struct cl_io *io,
			      struct cl_page_list *queue, struct cl_page *page,
			      struct cl_object *clob, pgoff_t *max_index)
{
	struct page *vmpage = page->cp_vmpage;
	struct vvp_page *vpg;
	int rc = 0;

	cl_page_assume(env, io, page);
	lu_ref_add(&page->cp_reference, "ra", current);
	vpg = cl2vvp_page(cl_object_page_slice(clob, page));
	if (!vpg->vpg_defer_uptodate && !PageUptodate(vmpage)) {
		CDEBUG(D_READA, "page index %lu, max_index: %lu\n",
		       vvp_index(vpg), *max_index);
		if (*max_index == 0 || vvp_index(vpg) > *max_index)
			rc = cl_page_is_under_lock(env, io, page, max_index);
		if (rc == 0) {
			vpg->vpg_defer_uptodate = 1;
			vpg->vpg_ra_used = 0;
			cl_page_list_add(queue, page);
			rc = 1;
		} else {
			cl_page_discard(env, io, page);
			rc = -ENOLCK;
		}
	} else {
		/* skip completed pages */
		cl_page_unassume(env, io, page);
	}
	lu_ref_del(&page->cp_reference, "ra", current);
	cl_page_put(env, page);
	return rc;
}
/**
 * Initiates read-ahead of a page with given index.
 *
 * \retval +ve: page was added to \a queue.
 *
 * \retval -ENOLCK: there is no extent lock for this part of a file, stop
 *		    read-ahead.
 *
 * \retval -ve, 0: page wasn't added to \a queue for other reason.
 */
static int ll_read_ahead_page(const struct lu_env *env, struct cl_io *io,
			      struct cl_page_list *queue,
			      pgoff_t index, pgoff_t *max_index)
{
	struct cl_object *clob = io->ci_obj;
	struct inode *inode = vvp_object_inode(clob);
	struct page *vmpage;
	struct cl_page *page;
	enum ra_stat which = _NR_RA_STAT; /* keep gcc happy */
	int rc = 0;
	const char *msg = NULL;

	vmpage = grab_cache_page_nowait(inode->i_mapping, index);
	if (vmpage) {
		/* Check if vmpage was truncated or reclaimed */
		if (vmpage->mapping == inode->i_mapping) {
			page = cl_page_find(env, clob, vmpage->index,
					    vmpage, CPT_CACHEABLE);
			if (!IS_ERR(page)) {
				rc = cl_read_ahead_page(env, io, queue,
							page, clob, max_index);
				if (rc == -ENOLCK) {
					which = RA_STAT_FAILED_MATCH;
					msg = "lock match failed";
				}
			} else {
				which = RA_STAT_FAILED_GRAB_PAGE;
				msg = "cl_page_find failed";
			}
		} else {
			which = RA_STAT_WRONG_GRAB_PAGE;
			msg = "g_c_p_n returned invalid page";
		}
		if (rc != 1)
			unlock_page(vmpage);
		put_page(vmpage);
	} else {
		which = RA_STAT_FAILED_GRAB_PAGE;
		msg = "g_c_p_n failed";
	}
	if (msg) {
		ll_ra_stats_inc(inode, which);
		CDEBUG(D_READA, "%s\n", msg);
	}
	return rc;
}
#define RIA_DEBUG(ria) \
	CDEBUG(D_READA, "rs %lu re %lu ro %lu rl %lu rp %lu\n", \
	       ria->ria_start, ria->ria_end, ria->ria_stoff, ria->ria_length, \
	       ria->ria_pages)
/* Limit this to the blocksize instead of PTLRPC_BRW_MAX_SIZE, since we don't
 * know what the actual RPC size is. If this needs to change, it makes more
 * sense to tune the i_blkbits value for the file based on the OSTs it is
 * striped over, rather than having a constant value for all files here.
 */

/* RAS_INCREASE_STEP should be (1UL << (inode->i_blkbits - PAGE_SHIFT)).
 * Temporarily set RAS_INCREASE_STEP to 1MB. After 4MB RPC is enabled
 * by default, this should be adjusted in line with max_read_ahead_mb
 * and max_read_ahead_per_file_mb, otherwise the readahead budget can be
 * used up quickly, which will affect read performance significantly.
 * See LU-2816.
 */
#define RAS_INCREASE_STEP(inode) (ONE_MB_BRW_SIZE >> PAGE_SHIFT)
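/*
 * Illustrative note (not from the original source): with the usual 4 KiB
 * PAGE_SIZE this step works out to ONE_MB_BRW_SIZE >> PAGE_SHIFT =
 * 1 MiB / 4 KiB = 256 pages, so each eligible request grows the per-file
 * readahead window by 256 pages at a time.
 */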
static inline int stride_io_mode(struct ll_readahead_state *ras)
{
	return ras->ras_consecutive_stride_requests > 1;
}
/* The function calculates how many pages will be read in
 * [off, off + length], in such a stride IO area,
 * stride_offset = st_off, stride_length = st_len,
 * stride_pages = st_pgs
 *
 *   |------------------|*****|------------------|*****|------------|*****|....
 * st_off
 *   |--- st_pgs ---|
 *   |----- st_len -----|
 *
 *              How many pages it should read in such pattern
 *              |-------------------------------------------------------------|
 *              off
 *              |<------              length              ------->|
 *
 *          =   |<----->|  +  |-------------------------------------| +   |---|
 *             start_left              st_pgs * i               end_left
 */
static unsigned long
stride_pg_count(pgoff_t st_off, unsigned long st_len, unsigned long st_pgs,
		unsigned long off, unsigned long length)
{
	__u64 start = off > st_off ? off - st_off : 0;
	__u64 end = off + length > st_off ? off + length - st_off : 0;
	unsigned long start_left = 0;
	unsigned long end_left = 0;
	unsigned long pg_count;

	if (st_len == 0 || length == 0 || end == 0)
		return length;

	start_left = do_div(start, st_len);
	if (start_left < st_pgs)
		start_left = st_pgs - start_left;
	else
		start_left = 0;

	end_left = do_div(end, st_len);
	if (end_left > st_pgs)
		end_left = st_pgs;

	CDEBUG(D_READA, "start %llu, end %llu start_left %lu end_left %lu\n",
	       start, end, start_left, end_left);

	if (start == end)
		pg_count = end_left - (st_pgs - start_left);
	else
		pg_count = start_left + st_pgs * (end - start - 1) + end_left;

	CDEBUG(D_READA, "st_off %lu, st_len %lu st_pgs %lu off %lu length %lu pgcount %lu\n",
	       st_off, st_len, st_pgs, off, length, pg_count);

	return pg_count;
}
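/*
 * Illustrative sketch (not from the original source): a worked example of
 * stride_pg_count(). Assume st_off = 0, st_len = 1024, st_pgs = 256,
 * off = 0 and length = 2048, i.e. two full stride periods each containing
 * 256 IO pages. Then start = 0 and end = 2048; do_div() leaves
 * start_left = 256 (the whole first IO segment) and end = 2 with
 * end_left = 0, and since start != end the result is
 * 256 + 256 * (2 - 0 - 1) + 0 = 512 pages, one IO segment per stride period.
 */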
static int ria_page_count(struct ra_io_arg *ria)
{
	__u64 length = ria->ria_end >= ria->ria_start ?
		       ria->ria_end - ria->ria_start + 1 : 0;

	return stride_pg_count(ria->ria_stoff, ria->ria_length,
			       ria->ria_pages, ria->ria_start,
			       length);
}
/* Check whether the index is in the defined ra-window */
static int ras_inside_ra_window(unsigned long idx, struct ra_io_arg *ria)
{
	/* If ria_length == ria_pages, it means non-stride I/O mode,
	 * idx should always be inside the read-ahead window in this case.
	 * For stride I/O mode, just check whether the idx is inside
	 * the ria_pages.
	 */
	return ria->ria_length == 0 || ria->ria_length == ria->ria_pages ||
	       (idx >= ria->ria_stoff && (idx - ria->ria_stoff) %
		ria->ria_length < ria->ria_pages);
}
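/*
 * Illustrative note (not from the original source): with ria_stoff = 0,
 * ria_length = 1024 and ria_pages = 256, the check above accepts indexes
 * 0..255, 1024..1279, 2048..2303 and so on, i.e. the first 256 pages of
 * each 1024-page stride period, and rejects the gap pages in between.
 */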
static int ll_read_ahead_pages(const struct lu_env *env,
			       struct cl_io *io, struct cl_page_list *queue,
			       struct ra_io_arg *ria,
			       unsigned long *reserved_pages,
			       unsigned long *ra_end)
{
	int rc, count = 0;
	bool stride_ria;
	pgoff_t page_idx;
	pgoff_t max_index = 0;

	LASSERT(ria);
	RIA_DEBUG(ria);

	stride_ria = ria->ria_length > ria->ria_pages && ria->ria_pages > 0;
	for (page_idx = ria->ria_start;
	     page_idx <= ria->ria_end && *reserved_pages > 0; page_idx++) {
		if (ras_inside_ra_window(page_idx, ria)) {
			/* If the page is inside the read-ahead window */
			rc = ll_read_ahead_page(env, io, queue,
						page_idx, &max_index);
			if (rc == 1) {
				(*reserved_pages)--;
				count++;
			} else if (rc == -ENOLCK) {
				break;
			}
		} else if (stride_ria) {
			/* If it is not in the read-ahead window, and it is
			 * stride read-ahead mode, then check whether it
			 * should skip the stride gap.
			 */
			pgoff_t offset;

			/* FIXME: This assertion is only valid when it is for
			 * forward read-ahead, it will be fixed when backward
			 * read-ahead is implemented.
			 */
			LASSERTF(page_idx > ria->ria_stoff, "Invalid page_idx %lu rs %lu re %lu ro %lu rl %lu rp %lu\n",
				 page_idx,
				 ria->ria_start, ria->ria_end, ria->ria_stoff,
				 ria->ria_length, ria->ria_pages);
			offset = page_idx - ria->ria_stoff;
			offset = offset % (ria->ria_length);
			if (offset > ria->ria_pages) {
				page_idx += ria->ria_length - offset;
				CDEBUG(D_READA, "i %lu skip %lu\n", page_idx,
				       ria->ria_length - offset);
				continue;
			}
		}
	}
	*ra_end = page_idx;
	return count;
}
int ll_readahead(const struct lu_env *env, struct cl_io *io,
		 struct cl_page_list *queue, struct ll_readahead_state *ras,
		 bool hit)
{
	struct vvp_io *vio = vvp_env_io(env);
	struct ll_thread_info *lti = ll_env_info(env);
	struct cl_attr *attr = vvp_env_thread_attr(env);
	unsigned long start = 0, end = 0, reserved;
	unsigned long ra_end, len, mlen = 0;
	struct inode *inode;
	struct ra_io_arg *ria = &lti->lti_ria;
	struct cl_object *clob;
	int ret = 0;
	__u64 kms;

	clob = io->ci_obj;
	inode = vvp_object_inode(clob);

	memset(ria, 0, sizeof(*ria));

	cl_object_attr_lock(clob);
	ret = cl_object_attr_get(env, clob, attr);
	cl_object_attr_unlock(clob);

	if (ret != 0)
		return ret;
	kms = attr->cat_kms;
	if (kms == 0) {
		ll_ra_stats_inc(inode, RA_STAT_ZERO_LEN);
		return 0;
	}

	spin_lock(&ras->ras_lock);

	/* Enlarge the RA window to encompass the full read */
	if (vio->vui_ra_valid &&
	    ras->ras_window_start + ras->ras_window_len <
	    vio->vui_ra_start + vio->vui_ra_count) {
		ras->ras_window_len = vio->vui_ra_start + vio->vui_ra_count -
				      ras->ras_window_start;
	}

	/* Reserve a part of the read-ahead window that we'll be issuing */
	if (ras->ras_window_len) {
		start = ras->ras_next_readahead;
		end = ras->ras_window_start + ras->ras_window_len - 1;
	}

	if (end != 0) {
		unsigned long rpc_boundary;
		/*
		 * Align RA window to an optimal boundary.
		 *
		 * XXX This would be better to align to cl_max_pages_per_rpc
		 * instead of PTLRPC_MAX_BRW_PAGES, because the RPC size may
		 * be aligned to the RAID stripe size in the future and that
		 * is more important than the RPC size.
		 */
		/* Note: we only trim the RPC, instead of extending the RPC
		 * to the boundary, so as to avoid reading too many pages
		 * during RA.
		 */
		rpc_boundary = (end + 1) & (~(PTLRPC_MAX_BRW_PAGES - 1));
		if (rpc_boundary > 0)
			rpc_boundary--;

		if (rpc_boundary > start)
			end = rpc_boundary;

		/* Truncate RA window to end of file */
		end = min(end, (unsigned long)((kms - 1) >> PAGE_SHIFT));

		ras->ras_next_readahead = max(end, end + 1);
	}

	ria->ria_start = start;
	ria->ria_end = end;
	/* If stride I/O mode is detected, get stride window */
	if (stride_io_mode(ras)) {
		ria->ria_stoff = ras->ras_stride_offset;
		ria->ria_length = ras->ras_stride_length;
		ria->ria_pages = ras->ras_stride_pages;
	}
	spin_unlock(&ras->ras_lock);

	if (end == 0) {
		ll_ra_stats_inc(inode, RA_STAT_ZERO_WINDOW);
		return 0;
	}
	len = ria_page_count(ria);
	if (len == 0) {
		ll_ra_stats_inc(inode, RA_STAT_ZERO_WINDOW);
		return 0;
	}

	CDEBUG(D_READA, DFID ": ria: %lu/%lu, bead: %lu/%lu, hit: %d\n",
	       PFID(lu_object_fid(&clob->co_lu)),
	       ria->ria_start, ria->ria_end,
	       vio->vui_ra_valid ? vio->vui_ra_start : 0,
	       vio->vui_ra_valid ? vio->vui_ra_count : 0,
	       hit);

	/* at least to extend the readahead window to cover current read */
	if (!hit && vio->vui_ra_valid &&
	    vio->vui_ra_start + vio->vui_ra_count > ria->ria_start) {
		/* to the end of current read window. */
		mlen = vio->vui_ra_start + vio->vui_ra_count - ria->ria_start;
		/* trim to RPC boundary */
		start = ria->ria_start & (PTLRPC_MAX_BRW_PAGES - 1);
		mlen = min(mlen, PTLRPC_MAX_BRW_PAGES - start);
	}

	reserved = ll_ra_count_get(ll_i2sbi(inode), ria, len, mlen);
	if (reserved < len)
		ll_ra_stats_inc(inode, RA_STAT_MAX_IN_FLIGHT);

	CDEBUG(D_READA, "reserved pages %lu/%lu/%lu, ra_cur %d, ra_max %lu\n",
	       reserved, len, mlen,
	       atomic_read(&ll_i2sbi(inode)->ll_ra_info.ra_cur_pages),
	       ll_i2sbi(inode)->ll_ra_info.ra_max_pages);

	ret = ll_read_ahead_pages(env, io, queue, ria, &reserved, &ra_end);

	if (reserved != 0)
		ll_ra_count_put(ll_i2sbi(inode), reserved);

	if (ra_end == end + 1 && ra_end == (kms >> PAGE_SHIFT))
		ll_ra_stats_inc(inode, RA_STAT_EOF);

	/* if we didn't get to the end of the region we reserved from
	 * the ras we need to go back and update the ras so that the
	 * next read-ahead tries from where we left off. we only do so
	 * if the region we failed to issue read-ahead on is still ahead
	 * of the app and behind the next index to start read-ahead from.
	 */
	CDEBUG(D_READA, "ra_end %lu end %lu stride end %lu\n",
	       ra_end, end, ria->ria_end);

	if (ra_end != end + 1) {
		ll_ra_stats_inc(inode, RA_STAT_FAILED_REACH_END);
		spin_lock(&ras->ras_lock);
		if (ra_end < ras->ras_next_readahead &&
		    index_in_window(ra_end, ras->ras_window_start, 0,
				    ras->ras_window_len)) {
			ras->ras_next_readahead = ra_end;
		}
		spin_unlock(&ras->ras_lock);
	}

	return ret;
}
static void ras_set_start(struct inode *inode, struct ll_readahead_state *ras,
			  unsigned long index)
{
	ras->ras_window_start = index & (~(RAS_INCREASE_STEP(inode) - 1));
}
/* called with the ras_lock held or from places where it doesn't matter */
static void ras_reset(struct inode *inode, struct ll_readahead_state *ras,
		      unsigned long index)
{
	ras->ras_last_readpage = index;
	ras->ras_consecutive_requests = 0;
	ras->ras_consecutive_pages = 0;
	ras->ras_window_len = 0;
	ras_set_start(inode, ras, index);
	ras->ras_next_readahead = max(ras->ras_window_start, index);
}
/* called with the ras_lock held or from places where it doesn't matter */
static void ras_stride_reset(struct ll_readahead_state *ras)
{
	ras->ras_consecutive_stride_requests = 0;
	ras->ras_stride_length = 0;
	ras->ras_stride_pages = 0;
}
void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras)
{
	spin_lock_init(&ras->ras_lock);
	ras_reset(inode, ras, 0);
	ras->ras_requests = 0;
}
/*
 * Check whether the read request is in the stride window.
 * If it is in the stride window, return 1, otherwise return 0.
 */
static int index_in_stride_window(struct ll_readahead_state *ras,
				  unsigned long index)
{
	unsigned long stride_gap;

	if (ras->ras_stride_length == 0 || ras->ras_stride_pages == 0 ||
	    ras->ras_stride_pages == ras->ras_stride_length)
		return 0;

	stride_gap = index - ras->ras_last_readpage - 1;

	/* If it is contiguous read */
	if (stride_gap == 0)
		return ras->ras_consecutive_pages + 1 <= ras->ras_stride_pages;

	/* Otherwise check the stride by itself */
	return (ras->ras_stride_length - ras->ras_stride_pages) == stride_gap &&
	       ras->ras_consecutive_pages == ras->ras_stride_pages;
}
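/*
 * Illustrative note (not from the original source): with a detected stride
 * of ras_stride_length = 1024 and ras_stride_pages = 256, a read at
 * index = 1024 right after ras_last_readpage = 255 gives
 * stride_gap = 1024 - 255 - 1 = 768 = 1024 - 256, so the request is
 * accepted as the start of the next stride segment provided the previous
 * segment was read completely (ras_consecutive_pages == 256).
 */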
static void ras_update_stride_detector(struct ll_readahead_state *ras,
				       unsigned long index)
{
	unsigned long stride_gap = index - ras->ras_last_readpage - 1;

	if (!stride_io_mode(ras) && (stride_gap != 0 ||
	    ras->ras_consecutive_stride_requests == 0)) {
		ras->ras_stride_pages = ras->ras_consecutive_pages;
		ras->ras_stride_length = stride_gap + ras->ras_consecutive_pages;
	}
	LASSERT(ras->ras_request_index == 0);
	LASSERT(ras->ras_consecutive_stride_requests == 0);

	if (index <= ras->ras_last_readpage) {
		/* Reset stride window for forward read */
		ras_stride_reset(ras);
		return;
	}

	ras->ras_stride_pages = ras->ras_consecutive_pages;
	ras->ras_stride_length = stride_gap + ras->ras_consecutive_pages;
}
/* Stride Read-ahead window will be increased inc_len according to
 * the stride I/O pattern.
 */
static void ras_stride_increase_window(struct ll_readahead_state *ras,
				       struct ll_ra_info *ra,
				       unsigned long inc_len)
{
	unsigned long left, step, window_len;
	unsigned long stride_len;

	LASSERT(ras->ras_stride_length > 0);
	LASSERTF(ras->ras_window_start + ras->ras_window_len
		 >= ras->ras_stride_offset, "window_start %lu, window_len %lu stride_offset %lu\n",
		 ras->ras_window_start,
		 ras->ras_window_len, ras->ras_stride_offset);

	stride_len = ras->ras_window_start + ras->ras_window_len -
		     ras->ras_stride_offset;

	left = stride_len % ras->ras_stride_length;
	window_len = ras->ras_window_len - left;

	if (left < ras->ras_stride_pages)
		left += inc_len;
	else
		left = ras->ras_stride_pages + inc_len;

	LASSERT(ras->ras_stride_pages != 0);

	step = left / ras->ras_stride_pages;
	left %= ras->ras_stride_pages;

	window_len += step * ras->ras_stride_length + left;

	if (stride_pg_count(ras->ras_stride_offset, ras->ras_stride_length,
			    ras->ras_stride_pages, ras->ras_stride_offset,
			    window_len) <= ra->ra_max_pages_per_file)
		ras->ras_window_len = window_len;
}
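/*
 * Illustrative sketch (not from the original source): a worked example of
 * the window growth above. Assume ras_stride_offset = 0,
 * ras_window_start = 0, ras_window_len = 1024, ras_stride_length = 1024,
 * ras_stride_pages = 256 and inc_len = 256. Then stride_len = 1024,
 * left = 0 and window_len = 1024; since left < 256, left becomes 256, so
 * step = 1 and left = 0, giving window_len = 1024 + 1024 = 2048. The new
 * window covers exactly one more stride period and is kept only if its
 * stride_pg_count() (512 pages here) still fits ra_max_pages_per_file.
 */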
static void ras_increase_window(struct inode *inode,
				struct ll_readahead_state *ras,
				struct ll_ra_info *ra)
{
	/* The stretch of ra-window should be aligned with max rpc_size
	 * but the current clio architecture does not support retrieving
	 * such information from the lower layer. FIXME later
	 */
	if (stride_io_mode(ras))
		ras_stride_increase_window(ras, ra, RAS_INCREASE_STEP(inode));
	else
		ras->ras_window_len = min(ras->ras_window_len +
					  RAS_INCREASE_STEP(inode),
					  ra->ra_max_pages_per_file);
}
void ras_update(struct ll_sb_info *sbi, struct inode *inode,
		struct ll_readahead_state *ras, unsigned long index,
		bool hit)
{
	struct ll_ra_info *ra = &sbi->ll_ra_info;
	int zero = 0, stride_detect = 0, ra_miss = 0;

	spin_lock(&ras->ras_lock);

	ll_ra_stats_inc_sbi(sbi, hit ? RA_STAT_HIT : RA_STAT_MISS);

	/* reset the read-ahead window in two cases. First when the app seeks
	 * or reads to some other part of the file. Secondly if we get a
	 * read-ahead miss that we think we've previously issued. This can
	 * be a symptom of there being so many read-ahead pages that the VM
	 * is reclaiming them before we get to them.
	 */
	if (!index_in_window(index, ras->ras_last_readpage, 8, 8)) {
		zero = 1;
		ll_ra_stats_inc_sbi(sbi, RA_STAT_DISTANT_READPAGE);
	} else if (!hit && ras->ras_window_len &&
		   index < ras->ras_next_readahead &&
		   index_in_window(index, ras->ras_window_start, 0,
				   ras->ras_window_len)) {
		ra_miss = 1;
		ll_ra_stats_inc_sbi(sbi, RA_STAT_MISS_IN_WINDOW);
	}

	/* On the second access to a file smaller than the tunable
	 * ra_max_read_ahead_whole_pages trigger RA on all pages in the
	 * file up to ra_max_pages_per_file. This is simply a best effort
	 * and only occurs once per open file. Normal RA behavior is reverted
	 * to for subsequent IO. The mmap case does not increment
	 * ras_requests and thus can never trigger this behavior.
	 */
	if (ras->ras_requests == 2 && !ras->ras_request_index) {
		__u64 kms_pages;

		kms_pages = (i_size_read(inode) + PAGE_SIZE - 1) >>
			    PAGE_SHIFT;

		CDEBUG(D_READA, "kmsp %llu mwp %lu mp %lu\n", kms_pages,
		       ra->ra_max_read_ahead_whole_pages, ra->ra_max_pages_per_file);

		if (kms_pages &&
		    kms_pages <= ra->ra_max_read_ahead_whole_pages) {
			ras->ras_window_start = 0;
			ras->ras_last_readpage = 0;
			ras->ras_next_readahead = 0;
			ras->ras_window_len = min(ra->ra_max_pages_per_file,
				ra->ra_max_read_ahead_whole_pages);
			goto out_unlock;
		}
	}

	if (zero) {
		/* check whether it is in stride I/O mode */
		if (!index_in_stride_window(ras, index)) {
			if (ras->ras_consecutive_stride_requests == 0 &&
			    ras->ras_request_index == 0) {
				ras_update_stride_detector(ras, index);
				ras->ras_consecutive_stride_requests++;
			} else {
				ras_stride_reset(ras);
			}
			ras_reset(inode, ras, index);
			ras->ras_consecutive_pages++;
			goto out_unlock;
		} else {
			ras->ras_consecutive_pages = 0;
			ras->ras_consecutive_requests = 0;
			if (++ras->ras_consecutive_stride_requests > 1)
				stride_detect = 1;
		}
	} else if (ra_miss) {
		if (index_in_stride_window(ras, index) &&
		    stride_io_mode(ras)) {
			/* If stride-RA hit cache miss, the stride detector
			 * will not be reset to avoid the overhead of
			 * redetecting read-ahead mode.
			 */
			if (index != ras->ras_last_readpage + 1)
				ras->ras_consecutive_pages = 0;
			ras_reset(inode, ras, index);
		} else {
			/* Reset both stride window and normal RA
			 * window.
			 */
			ras_reset(inode, ras, index);
			ras->ras_consecutive_pages++;
			ras_stride_reset(ras);
			goto out_unlock;
		}
	} else if (stride_io_mode(ras)) {
		/* If this is contiguous read but in stride I/O mode
		 * currently, check whether stride step still is valid,
		 * if invalid, it will reset the stride ra window.
		 */
		if (!index_in_stride_window(ras, index)) {
			/* Shrink stride read-ahead window to be zero */
			ras_stride_reset(ras);
			ras->ras_window_len = 0;
			ras->ras_next_readahead = index;
		}
	}

	ras->ras_consecutive_pages++;
	ras->ras_last_readpage = index;
	ras_set_start(inode, ras, index);

	if (stride_io_mode(ras)) {
		/* Since stride readahead is sensitive to the offset
		 * of read-ahead, so we use original offset here,
		 * instead of ras_window_start, which is RPC aligned.
		 */
		ras->ras_next_readahead = max(index, ras->ras_next_readahead);
	} else {
		if (ras->ras_next_readahead < ras->ras_window_start)
			ras->ras_next_readahead = ras->ras_window_start;
		if (!hit)
			ras->ras_next_readahead = index + 1;
	}

	/* Trigger RA in the mmap case where ras_consecutive_requests
	 * is not incremented and thus can't be used to trigger RA.
	 */
	if (!ras->ras_window_len && ras->ras_consecutive_pages == 4) {
		ras->ras_window_len = RAS_INCREASE_STEP(inode);
		goto out_unlock;
	}

	/* Initially reset the stride window offset to next_readahead */
	if (ras->ras_consecutive_stride_requests == 2 && stride_detect) {
		/*
		 * Once stride IO mode is detected, next_readahead should be
		 * reset to make sure next_readahead > stride offset.
		 */
		ras->ras_next_readahead = max(index, ras->ras_next_readahead);
		ras->ras_stride_offset = index;
		ras->ras_window_len = RAS_INCREASE_STEP(inode);
	}

	/* The initial ras_window_len is set to the request size. To avoid
	 * uselessly reading and discarding pages for random IO the window is
	 * only increased once per consecutive request received.
	 */
	if ((ras->ras_consecutive_requests > 1 || stride_detect) &&
	    !ras->ras_request_index)
		ras_increase_window(inode, ras, ra);

out_unlock:
	ras->ras_request_index++;
	spin_unlock(&ras->ras_lock);
}
int ll_writepage(struct page *vmpage, struct writeback_control *wbc)
{
	struct inode *inode = vmpage->mapping->host;
	struct ll_inode_info *lli = ll_i2info(inode);
	struct lu_env *env;
	struct cl_io *io;
	struct cl_page *page;
	struct cl_object *clob;
	struct cl_env_nest nest;
	bool redirtied = false;
	bool unlocked = false;
	int result;

	LASSERT(PageLocked(vmpage));
	LASSERT(!PageWriteback(vmpage));

	LASSERT(ll_i2dtexp(inode));

	env = cl_env_nested_get(&nest);
	if (IS_ERR(env)) {
		result = PTR_ERR(env);
		goto out;
	}

	clob = ll_i2info(inode)->lli_clob;
	LASSERT(clob);

	io = vvp_env_thread_io(env);
	io->ci_obj = clob;
	io->ci_ignore_layout = 1;
	result = cl_io_init(env, io, CIT_MISC, clob);
	if (result == 0) {
		page = cl_page_find(env, clob, vmpage->index,
				    vmpage, CPT_CACHEABLE);
		if (!IS_ERR(page)) {
			lu_ref_add(&page->cp_reference, "writepage",
				   current);
			cl_page_assume(env, io, page);
			result = cl_page_flush(env, io, page);
			if (result != 0) {
				/*
				 * Re-dirty page on error so it retries write,
				 * but not in case when IO has actually
				 * occurred and completed with an error.
				 */
				if (!PageError(vmpage)) {
					redirty_page_for_writepage(wbc, vmpage);
					result = 0;
					redirtied = true;
				}
			}
			cl_page_disown(env, io, page);
			unlocked = true;
			lu_ref_del(&page->cp_reference,
				   "writepage", current);
			cl_page_put(env, page);
		} else {
			result = PTR_ERR(page);
		}
	}
	cl_io_fini(env, io);

	if (redirtied && wbc->sync_mode == WB_SYNC_ALL) {
		loff_t offset = cl_offset(clob, vmpage->index);

		/* Flush page failed because the extent is being written out.
		 * Wait for the write of extent to be finished to avoid
		 * breaking kernel which assumes ->writepage should mark
		 * PageWriteback or clean the page.
		 */
		result = cl_sync_file_range(inode, offset,
					    offset + PAGE_SIZE - 1,
					    CL_FSYNC_LOCAL, 1);
		if (result > 0) {
			/* actually we may have written more than one page.
			 * decreasing this page because the caller will count
			 * it.
			 */
			wbc->nr_to_write -= result - 1;
			result = 0;
		}
	}

	cl_env_nested_put(&nest, env);

out:
	if (result < 0) {
		if (!lli->lli_async_rc)
			lli->lli_async_rc = result;
		SetPageError(vmpage);
		if (!unlocked)
			unlock_page(vmpage);
	}
	return result;
}
int ll_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
	struct inode *inode = mapping->host;
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	loff_t start;
	loff_t end;
	enum cl_fsync_mode mode;
	int range_whole = 0;
	int result;
	int ignore_layout = 0;

	if (wbc->range_cyclic) {
		start = mapping->writeback_index << PAGE_SHIFT;
		end = OBD_OBJECT_EOF;
	} else {
		start = wbc->range_start;
		end = wbc->range_end;
		if (end == LLONG_MAX) {
			end = OBD_OBJECT_EOF;
			range_whole = start == 0;
		}
	}

	mode = CL_FSYNC_NONE;
	if (wbc->sync_mode == WB_SYNC_ALL)
		mode = CL_FSYNC_LOCAL;

	if (sbi->ll_umounting)
		/* if the mountpoint is being umounted, all pages have to be
		 * evicted to avoid hitting LBUG when truncate_inode_pages()
		 * is called later on.
		 */
		ignore_layout = 1;

	result = cl_sync_file_range(inode, start, end, mode, ignore_layout);
	if (result > 0) {
		wbc->nr_to_write -= result;
		result = 0;
	}

	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) {
		if (end == OBD_OBJECT_EOF)
			mapping->writeback_index = 0;
		else
			mapping->writeback_index = (end >> PAGE_SHIFT) + 1;
	}
	return result;
}
struct ll_cl_context *ll_cl_find(struct file *file)
{
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ll_cl_context *lcc;
	struct ll_cl_context *found = NULL;

	read_lock(&fd->fd_lock);
	list_for_each_entry(lcc, &fd->fd_lccs, lcc_list) {
		if (lcc->lcc_cookie == current) {
			found = lcc;
			break;
		}
	}
	read_unlock(&fd->fd_lock);

	return found;
}
void ll_cl_add(struct file *file, const struct lu_env *env, struct cl_io *io)
{
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ll_cl_context *lcc = &ll_env_info(env)->lti_io_ctx;

	memset(lcc, 0, sizeof(*lcc));
	INIT_LIST_HEAD(&lcc->lcc_list);
	lcc->lcc_cookie = current;
	lcc->lcc_env = env;
	lcc->lcc_io = io;

	write_lock(&fd->fd_lock);
	list_add(&lcc->lcc_list, &fd->fd_lccs);
	write_unlock(&fd->fd_lock);
}
void ll_cl_remove(struct file *file, const struct lu_env *env)
{
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ll_cl_context *lcc = &ll_env_info(env)->lti_io_ctx;

	write_lock(&fd->fd_lock);
	list_del_init(&lcc->lcc_list);
	write_unlock(&fd->fd_lock);
}
int ll_readpage(struct file *file, struct page *vmpage)
{
	struct cl_object *clob = ll_i2info(file_inode(file))->lli_clob;
	struct ll_cl_context *lcc;
	const struct lu_env *env;
	struct cl_io *io;
	struct cl_page *page;
	int result;

	lcc = ll_cl_find(file);
	if (!lcc) {
		unlock_page(vmpage);
		return -EIO;
	}

	env = lcc->lcc_env;
	io = lcc->lcc_io;
	LASSERT(io->ci_state == CIS_IO_GOING);
	page = cl_page_find(env, clob, vmpage->index, vmpage, CPT_CACHEABLE);
	if (!IS_ERR(page)) {
		LASSERT(page->cp_type == CPT_CACHEABLE);
		if (likely(!PageUptodate(vmpage))) {
			cl_page_assume(env, io, page);
			result = cl_io_read_page(env, io, page);
		} else {
			/* Page from a non-object file. */
			unlock_page(vmpage);
			result = 0;
		}
		cl_page_put(env, page);
	} else {
		unlock_page(vmpage);
		result = PTR_ERR(page);
	}
	return result;
}
int ll_page_sync_io(const struct lu_env *env, struct cl_io *io,
		    struct cl_page *page, enum cl_req_type crt)
{
	struct cl_2queue *queue;
	int result;

	LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);

	queue = &io->ci_queue;
	cl_2queue_init_page(queue, page);

	result = cl_io_submit_sync(env, io, crt, queue, 0);
	LASSERT(cl_page_is_owned(page, io));

	if (crt == CRT_READ)
		/*
		 * in CRT_WRITE case page is left locked even in case of
		 * error.
		 */
		cl_page_list_disown(env, io, &queue->c2_qin);
	cl_2queue_fini(env, queue);

	return result;
}