/*
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.gnu.org/licenses/gpl-2.0.html
 *
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2015, Intel Corporation.
 *
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 */
33 #include <linux/kernel.h>
35 #include <linux/string.h>
36 #include <linux/stat.h>
37 #include <linux/errno.h>
38 #include <linux/unistd.h>
39 #include <linux/uaccess.h>
42 #include <linux/pagemap.h>
44 #define DEBUG_SUBSYSTEM S_LLITE
46 #include "../include/lustre_lite.h"
47 #include "llite_internal.h"
48 #include "../include/linux/lustre_compat25.h"
/* Forward declaration: the table itself is defined at the bottom of this
 * file; our_vma() below compares vma->vm_ops against its address to tell
 * whether a VMA belongs to a Lustre file mapping.
 */
static const struct vm_operations_struct ll_file_vm_ops;
/*
 * Fill an LDLM extent-lock policy from a faulting address range in @vma.
 *
 * l_extent.start is set to the page-aligned offset of @addr within the
 * mapping plus the mapping's starting file offset (vm_pgoff, in pages,
 * shifted to bytes).
 *
 * NOTE(review): this extract is truncated — the trailing parameter (the
 * byte count used below), the function braces, and the right-hand operand
 * of the final `|` (presumably ~PAGE_MASK, rounding l_extent.end out to a
 * page boundary) are not visible here; confirm against the full source.
 */
void policy_from_vma(ldlm_policy_data_t *policy,
		     struct vm_area_struct *vma, unsigned long addr,
	/* page-aligned offset of @addr inside the file backing the mapping */
	policy->l_extent.start = ((addr - vma->vm_start) & PAGE_MASK) +
				 (vma->vm_pgoff << PAGE_SHIFT);
	/* inclusive end of the extent covering `count` bytes from start */
	policy->l_extent.end = (policy->l_extent.start + count - 1) |
/*
 * Find the first Lustre-owned, shared VMA overlapping [addr, addr + count).
 *
 * Walks the VMA list from find_vma(mm, addr) and matches the first area
 * whose vm_ops points at this file's ll_file_vm_ops table and which is
 * mapped VM_SHARED.
 *
 * NOTE(review): extract is truncated — the trailing `count` parameter, the
 * body that records the match in `ret` and breaks, and the final return are
 * not visible here.
 */
struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr,
	struct vm_area_struct *vma, *ret = NULL;

	/* mmap_sem must have been held by caller. */
	LASSERT(!down_write_trylock(&mm->mmap_sem));

	/* scan every VMA that overlaps the requested byte range */
	for (vma = find_vma(mm, addr);
	     vma && vma->vm_start < (addr + count); vma = vma->vm_next) {
		/* only VMAs we own (our vm_ops) and that are MAP_SHARED */
		if (vma->vm_ops && vma->vm_ops == &ll_file_vm_ops &&
		    vma->vm_flags & VM_SHARED) {
/**
 * API independent part for page fault initialization.
 * \param vma - virtual memory area addressed to page fault
 * \param env_ret - corresponding lu_env for processing (out)
 * \param nest - nested level
 * \param index - page index corresponding to the fault
 * \param ra_flags - vma readahead flags (out)
 *
 * \return allocated and initialized env for fault operation.
 * \retval EINVAL if env can't allocated
 * \return other error codes from cl_io_init.
 *
 * NOTE(review): this extract is missing several lines — the return-type
 * line, braces, the local declarations of env/io/rc, the IS_ERR(env)
 * check, a likely "if (ra_flags)" guard, and the cl_io_init
 * success/failure branches and final return. Do not build from this
 * fragment as-is; confirm against the full source.
 */
ll_fault_io_init(struct vm_area_struct *vma, struct lu_env **env_ret,
		 struct cl_env_nest *nest, pgoff_t index,
		 unsigned long *ra_flags)
	struct file *file = vma->vm_file;
	struct inode *inode = file_inode(file);
	struct cl_fault_io *fio;

	/* no-lock mounts cannot serve mmap faults through cl_io locking */
	if (ll_file_nolock(file))
		return ERR_PTR(-EOPNOTSUPP);

	/*
	 * page fault can be called when lustre IO is
	 * already active for the current thread, e.g., when doing read/write
	 * against user level buffer mapped from Lustre buffer. To avoid
	 * stomping on existing context, optionally force an allocation of a
	 * new (nested) one.
	 */
	env = cl_env_nested_get(nest);
		return ERR_PTR(-EINVAL);

	io = vvp_env_thread_io(env);
	io->ci_obj = ll_i2info(inode)->lli_clob;

	/* describe the faulting page to the CIT_FAULT io */
	fio = &io->u.ci_fault;
	fio->ft_index = index;
	fio->ft_executable = vma->vm_flags&VM_EXEC;

	/*
	 * disable VM_SEQ_READ and use VM_RAND_READ to make sure that
	 * the kernel will not read other pages not covered by ldlm in
	 * filemap_nopage. we do our readahead in ll_readpage.
	 */
	/* save the caller's readahead flags so ll_fault0() can restore them */
	*ra_flags = vma->vm_flags & (VM_RAND_READ|VM_SEQ_READ);
	vma->vm_flags &= ~VM_SEQ_READ;
	vma->vm_flags |= VM_RAND_READ;

	CDEBUG(D_MMAP, "vm_flags: %lx (%lu %d)\n", vma->vm_flags,
	       fio->ft_index, fio->ft_executable);

	rc = cl_io_init(env, io, CIT_FAULT, io->ci_obj);
		struct vvp_io *vio = vvp_env_io(env);
		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

		LASSERT(vio->vui_cl.cis_io == io);

		/* mmap lock must be MANDATORY it has to cache pages. */
		io->ci_lockreq = CILR_MANDATORY;
	if (io->ci_need_restart)
	cl_env_nested_put(nest, env);
/* Sharing code of page_mkwrite method for rhel5 and rhel6 */
/*
 * Core of the ->page_mkwrite() handler: make @vmpage writable by running a
 * CIT_FAULT cl_io with ft_mkwrite/ft_writable set, with all signals except
 * SIGKILL and SIGTERM blocked for the duration of the IO loop.
 *
 * NOTE(review): this extract is truncated — the trailing parameter (a retry
 * flag), braces, several local declarations (env, io, vio, set, result,
 * inode), the IS_ERR(io)/ci_result error branches, and the retry/return
 * logic are not visible here; confirm against the full source.
 */
static int ll_page_mkwrite0(struct vm_area_struct *vma, struct page *vmpage,
	struct cl_env_nest nest;
	struct ll_inode_info *lli;

	io = ll_fault_io_init(vma, &env, &nest, vmpage->index, NULL);
	result = PTR_ERR(io);
	result = io->ci_result;

	/* this is a mkwrite fault: ask the cl_io layer for a writable page */
	io->u.ci_fault.ft_mkwrite = 1;
	io->u.ci_fault.ft_writable = 1;

	vio = vvp_env_io(env);
	vio->u.fault.ft_vma = vma;
	vio->u.fault.ft_vmpage = vmpage;

	/* allow only SIGKILL/SIGTERM to interrupt the IO */
	set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM));

	inode = vvp_object_inode(io->ci_obj);
	lli = ll_i2info(inode);

	result = cl_io_loop(env, io);

	cfs_restore_sigs(set);

		struct inode *inode = file_inode(vma->vm_file);
		struct ll_inode_info *lli = ll_i2info(inode);

		if (!vmpage->mapping) {
			/* page was truncated and lock was cancelled, return
			 * ENODATA so that VM_FAULT_NOPAGE will be returned
			 * to handle_mm_fault().
			 */
		} else if (!PageDirty(vmpage)) {
			/* race, the page has been cleaned by ptlrpcd after
			 * it was unlocked, it has to be added into dirty
			 * cache again otherwise this soon-to-dirty page won't
			 * consume any grants, even worse if this page is being
			 * transferred because it will break RPC checksum.
			 */
			CDEBUG(D_MMAP, "Race on page_mkwrite %p/%lu, page has been written out, retry.\n",
			       vmpage, vmpage->index);

	/* record under lli_lock that the file data has been modified */
	spin_lock(&lli->lli_lock);
	lli->lli_flags |= LLIF_DATA_MODIFIED;
	spin_unlock(&lli->lli_lock);

	cl_env_nested_put(&nest, env);

	CDEBUG(D_MMAP, "%s mkwrite with %d\n", current->comm, result);
	/* on success the page must still be locked for the caller */
	LASSERT(ergo(result == 0, PageLocked(vmpage)));
/*
 * Translate a cl_io result code into a VM_FAULT_* code for the VM layer.
 *
 * NOTE(review): the switch scaffolding (case labels, breaks, braces and
 * the final return) is missing from this extract; it appears to map
 * 0 -> VM_FAULT_LOCKED, one error -> VM_FAULT_NOPAGE, -ENOMEM ->
 * VM_FAULT_OOM, and everything else -> VM_FAULT_SIGBUS — confirm against
 * the full source.
 */
static inline int to_fault_error(int result)
		result = VM_FAULT_LOCKED;
		result = VM_FAULT_NOPAGE;
		result = VM_FAULT_OOM;
		result = VM_FAULT_SIGBUS;
/**
 * Lustre implementation of a vm_operations_struct::fault() method, called by
 * VM to server page fault (both in kernel and user space).
 *
 * \param vma - is virtual area struct related to page fault
 * \param vmf - structure which describe type and address where hit fault
 *
 * \return allocated and filled _locked_ page for address
 * \retval VM_FAULT_ERROR on general error
 * \retval NOPAGE_OOM not have memory for allocate new page
 *
 * NOTE(review): extract is truncated — braces, the declarations of
 * env/io/vmpage/result/fault_ret, the IS_ERR(io) and ci_result branches,
 * the unlock-and-release of a partially faulted page, and the final return
 * are not visible here; confirm against the full source.
 */
static int ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf)
	struct vvp_io *vio = NULL;
	unsigned long ra_flags;
	struct cl_env_nest nest;

	io = ll_fault_io_init(vma, &env, &nest, vmf->pgoff, &ra_flags);
		return to_fault_error(PTR_ERR(io));

	result = io->ci_result;
		vio = vvp_env_io(env);
		vio->u.fault.ft_vma = vma;
		vio->u.fault.ft_vmpage = NULL;
		vio->u.fault.ft_vmf = vmf;
		vio->u.fault.ft_flags = 0;
		vio->u.fault.ft_flags_valid = false;

		/* May call ll_readpage() */
		ll_cl_add(vma->vm_file, env, io);

		result = cl_io_loop(env, io);

		ll_cl_remove(vma->vm_file, env);

		/* ft_flags are only valid if we reached
		 * the call to filemap_fault
		 */
		if (vio->u.fault.ft_flags_valid)
			fault_ret = vio->u.fault.ft_flags;

		vmpage = vio->u.fault.ft_vmpage;
		if (result != 0 && vmpage) {
	cl_env_nested_put(&nest, env);

	/* restore the readahead flags saved by ll_fault_io_init() */
	vma->vm_flags |= ra_flags;
	if (result != 0 && !(fault_ret & VM_FAULT_RETRY))
		fault_ret |= to_fault_error(result);

	CDEBUG(D_MMAP, "%s fault %d/%d\n",
	       current->comm, fault_ret, result);
/*
 * vm_operations_struct::fault() entry point: run ll_fault0() with all
 * signals except SIGKILL and SIGTERM blocked, retrying when the returned
 * page turns out to have been truncated concurrently.
 *
 * NOTE(review): extract is truncated — braces, the sigset/result/count
 * locals, the retry loop around ll_fault0(), the page-lock handling in the
 * truncation branch, and the final return are not visible here.
 */
static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
	bool printed = false;

	/* Only SIGKILL and SIGTERM are allowed for fault/nopage/mkwrite
	 * so that it can be killed by admin but not cause segfault by
	 * other signals.
	 */
	set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM));

	result = ll_fault0(vma, vmf);
	/* ll_fault0() must not return a still-locked page to us directly */
	LASSERT(!(result & VM_FAULT_LOCKED));
		struct page *vmpage = vmf->page;

		/* check if this page has been truncated */
		if (unlikely(!vmpage->mapping)) { /* unlucky */
			/* rate-limited nag when the same fault keeps racing */
			if (!printed && ++count > 16) {
				CWARN("the page is under heavy contention, maybe your app(%s) needs revising :-)\n",
		result = VM_FAULT_LOCKED;
	cfs_restore_sigs(set);
/*
 * vm_operations_struct::page_mkwrite() entry point: retry
 * ll_page_mkwrite0() until it stops asking for a retry, then translate its
 * result into a VM_FAULT_* code.
 *
 * NOTE(review): extract is truncated — braces, the retry/result/count
 * locals, the retry loop, the switch scaffolding around the VM_FAULT_*
 * assignments (apparently 0 -> LOCKED, some errors -> NOPAGE, -ENOMEM ->
 * OOM, -EAGAIN -> RETRY, default -> SIGBUS), and the final return are not
 * visible here; confirm against the full source.
 */
static int ll_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
	bool printed = false;

	result = ll_page_mkwrite0(vma, vmf->page, &retry);

	/* rate-limited warning when the page keeps getting written out */
	if (!printed && ++count > 16) {
		const struct dentry *de = vma->vm_file->f_path.dentry;

		CWARN("app(%s): the page %lu of file "DFID" is under heavy contention\n",
		      current->comm, vmf->pgoff,
		      PFID(ll_inode2fid(de->d_inode)));

	/* on success the page is returned locked to the VM */
	LASSERT(PageLocked(vmf->page));
	result = VM_FAULT_LOCKED;
	result = VM_FAULT_NOPAGE;
	result = VM_FAULT_OOM;
	result = VM_FAULT_RETRY;
	result = VM_FAULT_SIGBUS;
424 * To avoid cancel the locks covering mmapped region for lock cache pressure,
425 * we track the mapped vma count in vvp_object::vob_mmap_cnt.
427 static void ll_vm_open(struct vm_area_struct
*vma
)
429 struct inode
*inode
= file_inode(vma
->vm_file
);
430 struct vvp_object
*vob
= cl_inode2vvp(inode
);
432 LASSERT(vma
->vm_file
);
433 LASSERT(atomic_read(&vob
->vob_mmap_cnt
) >= 0);
434 atomic_inc(&vob
->vob_mmap_cnt
);
438 * Dual to ll_vm_open().
440 static void ll_vm_close(struct vm_area_struct
*vma
)
442 struct inode
*inode
= file_inode(vma
->vm_file
);
443 struct vvp_object
*vob
= cl_inode2vvp(inode
);
445 LASSERT(vma
->vm_file
);
446 atomic_dec(&vob
->vob_mmap_cnt
);
447 LASSERT(atomic_read(&vob
->vob_mmap_cnt
) >= 0);
/* XXX put nice comment here. talk about __free_pte -> dirty pages and
 * nopage's reference passing to the pte
 */
/*
 * Unmap the byte range [first, last] of @mapping from every process that
 * has it mmapped, so subsequent faults go back through ll_fault().
 *
 * NOTE(review): extract is truncated — braces, the declaration and
 * initialization of the return-code local, its assignment in the mapped
 * branch, and the final return are not visible here; confirm against the
 * full source.
 */
int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last)
	LASSERTF(last > first, "last %llu first %llu\n", last, first);
	if (mapping_mapped(mapping)) {
		/* round `first` up so only whole pages are unmapped */
		unmap_mapping_range(mapping, first + PAGE_SIZE - 1,
				    last - first + 1, 0);
/*
 * vm_operations for Lustre file mappings, installed by ll_file_mmap().
 *
 * NOTE(review): extract is truncated — the .fault and .open initializers
 * (presumably ll_fault and ll_vm_open, both defined above) and the closing
 * brace of the initializer are not visible here.
 */
static const struct vm_operations_struct ll_file_vm_ops = {
	.page_mkwrite	= ll_page_mkwrite,
	.close		= ll_vm_close,
474 int ll_file_mmap(struct file
*file
, struct vm_area_struct
*vma
)
476 struct inode
*inode
= file_inode(file
);
479 if (ll_file_nolock(file
))
482 ll_stats_ops_tally(ll_i2sbi(inode
), LPROC_LL_MAP
, 1);
483 rc
= generic_file_mmap(file
, vma
);
485 vma
->vm_ops
= &ll_file_vm_ops
;
486 vma
->vm_ops
->open(vma
);
487 /* update the inode's size and mtime */
488 rc
= ll_glimpse_size(inode
);