/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.gnu.org/licenses/gpl-2.0.html
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2015, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 */

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/unistd.h>
#include <linux/uaccess.h>

#include <linux/fs.h>
#include <linux/pagemap.h>

#define DEBUG_SUBSYSTEM S_LLITE

#include "../include/lustre_lite.h"
#include "llite_internal.h"
#include "../include/linux/lustre_compat25.h"

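/*
 * Forward declaration: ll_file_vm_ops is defined at the bottom of this
 * file.  our_vma() below identifies Lustre file mappings by comparing a
 * VMA's vm_ops against this table.
 */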
static const struct vm_operations_struct ll_file_vm_ops;

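/*
 * Worked example for the extent computed below (hypothetical values,
 * assuming PAGE_SIZE == 4096): for a mapping with vm_start == 0x7f0000000000
 * and vm_pgoff == 16, a fault at addr == vm_start + 0x1234 with count == 1
 * gives l_extent.start == 0x1000 + (16 << 12) == 0x11000 and
 * l_extent.end == 0x11000 | 0xfff == 0x11fff, i.e. the lock extent is
 * expanded to cover whole pages of the file.
 */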
void policy_from_vma(ldlm_policy_data_t *policy,
		     struct vm_area_struct *vma, unsigned long addr,
		     size_t count)
{
	policy->l_extent.start = ((addr - vma->vm_start) & PAGE_MASK) +
				 (vma->vm_pgoff << PAGE_SHIFT);
	policy->l_extent.end = (policy->l_extent.start + count - 1) |
			       ~PAGE_MASK;
}

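/*
 * Return the first VMA in [addr, addr + count) that is a shared mapping
 * (VM_SHARED) backed by a Lustre file (vm_ops == &ll_file_vm_ops), or NULL
 * if there is none.  The caller must already hold mmap_sem.
 */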
struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr,
			       size_t count)
{
	struct vm_area_struct *vma, *ret = NULL;

	/* mmap_sem must have been held by caller. */
	LASSERT(!down_write_trylock(&mm->mmap_sem));

	for (vma = find_vma(mm, addr);
	     vma && vma->vm_start < (addr + count); vma = vma->vm_next) {
		if (vma->vm_ops && vma->vm_ops == &ll_file_vm_ops &&
		    vma->vm_flags & VM_SHARED) {
			ret = vma;
			break;
		}
	}
	return ret;
}

/**
 * API independent part for page fault initialization.
 * \param vma - virtual memory area addressed to page fault
 * \param env_ret - corresponding lu_env returned for processing
 * \param nest - nesting level
 * \param index - page index corresponding to the fault.
 * \param ra_flags - vma readahead flags.
 *
 * \return allocated and initialized env for fault operation.
 * \retval EINVAL if the env can't be allocated
 * \return other error codes from cl_io_init.
 */
static struct cl_io *
ll_fault_io_init(struct vm_area_struct *vma, struct lu_env **env_ret,
		 struct cl_env_nest *nest, pgoff_t index,
		 unsigned long *ra_flags)
{
	struct file *file = vma->vm_file;
	struct inode *inode = file_inode(file);
	struct cl_io *io;
	struct cl_fault_io *fio;
	struct lu_env *env;
	int rc;

	*env_ret = NULL;
	if (ll_file_nolock(file))
		return ERR_PTR(-EOPNOTSUPP);

	/*
	 * page fault can be called when lustre IO is
	 * already active for the current thread, e.g., when doing read/write
	 * against user level buffer mapped from Lustre buffer. To avoid
	 * stomping on existing context, optionally force an allocation of a
	 * new one.
	 */
	env = cl_env_nested_get(nest);
	if (IS_ERR(env))
		return ERR_PTR(-EINVAL);

	*env_ret = env;

restart:
	io = vvp_env_thread_io(env);
	io->ci_obj = ll_i2info(inode)->lli_clob;
	LASSERT(io->ci_obj);

	fio = &io->u.ci_fault;
	fio->ft_index = index;
	fio->ft_executable = vma->vm_flags & VM_EXEC;

	/*
	 * disable VM_SEQ_READ and use VM_RAND_READ to make sure that
	 * the kernel will not read other pages not covered by ldlm in
	 * filemap_nopage. we do our readahead in ll_readpage.
	 */
	if (ra_flags)
		*ra_flags = vma->vm_flags & (VM_RAND_READ | VM_SEQ_READ);
	vma->vm_flags &= ~VM_SEQ_READ;
	vma->vm_flags |= VM_RAND_READ;
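	/*
	 * The saved readahead bits are handed back through *ra_flags so the
	 * caller (see ll_fault0()) can restore them on the VMA once the
	 * fault I/O has completed.
	 */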

	CDEBUG(D_MMAP, "vm_flags: %lx (%lu %d)\n", vma->vm_flags,
	       fio->ft_index, fio->ft_executable);

	rc = cl_io_init(env, io, CIT_FAULT, io->ci_obj);
	if (rc == 0) {
		struct vvp_io *vio = vvp_env_io(env);
		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

		LASSERT(vio->vui_cl.cis_io == io);

		/* mmap lock must be MANDATORY because it has to cache pages. */
		io->ci_lockreq = CILR_MANDATORY;
		vio->vui_fd = fd;
	} else {
		LASSERT(rc < 0);
		cl_io_fini(env, io);
		if (io->ci_need_restart)
			goto restart;

		cl_env_nested_put(nest, env);
		io = ERR_PTR(rc);
	}

	return io;
}

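/*
 * Core of the ->page_mkwrite handling: set up a CIT_FAULT io with
 * ft_mkwrite/ft_writable, run it with only SIGKILL and SIGTERM deliverable,
 * then re-check the page under its lock to catch truncation (-ENODATA) or a
 * writeback race (-EAGAIN, retried by the caller).
 */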
/* Sharing code of page_mkwrite method for rhel5 and rhel6 */
static int ll_page_mkwrite0(struct vm_area_struct *vma, struct page *vmpage,
			    bool *retry)
{
	struct lu_env *env;
	struct cl_io *io;
	struct vvp_io *vio;
	struct cl_env_nest nest;
	int result;
	sigset_t set;
	struct inode *inode;
	struct ll_inode_info *lli;

	io = ll_fault_io_init(vma, &env, &nest, vmpage->index, NULL);
	if (IS_ERR(io)) {
		result = PTR_ERR(io);
		goto out;
	}

	result = io->ci_result;
	if (result < 0)
		goto out_io;

	io->u.ci_fault.ft_mkwrite = 1;
	io->u.ci_fault.ft_writable = 1;

	vio = vvp_env_io(env);
	vio->u.fault.ft_vma = vma;
	vio->u.fault.ft_vmpage = vmpage;

	set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM));

	inode = vvp_object_inode(io->ci_obj);
	lli = ll_i2info(inode);

	result = cl_io_loop(env, io);

	cfs_restore_sigs(set);

	if (result == 0) {
		struct inode *inode = file_inode(vma->vm_file);
		struct ll_inode_info *lli = ll_i2info(inode);

		lock_page(vmpage);
		if (!vmpage->mapping) {
			unlock_page(vmpage);

			/* page was truncated and the lock was cancelled; return
			 * ENODATA so that VM_FAULT_NOPAGE will be returned
			 * to handle_mm_fault().
			 */
			if (result == 0)
				result = -ENODATA;
		} else if (!PageDirty(vmpage)) {
			/* race: the page has been cleaned by ptlrpcd after
			 * it was unlocked.  It has to be added back to the
			 * dirty cache, otherwise this soon-to-be-dirty page
			 * won't consume any grants; even worse, if the page
			 * is being transferred it will break the RPC checksum.
			 */
			unlock_page(vmpage);

			CDEBUG(D_MMAP, "Race on page_mkwrite %p/%lu, page has been written out, retry.\n",
			       vmpage, vmpage->index);

			*retry = true;
			result = -EAGAIN;
		}

		if (result == 0) {
			spin_lock(&lli->lli_lock);
			lli->lli_flags |= LLIF_DATA_MODIFIED;
			spin_unlock(&lli->lli_lock);
		}
	}

out_io:
	cl_io_fini(env, io);
	cl_env_nested_put(&nest, env);
out:
	CDEBUG(D_MMAP, "%s mkwrite with %d\n", current->comm, result);
	LASSERT(ergo(result == 0, PageLocked(vmpage)));

	return result;
}

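/*
 * Translate a cl_io result into the VM_FAULT_* code expected by the VM:
 * 0 means the page is returned locked, -EFAULT maps to VM_FAULT_NOPAGE,
 * -ENOMEM to VM_FAULT_OOM and anything else to VM_FAULT_SIGBUS.
 */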
static inline int to_fault_error(int result)
{
	switch (result) {
	case 0:
		result = VM_FAULT_LOCKED;
		break;
	case -EFAULT:
		result = VM_FAULT_NOPAGE;
		break;
	case -ENOMEM:
		result = VM_FAULT_OOM;
		break;
	default:
		result = VM_FAULT_SIGBUS;
		break;
	}
	return result;
}

/**
 * Lustre implementation of a vm_operations_struct::fault() method, called by
 * the VM to serve a page fault (both in kernel and user space).
 *
 * \param vma - virtual area struct related to the page fault
 * \param vmf - structure describing the type and address of the fault
 *
 * \return allocated and filled _locked_ page for the address
 * \retval VM_FAULT_ERROR on general error
 * \retval NOPAGE_OOM if there is no memory to allocate a new page
 */
static int ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct lu_env *env;
	struct cl_io *io;
	struct vvp_io *vio = NULL;
	struct page *vmpage;
	unsigned long ra_flags;
	struct cl_env_nest nest;
	int result;
	int fault_ret = 0;

	io = ll_fault_io_init(vma, &env, &nest, vmf->pgoff, &ra_flags);
	if (IS_ERR(io))
		return to_fault_error(PTR_ERR(io));

	result = io->ci_result;
	if (result == 0) {
		vio = vvp_env_io(env);
		vio->u.fault.ft_vma = vma;
		vio->u.fault.ft_vmpage = NULL;
		vio->u.fault.ft_vmf = vmf;
		vio->u.fault.ft_flags = 0;
		vio->u.fault.ft_flags_valid = false;

		/* May call ll_readpage() */
		ll_cl_add(vma->vm_file, env, io);

		result = cl_io_loop(env, io);

		ll_cl_remove(vma->vm_file, env);

		/* ft_flags are only valid if we reached
		 * the call to filemap_fault
		 */
		if (vio->u.fault.ft_flags_valid)
			fault_ret = vio->u.fault.ft_flags;

		vmpage = vio->u.fault.ft_vmpage;
		if (result != 0 && vmpage) {
			put_page(vmpage);
			vmf->page = NULL;
		}
	}
	cl_io_fini(env, io);
	cl_env_nested_put(&nest, env);

	vma->vm_flags |= ra_flags;
	if (result != 0 && !(fault_ret & VM_FAULT_RETRY))
		fault_ret |= to_fault_error(result);

	CDEBUG(D_MMAP, "%s fault %d/%d\n",
	       current->comm, fault_ret, result);
	return fault_ret;
}

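/*
 * Top-level ->fault handler: block all signals except SIGKILL and SIGTERM,
 * call ll_fault0() and, if the returned page turned out to be truncated
 * while it was unlocked, drop it and retry the whole fault.  On success the
 * page is returned locked (VM_FAULT_LOCKED).
 */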
static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	int count = 0;
	bool printed = false;
	int result;
	sigset_t set;

	/* Only SIGKILL and SIGTERM are allowed for fault/nopage/mkwrite,
	 * so that the process can be killed by an admin but other signals
	 * do not cause a segfault.
	 */
	set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM));

restart:
	result = ll_fault0(vma, vmf);
	LASSERT(!(result & VM_FAULT_LOCKED));
	if (result == 0) {
		struct page *vmpage = vmf->page;

		/* check if this page has been truncated */
		lock_page(vmpage);
		if (unlikely(!vmpage->mapping)) { /* unlucky */
			unlock_page(vmpage);
			put_page(vmpage);
			vmf->page = NULL;

			if (!printed && ++count > 16) {
				CWARN("the page is under heavy contention, maybe your app(%s) needs revising :-)\n",
				      current->comm);
				printed = true;
			}

			goto restart;
		}

		result = VM_FAULT_LOCKED;
	}
	cfs_restore_sigs(set);
	return result;
}

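/*
 * ->page_mkwrite handler: keep calling ll_page_mkwrite0() while it asks for
 * a retry (the dirtying race described above), then map the final result to
 * a VM_FAULT_* code.
 */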
static int ll_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	int count = 0;
	bool printed = false;
	bool retry;
	int result;

	do {
		retry = false;
		result = ll_page_mkwrite0(vma, vmf->page, &retry);

		if (!printed && ++count > 16) {
			const struct dentry *de = vma->vm_file->f_path.dentry;

			CWARN("app(%s): the page %lu of file "DFID" is under heavy contention\n",
			      current->comm, vmf->pgoff,
			      PFID(ll_inode2fid(de->d_inode)));
			printed = true;
		}
	} while (retry);

	switch (result) {
	case 0:
		LASSERT(PageLocked(vmf->page));
		result = VM_FAULT_LOCKED;
		break;
	case -ENODATA:
	case -EFAULT:
		result = VM_FAULT_NOPAGE;
		break;
	case -ENOMEM:
		result = VM_FAULT_OOM;
		break;
	case -EAGAIN:
		result = VM_FAULT_RETRY;
		break;
	default:
		result = VM_FAULT_SIGBUS;
		break;
	}

	return result;
}

/**
 * To avoid cancelling the locks that cover an mmapped region under lock
 * cache pressure, we track the number of mapped VMAs in
 * vvp_object::vob_mmap_cnt.
 */
static void ll_vm_open(struct vm_area_struct *vma)
{
	struct inode *inode = file_inode(vma->vm_file);
	struct vvp_object *vob = cl_inode2vvp(inode);

	LASSERT(atomic_read(&vob->vob_mmap_cnt) >= 0);
	atomic_inc(&vob->vob_mmap_cnt);
}

/**
 * Dual to ll_vm_open().
 */
static void ll_vm_close(struct vm_area_struct *vma)
{
	struct inode *inode = file_inode(vma->vm_file);
	struct vvp_object *vob = cl_inode2vvp(inode);

	atomic_dec(&vob->vob_mmap_cnt);
	LASSERT(atomic_read(&vob->vob_mmap_cnt) >= 0);
}

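/*
 * Unmap the user mappings covering the byte range [first, last] of @mapping
 * via unmap_mapping_range(), so that later accesses fault back through
 * ll_fault().  Returns 0 if the address space was mapped at all, -ENOENT
 * otherwise.
 */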
/* XXX put nice comment here. talk about __free_pte -> dirty pages and
 * nopage's reference passing to the pte
 */
int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last)
{
	int rc = -ENOENT;

	LASSERTF(last > first, "last %llu first %llu\n", last, first);
	if (mapping_mapped(mapping)) {
		rc = 0;
		unmap_mapping_range(mapping, first + PAGE_SIZE - 1,
				    last - first + 1, 0);
	}

	return rc;
}

static const struct vm_operations_struct ll_file_vm_ops = {
	.fault		= ll_fault,
	.page_mkwrite	= ll_page_mkwrite,
	.open		= ll_vm_open,
	.close		= ll_vm_close,
};

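/*
 * ->mmap method for Lustre files.  generic_file_mmap() performs the generic
 * checks and sets up the mapping; the Lustre vm_ops are installed afterwards,
 * ll_vm_open() is called explicitly so the brand-new VMA is accounted in
 * vob_mmap_cnt (the VM presumably only calls ->open itself when an existing
 * VMA is duplicated or split), and ll_glimpse_size() refreshes the inode's
 * size and mtime.
 */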
int ll_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct inode *inode = file_inode(file);
	int rc;

	if (ll_file_nolock(file))
		return -EOPNOTSUPP;

	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_MAP, 1);
	rc = generic_file_mmap(file, vma);
	if (rc == 0) {
		vma->vm_ops = &ll_file_vm_ops;
		vma->vm_ops->open(vma);
		/* update the inode's size and mtime */
		rc = ll_glimpse_size(inode);
	}

	return rc;
}