/*
 * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/types.h>
#include <linux/sched.h>
#include <linux/pid.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/vmalloc.h>

#include <rdma/ib_verbs.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>

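/*
 * Per-umem notifier accounting: each invalidation that touches a umem
 * calls start_account() when it begins and end_account() when it is
 * done. While notifiers_count is non-zero, page faults on the umem are
 * held off; notifiers_seq is bumped on completion so that a fault
 * handler which sampled the sequence number earlier can detect, via
 * ib_umem_mmu_notifier_retry(), that its pages may have been
 * invalidated concurrently.
 */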
static void ib_umem_notifier_start_account(struct ib_umem *item)
{
	mutex_lock(&item->odp_data->umem_mutex);

	/* Only update private counters for this umem if it has them.
	 * Otherwise skip it. All page faults will be delayed for this umem. */
	if (item->odp_data->mn_counters_active) {
		int notifiers_count = item->odp_data->notifiers_count++;

		if (notifiers_count == 0)
			/* Initialize the completion object for waiting on
			 * notifiers. Since notifiers_count is zero, no one
			 * should be waiting right now. */
			reinit_completion(&item->odp_data->notifier_completion);
	}
	mutex_unlock(&item->odp_data->umem_mutex);
}

static void ib_umem_notifier_end_account(struct ib_umem *item)
{
	mutex_lock(&item->odp_data->umem_mutex);

	/* Only update private counters for this umem if it has them.
	 * Otherwise skip it. All page faults will be delayed for this umem. */
	if (item->odp_data->mn_counters_active) {
		/*
		 * This sequence increase will notify the QP page-fault handler
		 * that the page that is going to be mapped in the spte could
		 * have been freed.
		 */
		++item->odp_data->notifiers_seq;
		if (--item->odp_data->notifiers_count == 0)
			complete_all(&item->odp_data->notifier_completion);
	}
	mutex_unlock(&item->odp_data->umem_mutex);
}

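/*
 * Context-level accounting: notifier_count tracks how many notifiers
 * are currently running anywhere in the ib_ucontext. A umem added
 * while notifiers are in flight cannot safely initialize its private
 * counters, so it is parked on the no_private_counters list and
 * activated in ib_ucontext_notifier_end_account() once the
 * context-wide count drops back to zero.
 */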
/* Account for a new mmu notifier in an ib_ucontext. */
static void ib_ucontext_notifier_start_account(struct ib_ucontext *context)
{
	atomic_inc(&context->notifier_count);
}

/* Account for a terminating mmu notifier in an ib_ucontext.
 *
 * Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since
 * the function takes the semaphore itself. */
static void ib_ucontext_notifier_end_account(struct ib_ucontext *context)
{
	int zero_notifiers = atomic_dec_and_test(&context->notifier_count);

	if (zero_notifiers &&
	    !list_empty(&context->no_private_counters)) {
		/* No currently running mmu notifiers. Now is the chance to
		 * add private accounting to all previously added umems. */
		struct ib_umem_odp *odp_data, *next;

		/* Prevent concurrent mmu notifiers from working on the
		 * no_private_counters list. */
		down_write(&context->umem_rwsem);

		/* Read the notifier_count again, with the umem_rwsem
		 * semaphore taken for write. */
		if (!atomic_read(&context->notifier_count)) {
			list_for_each_entry_safe(odp_data, next,
						 &context->no_private_counters,
						 no_private_counters) {
				mutex_lock(&odp_data->umem_mutex);
				odp_data->mn_counters_active = true;
				list_del(&odp_data->no_private_counters);
				complete_all(&odp_data->notifier_completion);
				mutex_unlock(&odp_data->umem_mutex);
			}
		}

		up_write(&context->umem_rwsem);
	}
}

static int ib_umem_notifier_release_trampoline(struct ib_umem *item, u64 start,
					       u64 end, void *cookie)
{
	/*
	 * Increase the number of notifiers running, to
	 * prevent any further fault handling on this MR.
	 */
	ib_umem_notifier_start_account(item);
	item->odp_data->dying = 1;
	/* Make sure that the fact the umem is dying is visible before we
	 * release all pending page faults. */
	smp_wmb();
	complete_all(&item->odp_data->notifier_completion);
	item->context->invalidate_range(item, ib_umem_start(item),
					ib_umem_end(item));
	return 0;
}

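/*
 * The ->release callback runs when the owning mm is about to be torn
 * down. Mark every umem in the context as dying and invalidate its
 * whole range, so pending page faults are released rather than served.
 */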
static void ib_umem_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);

	if (!context->invalidate_range)
		return;

	ib_ucontext_notifier_start_account(context);
	down_read(&context->umem_rwsem);
	rbt_ib_umem_for_each_in_range(&context->umem_tree, 0,
				      ULLONG_MAX,
				      ib_umem_notifier_release_trampoline,
				      NULL);
	up_read(&context->umem_rwsem);
}

static int invalidate_page_trampoline(struct ib_umem *item, u64 start,
				      u64 end, void *cookie)
{
	ib_umem_notifier_start_account(item);
	item->context->invalidate_range(item, start, start + PAGE_SIZE);
	ib_umem_notifier_end_account(item);
	return 0;
}

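/*
 * Single-page invalidation: the start/end accounting pair brackets the
 * driver callback directly, since invalidate_page has no separate
 * "end" notification from the mm.
 */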
static void ib_umem_notifier_invalidate_page(struct mmu_notifier *mn,
					     struct mm_struct *mm,
					     unsigned long address)
{
	struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);

	if (!context->invalidate_range)
		return;

	ib_ucontext_notifier_start_account(context);
	down_read(&context->umem_rwsem);
	rbt_ib_umem_for_each_in_range(&context->umem_tree, address,
				      address + PAGE_SIZE,
				      invalidate_page_trampoline, NULL);
	up_read(&context->umem_rwsem);
	ib_ucontext_notifier_end_account(context);
}

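/*
 * Range invalidations arrive as a start/end pair: start_account is
 * called per umem here, and the matching end_account runs only from
 * ib_umem_notifier_invalidate_range_end, so notifiers_count stays
 * elevated (and faults stay blocked) for the whole invalidation window.
 */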
static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start,
					     u64 end, void *cookie)
{
	ib_umem_notifier_start_account(item);
	item->context->invalidate_range(item, start, end);
	return 0;
}

static void ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
						    struct mm_struct *mm,
						    unsigned long start,
						    unsigned long end)
{
	struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);

	if (!context->invalidate_range)
		return;

	ib_ucontext_notifier_start_account(context);
	down_read(&context->umem_rwsem);
	rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
				      end,
				      invalidate_range_start_trampoline, NULL);
	up_read(&context->umem_rwsem);
}

static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start,
					   u64 end, void *cookie)
{
	ib_umem_notifier_end_account(item);
	return 0;
}

static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
						  struct mm_struct *mm,
						  unsigned long start,
						  unsigned long end)
{
	struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);

	if (!context->invalidate_range)
		return;

	down_read(&context->umem_rwsem);
	rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
				      end,
				      invalidate_range_end_trampoline, NULL);
	up_read(&context->umem_rwsem);
	ib_ucontext_notifier_end_account(context);
}

static struct mmu_notifier_ops ib_umem_notifiers = {
	.release                    = ib_umem_notifier_release,
	.invalidate_page            = ib_umem_notifier_invalidate_page,
	.invalidate_range_start     = ib_umem_notifier_invalidate_range_start,
	.invalidate_range_end       = ib_umem_notifier_invalidate_range_end,
};

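/*
 * ib_umem_odp_get - allocate and initialize the ODP state of a umem:
 * the per-page page_list and dma_list arrays, the interval-tree node,
 * and, for the first ODP MR in the context, the mmu notifier
 * registration. Returns 0 on success and a negative error code on
 * failure; called from a process other than the context's creator it
 * fails with -EINVAL, since ODP MRs must not be created in child
 * processes.
 */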
int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem)
{
	int ret_val;
	struct pid *our_pid;
	struct mm_struct *mm = get_task_mm(current);

	if (!mm)
		return -EINVAL;

	/* Prevent creating ODP MRs in child processes */
	rcu_read_lock();
	our_pid = get_task_pid(current->group_leader, PIDTYPE_PID);
	rcu_read_unlock();
	put_pid(our_pid);
	if (context->tgid != our_pid) {
		ret_val = -EINVAL;
		goto out_mm;
	}

	umem->hugetlb = 0;
	umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL);
	if (!umem->odp_data) {
		ret_val = -ENOMEM;
		goto out_mm;
	}
	umem->odp_data->umem = umem;

	mutex_init(&umem->odp_data->umem_mutex);

	init_completion(&umem->odp_data->notifier_completion);

	umem->odp_data->page_list = vzalloc(ib_umem_num_pages(umem) *
					    sizeof(*umem->odp_data->page_list));
	if (!umem->odp_data->page_list) {
		ret_val = -ENOMEM;
		goto out_odp_data;
	}

	umem->odp_data->dma_list = vzalloc(ib_umem_num_pages(umem) *
					   sizeof(*umem->odp_data->dma_list));
	if (!umem->odp_data->dma_list) {
		ret_val = -ENOMEM;
		goto out_page_list;
	}

	/*
	 * When using MMU notifiers, we will get a
	 * notification before the "current" task (and MM) is
	 * destroyed. We use the umem_rwsem semaphore to synchronize.
	 */
	down_write(&context->umem_rwsem);
	context->odp_mrs_count++;
	if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
		rbt_ib_umem_insert(&umem->odp_data->interval_tree,
				   &context->umem_tree);
	if (likely(!atomic_read(&context->notifier_count)) ||
	    context->odp_mrs_count == 1)
		umem->odp_data->mn_counters_active = true;
	else
		list_add(&umem->odp_data->no_private_counters,
			 &context->no_private_counters);
	downgrade_write(&context->umem_rwsem);

	if (context->odp_mrs_count == 1) {
		/*
		 * Note that at this point, no MMU notifier is running
		 * for this context!
		 */
		atomic_set(&context->notifier_count, 0);
		INIT_HLIST_NODE(&context->mn.hlist);
		context->mn.ops = &ib_umem_notifiers;
		/*
		 * Lockdep detects a false positive for mmap_sem vs.
		 * umem_rwsem, due to not grasping downgrade_write correctly.
		 */
		lockdep_off();
		ret_val = mmu_notifier_register(&context->mn, mm);
		lockdep_on();
		if (ret_val) {
			pr_err("Failed to register mmu_notifier %d\n", ret_val);
			ret_val = -EBUSY;
			goto out_mutex;
		}
	}

	up_read(&context->umem_rwsem);

	/*
	 * Note that doing an mmput can cause a notifier for the relevant mm.
	 * If the notifier is called while we hold the umem_rwsem, this will
	 * cause a deadlock. Therefore, we release the reference only after we
	 * released the semaphore.
	 */
	mmput(mm);
	return 0;

out_mutex:
	up_read(&context->umem_rwsem);
	vfree(umem->odp_data->dma_list);
out_page_list:
	vfree(umem->odp_data->page_list);
out_odp_data:
	kfree(umem->odp_data);
out_mm:
	mmput(mm);
	return ret_val;
}

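/*
 * ib_umem_odp_release - undo ib_umem_odp_get: unmap any pages still
 * mapped, remove the umem from the context's interval tree, and, when
 * the last ODP MR of the context goes away, unregister the mmu
 * notifier. The umem itself is freed here as well.
 */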
void ib_umem_odp_release(struct ib_umem *umem)
{
	struct ib_ucontext *context = umem->context;

	/*
	 * Ensure that no more pages are mapped in the umem.
	 *
	 * It is the driver's responsibility to ensure, before calling us,
	 * that the hardware will not attempt to access the MR any more.
	 */
	ib_umem_odp_unmap_dma_pages(umem, ib_umem_start(umem),
				    ib_umem_end(umem));

	down_write(&context->umem_rwsem);
	if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
		rbt_ib_umem_remove(&umem->odp_data->interval_tree,
				   &context->umem_tree);
	context->odp_mrs_count--;
	if (!umem->odp_data->mn_counters_active) {
		list_del(&umem->odp_data->no_private_counters);
		complete_all(&umem->odp_data->notifier_completion);
	}

	/*
	 * Downgrade the lock to a read lock. This ensures that the notifiers
	 * (which take the semaphore for reading) will be able to finish, and
	 * we will be able to eventually obtain the mmu notifier's SRCU. Note
	 * that since we are doing it atomically, no other user could register
	 * and unregister while we do the check.
	 */
	downgrade_write(&context->umem_rwsem);
	if (!context->odp_mrs_count) {
		struct task_struct *owning_process = NULL;
		struct mm_struct *owning_mm = NULL;

		owning_process = get_pid_task(context->tgid,
					      PIDTYPE_PID);
		if (owning_process == NULL)
			/*
			 * The process is already dead, and its notifiers
			 * were removed already.
			 */
			goto out;

		owning_mm = get_task_mm(owning_process);
		if (owning_mm == NULL)
			/*
			 * The process' mm is already dead, and its notifiers
			 * were removed already.
			 */
			goto out_put_task;
		mmu_notifier_unregister(&context->mn, owning_mm);

		mmput(owning_mm);

out_put_task:
		put_task_struct(owning_process);
	}
out:
	up_read(&context->umem_rwsem);

	vfree(umem->odp_data->dma_list);
	vfree(umem->odp_data->page_list);
	kfree(umem->odp_data);
	kfree(umem);
}

/*
 * Map for DMA and insert a single page into the on-demand paging page tables.
 *
 * @umem: the umem to insert the page into.
 * @page_index: index in the umem to add the page to.
 * @base_virt_addr: base virtual address, used together with @page_index to
 *                  compute the address range passed to
 *                  invalidate_page_trampoline() on error.
 * @page: the page struct to map and add.
 * @access_mask: access permissions needed for this page.
 * @current_seq: sequence number for synchronization with invalidations.
 *               the sequence number is taken from
 *               umem->odp_data->notifiers_seq.
 *
 * The function returns -EFAULT if the DMA mapping operation fails. It returns
 * -EAGAIN if a concurrent invalidation prevents us from updating the page.
 *
 * The page is released via put_page even if the operation failed. For
 * on-demand pinning, the page is released whenever it isn't stored in the
 * umem.
 */
static int ib_umem_odp_map_dma_single_page(
		struct ib_umem *umem,
		int page_index,
		u64 base_virt_addr,
		struct page *page,
		u64 access_mask,
		unsigned long current_seq)
{
	struct ib_device *dev = umem->context->device;
	dma_addr_t dma_addr;
	int stored_page = 0;
	int remove_existing_mapping = 0;
	int ret = 0;

	/*
	 * Note: we avoid writing if seq is different from the initial seq, to
	 * handle case of a racing notifier. This check also allows us to bail
	 * early if we have a notifier running in parallel with us.
	 */
	if (ib_umem_mmu_notifier_retry(umem, current_seq)) {
		ret = -EAGAIN;
		goto out;
	}
	if (!(umem->odp_data->dma_list[page_index])) {
		dma_addr = ib_dma_map_page(dev,
					   page,
					   0, PAGE_SIZE,
					   DMA_BIDIRECTIONAL);
		if (ib_dma_mapping_error(dev, dma_addr)) {
			ret = -EFAULT;
			goto out;
		}
		umem->odp_data->dma_list[page_index] = dma_addr | access_mask;
		umem->odp_data->page_list[page_index] = page;
		stored_page = 1;
	} else if (umem->odp_data->page_list[page_index] == page) {
		umem->odp_data->dma_list[page_index] |= access_mask;
	} else {
		pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n",
		       umem->odp_data->page_list[page_index], page);
		/* Better remove the mapping now, to prevent any further
		 * damage. */
		remove_existing_mapping = 1;
	}

out:
	/* On Demand Paging - avoid pinning the page */
	if (umem->context->invalidate_range || !stored_page)
		put_page(page);

	if (remove_existing_mapping && umem->context->invalidate_range) {
		invalidate_page_trampoline(
			umem,
			base_virt_addr + (page_index * PAGE_SIZE),
			base_virt_addr + ((page_index + 1) * PAGE_SIZE),
			NULL);
		ret = -EAGAIN;
	}

	return ret;
}

/**
 * ib_umem_odp_map_dma_pages - Pin and DMA map userspace memory in an ODP MR.
 *
 * Pins the range of pages passed in the argument, and maps them to
 * DMA addresses. The DMA addresses of the mapped pages are updated in
 * umem->odp_data->dma_list.
 *
 * Returns the number of pages mapped on success, and a negative error code
 * on failure.
 * An -EAGAIN error code is returned when a concurrent mmu notifier prevents
 * the function from completing its task.
 *
 * @umem: the umem to map and pin
 * @user_virt: the address from which we need to map.
 * @bcnt: the minimal number of bytes to pin and map. The mapping might be
 *        bigger due to alignment, and may also be smaller in case of an error
 *        pinning or mapping a page. The actual number of pages mapped is
 *        returned in the return value.
 * @access_mask: bit mask of the requested access permissions for the given
 *               range.
 * @current_seq: the MMU notifiers sequence value for synchronization with
 *               invalidations. The sequence number is read from
 *               umem->odp_data->notifiers_seq before calling this function.
 */
int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
			      u64 access_mask, unsigned long current_seq)
{
	struct task_struct *owning_process = NULL;
	struct mm_struct *owning_mm = NULL;
	struct page **local_page_list = NULL;
	u64 off;
	int j, k, ret = 0, start_idx, npages = 0;
	u64 base_virt_addr;

	if (access_mask == 0)
		return -EINVAL;

	if (user_virt < ib_umem_start(umem) ||
	    user_virt + bcnt > ib_umem_end(umem))
		return -EFAULT;

	local_page_list = (struct page **)__get_free_page(GFP_KERNEL);
	if (!local_page_list)
		return -ENOMEM;

	off = user_virt & (~PAGE_MASK);
	user_virt = user_virt & PAGE_MASK;
	base_virt_addr = user_virt;
	bcnt += off; /* Charge for the first page offset as well. */

	owning_process = get_pid_task(umem->context->tgid, PIDTYPE_PID);
	if (owning_process == NULL) {
		ret = -EINVAL;
		goto out_no_task;
	}

	owning_mm = get_task_mm(owning_process);
	if (owning_mm == NULL) {
		ret = -EINVAL;
		goto out_put_task;
	}

	start_idx = (user_virt - ib_umem_start(umem)) >> PAGE_SHIFT;
	k = start_idx;

	while (bcnt > 0) {
		const size_t gup_num_pages =
			min_t(size_t, ALIGN(bcnt, PAGE_SIZE) / PAGE_SIZE,
			      PAGE_SIZE / sizeof(struct page *));

		down_read(&owning_mm->mmap_sem);
		/*
		 * Note: this might result in redundant page getting. We can
		 * avoid this by checking dma_list to be 0 before calling
		 * get_user_pages. However, this makes the code much more
		 * complex (and doesn't gain us much performance in most use
		 * cases).
		 */
		npages = get_user_pages(owning_process, owning_mm, user_virt,
					gup_num_pages,
					access_mask & ODP_WRITE_ALLOWED_BIT, 0,
					local_page_list, NULL);
		up_read(&owning_mm->mmap_sem);

		if (npages < 0)
			break;

		bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt);
		user_virt += npages << PAGE_SHIFT;
		mutex_lock(&umem->odp_data->umem_mutex);
		for (j = 0; j < npages; ++j) {
			ret = ib_umem_odp_map_dma_single_page(
				umem, k, base_virt_addr, local_page_list[j],
				access_mask, current_seq);
			if (ret < 0)
				break;
			k++;
		}
		mutex_unlock(&umem->odp_data->umem_mutex);

		if (ret < 0) {
			/* Release left over pages when handling errors. */
			for (++j; j < npages; ++j)
				put_page(local_page_list[j]);
			break;
		}
	}

	if (ret >= 0) {
		if (npages < 0 && k == start_idx)
			ret = npages;
		else
			ret = k - start_idx;
	}

	mmput(owning_mm);
out_put_task:
	put_task_struct(owning_process);
out_no_task:
	free_page((unsigned long)local_page_list);
	return ret;
}
EXPORT_SYMBOL(ib_umem_odp_map_dma_pages);
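
/*
 * Typical caller pattern, sketched for clarity (the retry loop lives in
 * a driver's page-fault handler, not in this file; names below are
 * illustrative):
 *
 *	retry:
 *		current_seq = ACCESS_ONCE(umem->odp_data->notifiers_seq);
 *		npages = ib_umem_odp_map_dma_pages(umem, user_virt, bcnt,
 *						   access_mask, current_seq);
 *		if (npages == -EAGAIN)
 *			goto retry;	(a notifier raced with us)
 */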
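
/*
 * ib_umem_odp_unmap_dma_pages - unmap and release the pages in the range
 * [virt, bound), clamped to the umem's own bounds. Pages that were mapped
 * with write access are dirtied before release. The caller must ensure the
 * hardware no longer accesses the range; in this file the function is
 * invoked from ib_umem_odp_release.
 */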
void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt,
				 u64 bound)
{
	int idx;
	u64 addr;
	struct ib_device *dev = umem->context->device;

	virt = max_t(u64, virt, ib_umem_start(umem));
	bound = min_t(u64, bound, ib_umem_end(umem));
	/* Note that during the run of this function, the
	 * notifiers_count of the MR is > 0, preventing any racing
	 * faults from completing. We might be racing with other
	 * invalidations, so we must make sure we free each page only
	 * once. */
	mutex_lock(&umem->odp_data->umem_mutex);
	for (addr = virt; addr < bound; addr += (u64)umem->page_size) {
		idx = (addr - ib_umem_start(umem)) / PAGE_SIZE;
		if (umem->odp_data->page_list[idx]) {
			struct page *page = umem->odp_data->page_list[idx];
			dma_addr_t dma = umem->odp_data->dma_list[idx];
			dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK;

			WARN_ON(!dma_addr);

			ib_dma_unmap_page(dev, dma_addr, PAGE_SIZE,
					  DMA_BIDIRECTIONAL);
			if (dma & ODP_WRITE_ALLOWED_BIT) {
				struct page *head_page = compound_head(page);
				/*
				 * set_page_dirty prefers being called with
				 * the page lock. However, MMU notifiers are
				 * called sometimes with and sometimes without
				 * the lock. We rely on the umem_mutex instead
				 * to prevent other mmu notifiers from
				 * continuing and allowing the page mapping to
				 * be removed.
				 */
				set_page_dirty(head_page);
			}
			/* on demand pinning support */
			if (!umem->context->invalidate_range)
				put_page(page);
			umem->odp_data->page_list[idx] = NULL;
			umem->odp_data->dma_list[idx] = 0;
		}
	}
	mutex_unlock(&umem->odp_data->umem_mutex);
}
EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);