/*
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2015, 2016 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Copyright(c) 2015, 2016 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *  - Neither the name of Intel Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <asm/page.h>

#include "user_exp_rcv.h"
#include "trace.h"

struct tid_group {
	struct list_head list;
	unsigned base;
	u8 size;
	u8 used;
	u8 map;
};

struct mmu_rb_node {
	struct rb_node rbnode;
	unsigned long virt;
	unsigned long phys;
	unsigned long len;
	struct tid_group *grp;
	u32 rcventry;
	dma_addr_t dma_addr;
	bool freed;
	unsigned npages;
	struct page *pages[0];
};

enum mmu_call_types {
	MMU_INVALIDATE_PAGE = 0,
	MMU_INVALIDATE_RANGE = 1
};

static const char * const mmu_types[] = {
	"PAGE",
	"RANGE"
};

struct tid_pageset {
	u16 idx;
	u16 count;
};

#define EXP_TID_SET_EMPTY(set) (set.count == 0 && list_empty(&set.list))

#define num_user_pages(vaddr, len)					\
	(1 + (((((unsigned long)(vaddr) +				\
		 (unsigned long)(len) - 1) & PAGE_MASK) -		\
	       ((unsigned long)vaddr & PAGE_MASK)) >> PAGE_SHIFT))

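/*
 * Worked example for num_user_pages() (assuming 4 KiB pages): vaddr = 0x1800
 * and len = 0x2000 covers bytes 0x1800-0x37ff, i.e. the pages at 0x1000,
 * 0x2000, and 0x3000, so 1 + ((0x3000 - 0x1000) >> PAGE_SHIFT) = 3.
 */
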
static void unlock_exp_tids(struct hfi1_ctxtdata *, struct exp_tid_set *,
			    struct rb_root *);
static u32 find_phys_blocks(struct page **, unsigned, struct tid_pageset *);
static int set_rcvarray_entry(struct file *, unsigned long, u32,
			      struct tid_group *, struct page **, unsigned);
static inline int mmu_addr_cmp(struct mmu_rb_node *, unsigned long,
			       unsigned long);
static struct mmu_rb_node *mmu_rb_search(struct rb_root *, unsigned long);
static int mmu_rb_insert_by_addr(struct hfi1_filedata *, struct rb_root *,
				 struct mmu_rb_node *);
static int mmu_rb_insert_by_entry(struct hfi1_filedata *, struct rb_root *,
				  struct mmu_rb_node *);
static void mmu_rb_remove_by_addr(struct hfi1_filedata *, struct rb_root *,
				  struct mmu_rb_node *);
static void mmu_rb_remove_by_entry(struct hfi1_filedata *, struct rb_root *,
				   struct mmu_rb_node *);
static void mmu_notifier_mem_invalidate(struct mmu_notifier *,
					unsigned long, unsigned long,
					enum mmu_call_types);
static inline void mmu_notifier_page(struct mmu_notifier *, struct mm_struct *,
				     unsigned long);
static inline void mmu_notifier_range_start(struct mmu_notifier *,
					    struct mm_struct *,
					    unsigned long, unsigned long);
static int program_rcvarray(struct file *, unsigned long, struct tid_group *,
			    struct tid_pageset *, unsigned, u16, struct page **,
			    u32 *, unsigned *, unsigned *);
static int unprogram_rcvarray(struct file *, u32, struct tid_group **);
static void clear_tid_node(struct hfi1_filedata *, u16, struct mmu_rb_node *);
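/*
 * Encode an RcvArray entry index as a TID IDX/CTRL pair.  Entries are kept
 * in pairs: IDX identifies the pair and CTRL selects the entry within it.
 * For example, rcventry 4 encodes as IDX = 2, CTRL = 1 and rcventry 5 as
 * IDX = 2, CTRL = 2.
 */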
static inline u32 rcventry2tidinfo(u32 rcventry)
{
	u32 pair = rcventry & ~0x1;

	return EXP_TID_SET(IDX, pair >> 1) |
		EXP_TID_SET(CTRL, 1 << (rcventry - pair));
}

static inline void exp_tid_group_init(struct exp_tid_set *set)
{
	INIT_LIST_HEAD(&set->list);
	set->count = 0;
}

static inline void tid_group_remove(struct tid_group *grp,
				    struct exp_tid_set *set)
{
	list_del_init(&grp->list);
	set->count--;
}

static inline void tid_group_add_tail(struct tid_group *grp,
				      struct exp_tid_set *set)
{
	list_add_tail(&grp->list, &set->list);
	set->count++;
}

static inline struct tid_group *tid_group_pop(struct exp_tid_set *set)
{
	struct tid_group *grp =
		list_first_entry(&set->list, struct tid_group, list);
	list_del_init(&grp->list);
	set->count--;
	return grp;
}

static inline void tid_group_move(struct tid_group *group,
				  struct exp_tid_set *s1,
				  struct exp_tid_set *s2)
{
	tid_group_remove(group, s1);
	tid_group_add_tail(group, s2);
}

static struct mmu_notifier_ops mn_opts = {
	.invalidate_page = mmu_notifier_page,
	.invalidate_range_start = mmu_notifier_range_start,
};

/*
 * Initialize context and file private data needed for Expected
 * receive caching. This needs to be done after the context has
 * been configured with the eager/expected RcvEntry counts.
 */
int hfi1_user_exp_rcv_init(struct file *fp)
{
	struct hfi1_filedata *fd = fp->private_data;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	unsigned tidbase;
	int i, ret = 0;

	INIT_HLIST_NODE(&fd->mn.hlist);
	spin_lock_init(&fd->rb_lock);
	spin_lock_init(&fd->tid_lock);
	spin_lock_init(&fd->invalid_lock);
	fd->mn.ops = &mn_opts;
	fd->tid_rb_root = RB_ROOT;

	if (!uctxt->subctxt_cnt || !fd->subctxt) {
		exp_tid_group_init(&uctxt->tid_group_list);
		exp_tid_group_init(&uctxt->tid_used_list);
		exp_tid_group_init(&uctxt->tid_full_list);

		tidbase = uctxt->expected_base;
		for (i = 0; i < uctxt->expected_count /
			     dd->rcv_entries.group_size; i++) {
			struct tid_group *grp;

			grp = kzalloc(sizeof(*grp), GFP_KERNEL);
			if (!grp) {
				/*
				 * If we fail here, the groups already
				 * allocated will be freed by the close
				 * call.
				 */
				ret = -ENOMEM;
				goto done;
			}
			grp->size = dd->rcv_entries.group_size;
			grp->base = tidbase;
			tid_group_add_tail(grp, &uctxt->tid_group_list);
			tidbase += dd->rcv_entries.group_size;
		}
	}

	fd->entry_to_rb = kcalloc(uctxt->expected_count,
				  sizeof(struct rb_node *),
				  GFP_KERNEL);
	if (!fd->entry_to_rb)
		return -ENOMEM;

	if (!HFI1_CAP_IS_USET(TID_UNMAP)) {
		fd->invalid_tid_idx = 0;
		fd->invalid_tids = kzalloc(uctxt->expected_count *
					   sizeof(u32), GFP_KERNEL);
		if (!fd->invalid_tids) {
			ret = -ENOMEM;
			goto done;
		}

		/*
		 * Register MMU notifier callbacks. If the registration
		 * fails, continue but turn off the TID caching for
		 * all user contexts.
		 */
		ret = mmu_notifier_register(&fd->mn, current->mm);
		if (ret) {
			dd_dev_info(dd,
				    "Failed MMU notifier registration %d\n",
				    ret);
			HFI1_CAP_USET(TID_UNMAP);
			ret = 0;
		}
	}

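	/*
	 * With TID_UNMAP set there is no MMU notifier, so RB nodes only need
	 * to be reachable by RcvArray entry.  Otherwise, nodes are also kept
	 * in the address-ordered RB tree so invalidations can be looked up
	 * by virtual address.
	 */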
	if (HFI1_CAP_IS_USET(TID_UNMAP)) {
		fd->mmu_rb_insert = mmu_rb_insert_by_entry;
		fd->mmu_rb_remove = mmu_rb_remove_by_entry;
	} else {
		fd->mmu_rb_insert = mmu_rb_insert_by_addr;
		fd->mmu_rb_remove = mmu_rb_remove_by_addr;
	}

	/*
	 * PSM does not have a good way to separate, count, and
	 * effectively enforce a limit on RcvArray entries used by
	 * subctxts (when context sharing is used) when TID caching
	 * is enabled. To help with that, we calculate a per-process
	 * RcvArray entry share and enforce that.
	 * If TID caching is not in use, PSM deals with usage on its
	 * own. In that case, we allow any subctxt to take all of the
	 * entries.
	 *
	 * Make sure that we set the tid counts only after successful
	 * init.
	 */
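	/*
	 * Illustration with hypothetical numbers: 2048 expected entries
	 * shared by 3 subcontexts gives tid_limit values of 683, 683, and
	 * 682 for subcontexts 0, 1, and 2, respectively.
	 */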
	spin_lock(&fd->tid_lock);
	if (uctxt->subctxt_cnt && !HFI1_CAP_IS_USET(TID_UNMAP)) {
		u16 remainder;

		fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
		remainder = uctxt->expected_count % uctxt->subctxt_cnt;
		if (remainder && fd->subctxt < remainder)
			fd->tid_limit++;
	} else {
		fd->tid_limit = uctxt->expected_count;
	}
	spin_unlock(&fd->tid_lock);
done:
	return ret;
}

int hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct tid_group *grp, *gptr;

	/*
	 * The notifier would have been removed when the process's mm
	 * was freed.
	 */
	if (current->mm && !HFI1_CAP_IS_USET(TID_UNMAP))
		mmu_notifier_unregister(&fd->mn, current->mm);

	kfree(fd->invalid_tids);

	if (!uctxt->cnt) {
		if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
			unlock_exp_tids(uctxt, &uctxt->tid_full_list,
					&fd->tid_rb_root);
		if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
			unlock_exp_tids(uctxt, &uctxt->tid_used_list,
					&fd->tid_rb_root);
		list_for_each_entry_safe(grp, gptr, &uctxt->tid_group_list.list,
					 list) {
			list_del_init(&grp->list);
			kfree(grp);
		}
		spin_lock(&fd->rb_lock);
		if (!RB_EMPTY_ROOT(&fd->tid_rb_root)) {
			struct rb_node *node;
			struct mmu_rb_node *rbnode;

			while ((node = rb_first(&fd->tid_rb_root))) {
				rbnode = rb_entry(node, struct mmu_rb_node,
						  rbnode);
				rb_erase(&rbnode->rbnode, &fd->tid_rb_root);
				kfree(rbnode);
			}
		}
		spin_unlock(&fd->rb_lock);
		hfi1_clear_tids(uctxt);
	}

	kfree(fd->entry_to_rb);
	return 0;
}

/*
 * Write an "empty" RcvArray entry.
 * This function exists so the TID registration code can use it
 * to write to unused/unneeded entries and still take advantage
 * of the WC performance improvements. The HFI will ignore this
 * write to the RcvArray entry.
 */
static inline void rcv_array_wc_fill(struct hfi1_devdata *dd, u32 index)
{
	/*
	 * Doing the WC fill writes only makes sense if the device is
	 * present and the RcvArray has been mapped as WC memory.
	 */
	if ((dd->flags & HFI1_PRESENT) && dd->rcvarray_wc)
		writeq(0, dd->rcvarray_wc + (index * 8));
}

/*
 * RcvArray entry allocation for Expected Receives is done by the
 * following algorithm:
 *
 * The context keeps 3 lists of groups of RcvArray entries:
 *   1. List of empty groups - tid_group_list
 *      This list is created during user context creation and
 *      contains elements which describe sets (of 8) of empty
 *      RcvArray entries.
 *   2. List of partially used groups - tid_used_list
 *      This list contains sets of RcvArray entries which are
 *      not completely used up. Another mapping request could
 *      use some or all of the remaining entries.
 *   3. List of full groups - tid_full_list
 *      This is the list where sets that are completely used
 *      up go.
 *
 * An attempt to optimize the usage of RcvArray entries is
 * made by finding all sets of physically contiguous pages in a
 * user's buffer.
 * These physically contiguous sets are further split into
 * sizes supported by the receive engine of the HFI. The
 * resulting sets of pages are stored in struct tid_pageset,
 * which describes the sets as:
 *    * .count - number of pages in this set
 *    * .idx - starting index into struct page ** array
 *             of this set
 *
 * From this point on, the algorithm deals with the page sets
 * described above. The number of pagesets is divided by the
 * RcvArray group size to produce the number of full groups
 * needed.
 *
 * Groups from the 3 lists are manipulated using the following
 * rules:
 *   1. For each set of 8 pagesets, a complete group from
 *      tid_group_list is taken, programmed, and moved to
 *      the tid_full_list list.
 *   2. For all remaining pagesets:
 *      2.1 If the tid_used_list is empty and the tid_group_list
 *          is empty, stop processing pagesets and return only
 *          what has been programmed up to this point.
 *      2.2 If the tid_used_list is empty and the tid_group_list
 *          is not empty, move a group from tid_group_list to
 *          tid_used_list.
 *      2.3 For each group in tid_used_list, program as many of
 *          the pagesets as can fit into the group. If the group
 *          becomes fully used, move it to tid_full_list.
 */
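/*
 * For example (assuming a group size of 8): a request that produces 19 page
 * sets programs two complete groups from tid_group_list (16 entries) and
 * then places the remaining 3 page sets in a group taken from tid_used_list,
 * or in one moved there from tid_group_list if tid_used_list is empty.
 */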
int hfi1_user_exp_rcv_setup(struct file *fp, struct hfi1_tid_info *tinfo)
{
	int ret = 0, need_group = 0, pinned;
	struct hfi1_filedata *fd = fp->private_data;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	unsigned npages, ngroups, pageidx = 0, pageset_count, npagesets,
		tididx = 0, mapped, mapped_pages = 0;
	unsigned long vaddr = tinfo->vaddr;
	struct page **pages = NULL;
	u32 *tidlist = NULL;
	struct tid_pageset *pagesets = NULL;

	/* Get the number of pages the user buffer spans */
	npages = num_user_pages(vaddr, tinfo->length);
	if (!npages)
		return -EINVAL;

	if (npages > uctxt->expected_count) {
		dd_dev_err(dd, "Expected buffer too big\n");
		return -EINVAL;
	}

	/* Verify that access is OK for the user buffer */
	if (!access_ok(VERIFY_WRITE, (void __user *)vaddr,
		       npages * PAGE_SIZE)) {
		dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n",
			   (void *)vaddr, npages);
		return -EFAULT;
	}

	pagesets = kcalloc(uctxt->expected_count, sizeof(*pagesets),
			   GFP_KERNEL);
	if (!pagesets)
		return -ENOMEM;

	/* Allocate the array of struct page pointers needed for pinning */
	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
	if (!pages) {
		ret = -ENOMEM;
		goto bail;
	}

	/*
	 * Pin all the pages of the user buffer. If we can't pin all the
	 * pages, accept the amount pinned so far and program only that.
	 * User space knows how to deal with partially programmed buffers.
	 */
	pinned = hfi1_acquire_user_pages(vaddr, npages, true, pages);
	if (pinned <= 0) {
		ret = pinned;
		goto bail;
	}

	/* Find sets of physically contiguous pages */
	npagesets = find_phys_blocks(pages, pinned, pagesets);

	/*
	 * We don't need to access this under a lock since tid_used is per
	 * process and the same process cannot be in hfi1_user_exp_rcv_clear()
	 * and hfi1_user_exp_rcv_setup() at the same time.
	 */
	spin_lock(&fd->tid_lock);
	if (fd->tid_used + npagesets > fd->tid_limit)
		pageset_count = fd->tid_limit - fd->tid_used;
	else
		pageset_count = npagesets;
	spin_unlock(&fd->tid_lock);

	if (!pageset_count)
		goto bail;

	ngroups = pageset_count / dd->rcv_entries.group_size;
	tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
	if (!tidlist) {
		ret = -ENOMEM;
		goto nomem;
	}

	tididx = 0;

	/*
	 * From this point on, we are going to be using shared (between master
	 * and subcontexts) context resources. We need to take the lock.
	 */
	mutex_lock(&uctxt->exp_lock);
	/*
	 * The first step is to program the RcvArray entries which are complete
	 * groups.
	 */
	while (ngroups && uctxt->tid_group_list.count) {
		struct tid_group *grp =
			tid_group_pop(&uctxt->tid_group_list);

		ret = program_rcvarray(fp, vaddr, grp, pagesets,
				       pageidx, dd->rcv_entries.group_size,
				       pages, tidlist, &tididx, &mapped);
		/*
		 * If there was a failure to program the RcvArray
		 * entries for the entire group, reset the grp fields
		 * and add the grp back to the free group list.
		 */
		if (ret <= 0) {
			tid_group_add_tail(grp, &uctxt->tid_group_list);
			hfi1_cdbg(TID,
				  "Failed to program RcvArray group %d", ret);
			goto unlock;
		}

		tid_group_add_tail(grp, &uctxt->tid_full_list);
		ngroups--;
		pageidx += ret;
		mapped_pages += mapped;
	}

	while (pageidx < pageset_count) {
		struct tid_group *grp, *ptr;
		/*
		 * If we don't have any partially used tid groups, check
		 * if we have empty groups. If so, take one from there and
		 * put in the partially used list.
		 */
		if (!uctxt->tid_used_list.count || need_group) {
			if (!uctxt->tid_group_list.count)
				goto unlock;

			grp = tid_group_pop(&uctxt->tid_group_list);
			tid_group_add_tail(grp, &uctxt->tid_used_list);
			need_group = 0;
		}
		/*
		 * There is an optimization opportunity here - instead of
		 * fitting as many page sets as we can, check for a group
		 * later on in the list that could fit all of them.
		 */
		list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list,
					 list) {
			unsigned use = min_t(unsigned, pageset_count - pageidx,
					     grp->size - grp->used);

			ret = program_rcvarray(fp, vaddr, grp, pagesets,
					       pageidx, use, pages, tidlist,
					       &tididx, &mapped);
			if (ret < 0) {
				hfi1_cdbg(TID,
					  "Failed to program RcvArray entries %d",
					  ret);
				ret = -EFAULT;
				goto unlock;
			} else if (ret > 0) {
				if (grp->used == grp->size)
					tid_group_move(grp,
						       &uctxt->tid_used_list,
						       &uctxt->tid_full_list);
				pageidx += ret;
				mapped_pages += mapped;
				need_group = 0;
				/* Check if we are done so we break out early */
				if (pageidx >= pageset_count)
					break;
			} else if (WARN_ON(ret == 0)) {
				/*
				 * If ret is 0, we did not program any entries
				 * into this group, which can only happen if
				 * we've screwed up the accounting somewhere.
				 * Warn and try to continue.
				 */
				need_group = 1;
			}
		}
	}
unlock:
	mutex_unlock(&uctxt->exp_lock);
nomem:
	hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
		  mapped_pages, ret);
	if (tididx) {
		spin_lock(&fd->tid_lock);
		fd->tid_used += tididx;
		spin_unlock(&fd->tid_lock);
		tinfo->tidcnt = tididx;
		tinfo->length = mapped_pages * PAGE_SIZE;

		if (copy_to_user((void __user *)(unsigned long)tinfo->tidlist,
				 tidlist, sizeof(tidlist[0]) * tididx)) {
			/*
			 * On failure to copy to the user level, we need to undo
			 * everything done so far so we don't leak resources.
			 */
			tinfo->tidlist = (unsigned long)&tidlist;
			hfi1_user_exp_rcv_clear(fp, tinfo);
			tinfo->tidlist = 0;
			ret = -EFAULT;
			goto bail;
		}
	}

	/*
	 * If not everything was mapped (due to insufficient RcvArray entries,
	 * for example), unpin all unmapped pages so we can pin them next time.
	 */
	if (mapped_pages != pinned)
		hfi1_release_user_pages(&pages[mapped_pages],
					pinned - mapped_pages,
					false);
bail:
	kfree(pagesets);
	kfree(pages);
	kfree(tidlist);
	return ret > 0 ? 0 : ret;
}

int hfi1_user_exp_rcv_clear(struct file *fp, struct hfi1_tid_info *tinfo)
{
	int ret = 0;
	struct hfi1_filedata *fd = fp->private_data;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	u32 *tidinfo;
	unsigned tididx;

	tidinfo = kcalloc(tinfo->tidcnt, sizeof(*tidinfo), GFP_KERNEL);
	if (!tidinfo)
		return -ENOMEM;

	if (copy_from_user(tidinfo, (void __user *)(unsigned long)
			   tinfo->tidlist, sizeof(tidinfo[0]) *
			   tinfo->tidcnt)) {
		ret = -EFAULT;
		goto done;
	}

	mutex_lock(&uctxt->exp_lock);
	for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
		ret = unprogram_rcvarray(fp, tidinfo[tididx], NULL);
		if (ret) {
			hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
				  ret);
			break;
		}
	}
	spin_lock(&fd->tid_lock);
	fd->tid_used -= tididx;
	spin_unlock(&fd->tid_lock);
	tinfo->tidcnt = tididx;
	mutex_unlock(&uctxt->exp_lock);
done:
	kfree(tidinfo);
	return ret;
}

int hfi1_user_exp_rcv_invalid(struct file *fp, struct hfi1_tid_info *tinfo)
{
	struct hfi1_filedata *fd = fp->private_data;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	unsigned long *ev = uctxt->dd->events +
		(((uctxt->ctxt - uctxt->dd->first_user_ctxt) *
		  HFI1_MAX_SHARED_CTXTS) + fd->subctxt);
	u32 *array;
	int ret = 0;

	if (!fd->invalid_tids)
		return -EINVAL;

	/*
	 * copy_to_user() can sleep, which will leave the invalid_lock
	 * locked and cause the MMU notifier to be blocked on the lock
	 * for a long time.
	 * Copy the data to a local buffer so we can release the lock.
	 */
	array = kcalloc(uctxt->expected_count, sizeof(*array), GFP_KERNEL);
	if (!array)
		return -ENOMEM;

	spin_lock(&fd->invalid_lock);
	if (fd->invalid_tid_idx) {
		memcpy(array, fd->invalid_tids, sizeof(*array) *
		       fd->invalid_tid_idx);
		memset(fd->invalid_tids, 0, sizeof(*fd->invalid_tids) *
		       fd->invalid_tid_idx);
		tinfo->tidcnt = fd->invalid_tid_idx;
		fd->invalid_tid_idx = 0;
		/*
		 * Reset the user flag while still holding the lock.
		 * Otherwise, PSM can miss events.
		 */
		clear_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
	} else {
		tinfo->tidcnt = 0;
	}
	spin_unlock(&fd->invalid_lock);

	if (tinfo->tidcnt) {
		if (copy_to_user((void __user *)tinfo->tidlist,
				 array, sizeof(*array) * tinfo->tidcnt))
			ret = -EFAULT;
	}
	kfree(array);

	return ret;
}

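/*
 * Worked example for find_phys_blocks() below (assuming 4 KiB pages and a
 * MAX_EXPECTED_BUFFER of at least 64 KiB): a run of 13 physically contiguous
 * pages is split into sets of 8, 4, and 1 pages, since each set must be a
 * power-of-two size no larger than MAX_EXPECTED_BUFFER.
 */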
static u32 find_phys_blocks(struct page **pages, unsigned npages,
			    struct tid_pageset *list)
{
	unsigned pagecount, pageidx, setcount = 0, i;
	unsigned long pfn, this_pfn;

	if (!npages)
		return 0;

	/*
	 * Look for sets of physically contiguous pages in the user buffer.
	 * This will allow us to optimize Expected RcvArray entry usage by
	 * using the bigger supported sizes.
	 */
	pfn = page_to_pfn(pages[0]);
	for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
		this_pfn = i < npages ? page_to_pfn(pages[i]) : 0;

		/*
		 * If the PFNs are not sequential, pages are not physically
		 * contiguous.
		 */
		if (this_pfn != ++pfn) {
			/*
			 * At this point we have to loop over the set of
			 * physically contiguous pages and break them down
			 * into sizes supported by the HW.
			 * There are two main constraints:
			 * 1. The max buffer size is MAX_EXPECTED_BUFFER.
			 *    If the total set size is bigger than that
			 *    program only a MAX_EXPECTED_BUFFER chunk.
			 * 2. The buffer size has to be a power of two. If
			 *    it is not, round down to the closest power of
			 *    two and program that size.
			 */
			while (pagecount) {
				int maxpages = pagecount;
				u32 bufsize = pagecount * PAGE_SIZE;

				if (bufsize > MAX_EXPECTED_BUFFER)
					maxpages =
						MAX_EXPECTED_BUFFER >>
						PAGE_SHIFT;
				else if (!is_power_of_2(bufsize))
					maxpages =
						rounddown_pow_of_two(bufsize) >>
						PAGE_SHIFT;

				list[setcount].idx = pageidx;
				list[setcount].count = maxpages;
				pagecount -= maxpages;
				pageidx += maxpages;
				setcount++;
			}
			pageidx = i;
			pagecount = 1;
			pfn = this_pfn;
		} else {
			pagecount++;
		}
	}
	return setcount;
}

/**
 * program_rcvarray() - program an RcvArray group with receive buffers
 * @fp: file pointer
 * @vaddr: starting user virtual address
 * @grp: RcvArray group
 * @sets: array of struct tid_pageset holding information on physically
 *        contiguous chunks from the user buffer
 * @start: starting index into sets array
 * @count: number of struct tid_pageset's to program
 * @pages: an array of struct page * for the user buffer
 * @tidlist: the array of u32 elements where the information about the
 *           programmed RcvArray entries is to be encoded.
 * @tididx: starting offset into tidlist
 * @pmapped: (output parameter) number of pages programmed into the RcvArray
 *           entries.
 *
 * This function will program up to 'count' number of RcvArray entries from the
 * group 'grp'. To make best use of write-combining writes, the function will
 * perform writes to the unused RcvArray entries which will be ignored by the
 * HW. Each RcvArray entry will be programmed with a physically contiguous
 * buffer chunk from the user's virtual buffer.
 *
 * Return:
 * -EINVAL if the requested count is larger than the size of the group,
 * -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or
 * the number of RcvArray entries programmed.
 */
static int program_rcvarray(struct file *fp, unsigned long vaddr,
			    struct tid_group *grp,
			    struct tid_pageset *sets,
			    unsigned start, u16 count, struct page **pages,
			    u32 *tidlist, unsigned *tididx, unsigned *pmapped)
{
	struct hfi1_filedata *fd = fp->private_data;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	u16 idx;
	u32 tidinfo = 0, rcventry, useidx = 0;
	int mapped = 0;

	/* Count should never be larger than the group size */
	if (count > grp->size)
		return -EINVAL;

	/* Find the first unused entry in the group */
	for (idx = 0; idx < grp->size; idx++) {
		if (!(grp->map & (1 << idx))) {
			useidx = idx;
			break;
		}
		rcv_array_wc_fill(dd, grp->base + idx);
	}

	idx = 0;
	while (idx < count) {
		u16 npages, pageidx, setidx = start + idx;
		int ret = 0;

		/*
		 * If this entry in the group is used, move to the next one.
		 * If we go past the end of the group, exit the loop.
		 */
		if (useidx >= grp->size) {
			break;
		} else if (grp->map & (1 << useidx)) {
			rcv_array_wc_fill(dd, grp->base + useidx);
			useidx++;
			continue;
		}

		rcventry = grp->base + useidx;
		npages = sets[setidx].count;
		pageidx = sets[setidx].idx;

		ret = set_rcvarray_entry(fp, vaddr + (pageidx * PAGE_SIZE),
					 rcventry, grp, pages + pageidx,
					 npages);
		if (ret)
			return ret;
		mapped += npages;

		tidinfo = rcventry2tidinfo(rcventry - uctxt->expected_base) |
			EXP_TID_SET(LEN, npages);
		tidlist[(*tididx)++] = tidinfo;
		grp->used++;
		grp->map |= 1 << useidx++;
		idx++;
	}

	/* Fill the rest of the group with "blank" writes */
	for (; useidx < grp->size; useidx++)
		rcv_array_wc_fill(dd, grp->base + useidx);
	*pmapped = mapped;
	return idx;
}

static int set_rcvarray_entry(struct file *fp, unsigned long vaddr,
			      u32 rcventry, struct tid_group *grp,
			      struct page **pages, unsigned npages)
{
	int ret;
	struct hfi1_filedata *fd = fp->private_data;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct mmu_rb_node *node;
	struct hfi1_devdata *dd = uctxt->dd;
	struct rb_root *root = &fd->tid_rb_root;
	dma_addr_t phys;

	/*
	 * Allocate the node first so we can handle a potential
	 * failure before we've programmed anything.
	 */
	node = kzalloc(sizeof(*node) + (sizeof(struct page *) * npages),
		       GFP_KERNEL);
	if (!node)
		return -ENOMEM;

	phys = pci_map_single(dd->pcidev,
			      __va(page_to_phys(pages[0])),
			      npages * PAGE_SIZE, PCI_DMA_FROMDEVICE);
	if (dma_mapping_error(&dd->pcidev->dev, phys)) {
		dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n",
			   phys);
		kfree(node);
		return -EFAULT;
	}

	node->virt = vaddr;
	node->phys = page_to_phys(pages[0]);
	node->len = npages * PAGE_SIZE;
	node->npages = npages;
	node->rcventry = rcventry;
	node->dma_addr = phys;
	node->grp = grp;
	node->freed = false;
	memcpy(node->pages, pages, sizeof(struct page *) * npages);

	spin_lock(&fd->rb_lock);
	ret = fd->mmu_rb_insert(fd, root, node);
	spin_unlock(&fd->rb_lock);

	if (ret) {
		hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
			  node->rcventry, node->virt, node->phys, ret);
		pci_unmap_single(dd->pcidev, phys, npages * PAGE_SIZE,
				 PCI_DMA_FROMDEVICE);
		kfree(node);
		return -EFAULT;
	}
	hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
	trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry,
			       npages, node->virt, node->phys, phys);
	return 0;
}

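/*
 * The tidinfo passed to unprogram_rcvarray() below uses the encoding
 * produced by rcventry2tidinfo().  For example, IDX = 2 with CTRL = 2
 * decodes to tididx = 4 and an rcventry offset of 5 from expected_base.
 */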
static int unprogram_rcvarray(struct file *fp, u32 tidinfo,
			      struct tid_group **grp)
{
	struct hfi1_filedata *fd = fp->private_data;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	struct mmu_rb_node *node;
	u8 tidctrl = EXP_TID_GET(tidinfo, CTRL);
	u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry;

	if (tididx >= uctxt->expected_count) {
		dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
			   tididx, uctxt->ctxt);
		return -EINVAL;
	}

	if (tidctrl == 0x3)
		return -EINVAL;

	rcventry = tididx + (tidctrl - 1);

	spin_lock(&fd->rb_lock);
	node = fd->entry_to_rb[rcventry];
	if (!node || node->rcventry != (uctxt->expected_base + rcventry)) {
		spin_unlock(&fd->rb_lock);
		return -EBADF;
	}
	fd->mmu_rb_remove(fd, &fd->tid_rb_root, node);
	spin_unlock(&fd->rb_lock);
	if (grp)
		*grp = node->grp;
	clear_tid_node(fd, fd->subctxt, node);
	return 0;
}

static void clear_tid_node(struct hfi1_filedata *fd, u16 subctxt,
			   struct mmu_rb_node *node)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;

	trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
				 node->npages, node->virt, node->phys,
				 node->dma_addr);

	hfi1_put_tid(dd, node->rcventry, PT_INVALID, 0, 0);
	/*
	 * Make sure device has seen the write before we unpin the
	 * pages.
	 */
	flush_wc();

	pci_unmap_single(dd->pcidev, node->dma_addr, node->len,
			 PCI_DMA_FROMDEVICE);
	hfi1_release_user_pages(node->pages, node->npages, true);

	node->grp->used--;
	node->grp->map &= ~(1 << (node->rcventry - node->grp->base));

	if (node->grp->used == node->grp->size - 1)
		tid_group_move(node->grp, &uctxt->tid_full_list,
			       &uctxt->tid_used_list);
	else if (!node->grp->used)
		tid_group_move(node->grp, &uctxt->tid_used_list,
			       &uctxt->tid_group_list);
	kfree(node);
}

static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
			    struct exp_tid_set *set, struct rb_root *root)
{
	struct tid_group *grp, *ptr;
	struct hfi1_filedata *fd = container_of(root, struct hfi1_filedata,
						tid_rb_root);
	int i;

	list_for_each_entry_safe(grp, ptr, &set->list, list) {
		list_del_init(&grp->list);

		spin_lock(&fd->rb_lock);
		for (i = 0; i < grp->size; i++) {
			if (grp->map & (1 << i)) {
				u16 rcventry = grp->base + i;
				struct mmu_rb_node *node;

				node = fd->entry_to_rb[rcventry -
						       uctxt->expected_base];
				if (!node || node->rcventry != rcventry)
					continue;
				fd->mmu_rb_remove(fd, root, node);
				clear_tid_node(fd, -1, node);
			}
		}
		spin_unlock(&fd->rb_lock);
	}
}

static inline void mmu_notifier_page(struct mmu_notifier *mn,
				     struct mm_struct *mm, unsigned long addr)
{
	mmu_notifier_mem_invalidate(mn, addr, addr + PAGE_SIZE,
				    MMU_INVALIDATE_PAGE);
}

static inline void mmu_notifier_range_start(struct mmu_notifier *mn,
					    struct mm_struct *mm,
					    unsigned long start,
					    unsigned long end)
{
	mmu_notifier_mem_invalidate(mn, start, end, MMU_INVALIDATE_RANGE);
}

static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn,
					unsigned long start, unsigned long end,
					enum mmu_call_types type)
{
	struct hfi1_filedata *fd = container_of(mn, struct hfi1_filedata, mn);
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct rb_root *root = &fd->tid_rb_root;
	struct mmu_rb_node *node;
	unsigned long addr = start;

	trace_hfi1_mmu_invalidate(uctxt->ctxt, fd->subctxt, mmu_types[type],
				  start, end);

	spin_lock(&fd->rb_lock);
	while (addr < end) {
		node = mmu_rb_search(root, addr);

		if (!node) {
			/*
			 * Didn't find a node at this address. However, the
			 * range could be bigger than what we have registered
			 * so we have to keep looking.
			 */
			addr += PAGE_SIZE;
			continue;
		}

		/*
		 * The next address to be looked up is computed based
		 * on the node's starting address. This is due to the
		 * fact that the range where we start might be in the
		 * middle of the node's buffer so simply incrementing
		 * the address by the node's size would result in a
		 * bad address.
		 */
		addr = node->virt + (node->npages * PAGE_SIZE);
		if (node->freed)
			continue;

		trace_hfi1_exp_tid_inval(uctxt->ctxt, fd->subctxt, node->virt,
					 node->rcventry, node->npages,
					 node->dma_addr);
		node->freed = true;

		spin_lock(&fd->invalid_lock);
		if (fd->invalid_tid_idx < uctxt->expected_count) {
			fd->invalid_tids[fd->invalid_tid_idx] =
				rcventry2tidinfo(node->rcventry -
						 uctxt->expected_base);
			fd->invalid_tids[fd->invalid_tid_idx] |=
				EXP_TID_SET(LEN, node->npages);
			if (!fd->invalid_tid_idx) {
				unsigned long *ev;

				/*
				 * hfi1_set_uevent_bits() sets a user event flag
				 * for all processes. Because calling into the
				 * driver to process TID cache invalidations is
				 * expensive and TID cache invalidations are
				 * handled on a per-process basis, we can
				 * optimize this to set the flag only for the
				 * process in question.
				 */
				ev = uctxt->dd->events +
					(((uctxt->ctxt -
					   uctxt->dd->first_user_ctxt) *
					  HFI1_MAX_SHARED_CTXTS) + fd->subctxt);
				set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
			}
			fd->invalid_tid_idx++;
		}
		spin_unlock(&fd->invalid_lock);
	}
	spin_unlock(&fd->rb_lock);
}

static inline int mmu_addr_cmp(struct mmu_rb_node *node, unsigned long addr,
			       unsigned long len)
{
	if ((addr + len) <= node->virt)
		return -1;
	else if (addr >= node->virt && addr < (node->virt + node->len))
		return 0;
	else
		return 1;
}

static inline int mmu_entry_cmp(struct mmu_rb_node *node, u32 entry)
{
	if (entry < node->rcventry)
		return -1;
	else if (entry > node->rcventry)
		return 1;
	else
		return 0;
}

static struct mmu_rb_node *mmu_rb_search(struct rb_root *root,
					 unsigned long addr)
{
	struct rb_node *node = root->rb_node;

	while (node) {
		struct mmu_rb_node *mnode =
			container_of(node, struct mmu_rb_node, rbnode);
		/*
		 * When searching, use at least one page length for size. The
		 * MMU notifier will not give us anything less than that. We
		 * also don't need anything more than a page because we are
		 * guaranteed to have non-overlapping buffers in the tree.
		 */
		int result = mmu_addr_cmp(mnode, addr, PAGE_SIZE);

		if (result < 0)
			node = node->rb_left;
		else if (result > 0)
			node = node->rb_right;
		else
			return mnode;
	}
	return NULL;
}

static int mmu_rb_insert_by_entry(struct hfi1_filedata *fdata,
				  struct rb_root *root,
				  struct mmu_rb_node *node)
{
	u32 base = fdata->uctxt->expected_base;

	fdata->entry_to_rb[node->rcventry - base] = node;
	return 0;
}

static int mmu_rb_insert_by_addr(struct hfi1_filedata *fdata,
				 struct rb_root *root, struct mmu_rb_node *node)
{
	struct rb_node **new = &root->rb_node, *parent = NULL;
	u32 base = fdata->uctxt->expected_base;

	/* Figure out where to put new node */
	while (*new) {
		struct mmu_rb_node *this =
			container_of(*new, struct mmu_rb_node, rbnode);
		int result = mmu_addr_cmp(this, node->virt, node->len);

		parent = *new;
		if (result < 0)
			new = &((*new)->rb_left);
		else if (result > 0)
			new = &((*new)->rb_right);
		else
			return 1;
	}

	/* Add new node and rebalance tree. */
	rb_link_node(&node->rbnode, parent, new);
	rb_insert_color(&node->rbnode, root);

	fdata->entry_to_rb[node->rcventry - base] = node;
	return 0;
}

static void mmu_rb_remove_by_entry(struct hfi1_filedata *fdata,
				   struct rb_root *root,
				   struct mmu_rb_node *node)
{
	u32 base = fdata->uctxt->expected_base;

	fdata->entry_to_rb[node->rcventry - base] = NULL;
}

static void mmu_rb_remove_by_addr(struct hfi1_filedata *fdata,
				  struct rb_root *root,
				  struct mmu_rb_node *node)
{
	u32 base = fdata->uctxt->expected_base;

	fdata->entry_to_rb[node->rcventry - base] = NULL;
	rb_erase(&node->rbnode, root);
}