/*
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2015 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Copyright(c) 2015 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 * - Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in
 *   the documentation and/or other materials provided with the
 *   distribution.
 * - Neither the name of Intel Corporation nor the names of its
 *   contributors may be used to endorse or promote products derived
 *   from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <asm/page.h>

#include "user_exp_rcv.h"
#include "trace.h"

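/*
 * A tid_group tracks one group of Expected RcvArray entries: @base is the
 * first RcvArray index of the group, @size the number of entries in it,
 * @used how many of them are currently programmed, and @map a bitmask of
 * the in-use entries.
 */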
struct tid_group {
	struct list_head list;
	unsigned base;
	u8 size;
	u8 used;
	u8 map;
};

struct mmu_rb_node {
	struct rb_node rbnode;
	unsigned long virt;
	unsigned long phys;
	unsigned long len;
	struct tid_group *grp;
	u32 rcventry;
	dma_addr_t dma_addr;
	bool freed;
	unsigned npages;
	struct page *pages[0];
};

enum mmu_call_types {
	MMU_INVALIDATE_PAGE = 0,
	MMU_INVALIDATE_RANGE = 1
};

static const char * const mmu_types[] = {
	"PAGE",
	"RANGE"
};

struct tid_pageset {
	u16 idx;
	u16 count;
};

#define EXP_TID_SET_EMPTY(set) (set.count == 0 && list_empty(&set.list))

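/*
 * num_user_pages() counts how many pages a user buffer of @len bytes
 * starting at @vaddr touches. Illustrative example (hypothetical values,
 * assuming 4 KiB pages): vaddr = 0x1000ff0 and len = 0x2000 span the
 * pages at 0x1000000, 0x1001000 and 0x1002000, so the macro evaluates
 * to 3.
 */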
#define num_user_pages(vaddr, len) \
	(1 + (((((unsigned long)(vaddr) + \
		 (unsigned long)(len) - 1) & PAGE_MASK) - \
	       ((unsigned long)vaddr & PAGE_MASK)) >> PAGE_SHIFT))

static void unlock_exp_tids(struct hfi1_ctxtdata *, struct exp_tid_set *,
			    struct rb_root *);
static u32 find_phys_blocks(struct page **, unsigned,
			    struct tid_pageset *) __maybe_unused;
static int set_rcvarray_entry(struct file *, unsigned long, u32,
			      struct tid_group *, struct page **, unsigned);
static inline int mmu_addr_cmp(struct mmu_rb_node *, unsigned long,
			       unsigned long);
static struct mmu_rb_node *mmu_rb_search_by_addr(struct rb_root *,
						 unsigned long);
static inline struct mmu_rb_node *mmu_rb_search_by_entry(struct rb_root *,
							  u32);
static int mmu_rb_insert_by_addr(struct rb_root *, struct mmu_rb_node *);
static int mmu_rb_insert_by_entry(struct rb_root *, struct mmu_rb_node *);
static void mmu_notifier_mem_invalidate(struct mmu_notifier *,
					unsigned long, unsigned long,
					enum mmu_call_types);
static inline void mmu_notifier_page(struct mmu_notifier *, struct mm_struct *,
				     unsigned long);
static inline void mmu_notifier_range_start(struct mmu_notifier *,
					    struct mm_struct *,
					    unsigned long, unsigned long);
static int program_rcvarray(struct file *, unsigned long, struct tid_group *,
			    struct tid_pageset *, unsigned, u16, struct page **,
			    u32 *, unsigned *, unsigned *) __maybe_unused;
static int unprogram_rcvarray(struct file *, u32,
			      struct tid_group **) __maybe_unused;
static void clear_tid_node(struct hfi1_filedata *, u16,
			   struct mmu_rb_node *) __maybe_unused;

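/*
 * RcvArray entries are handled in even/odd pairs. rcventry2tidinfo()
 * encodes an entry as IDX = pair index and CTRL = which half of the pair:
 * for example, rcventry 4 becomes IDX = 2, CTRL = 1 and rcventry 5
 * becomes IDX = 2, CTRL = 2, so CTRL is always 1 or 2 for a single entry.
 */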
static inline u32 rcventry2tidinfo(u32 rcventry)
{
	u32 pair = rcventry & ~0x1;

	return EXP_TID_SET(IDX, pair >> 1) |
		EXP_TID_SET(CTRL, 1 << (rcventry - pair));
}

static inline void exp_tid_group_init(struct exp_tid_set *set)
{
	INIT_LIST_HEAD(&set->list);
	set->count = 0;
}

static inline void tid_group_remove(struct tid_group *grp,
				    struct exp_tid_set *set)
{
	list_del_init(&grp->list);
	set->count--;
}

static inline void tid_group_add_tail(struct tid_group *grp,
				      struct exp_tid_set *set)
{
	list_add_tail(&grp->list, &set->list);
	set->count++;
}

static inline struct tid_group *tid_group_pop(struct exp_tid_set *set)
{
	struct tid_group *grp =
		list_first_entry(&set->list, struct tid_group, list);
	list_del_init(&grp->list);
	set->count--;
	return grp;
}

static inline void tid_group_move(struct tid_group *group,
				  struct exp_tid_set *s1,
				  struct exp_tid_set *s2)
{
	tid_group_remove(group, s1);
	tid_group_add_tail(group, s2);
}
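
/*
 * The helpers above maintain the three per-context group lists: groups
 * sit on uctxt->tid_group_list while all of their entries are free, on
 * uctxt->tid_used_list while partially used, and on uctxt->tid_full_list
 * once every entry is in use; clear_tid_node() below handles the moves
 * back toward the free list.
 */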

static struct mmu_notifier_ops mn_opts = {
	.invalidate_page = mmu_notifier_page,
	.invalidate_range_start = mmu_notifier_range_start,
};

/*
 * Initialize context and file private data needed for Expected
 * receive caching. This needs to be done after the context has
 * been configured with the eager/expected RcvEntry counts.
 */
int hfi1_user_exp_rcv_init(struct file *fp)
{
	struct hfi1_filedata *fd = fp->private_data;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	unsigned tidbase;
	int i, ret = 0;

	INIT_HLIST_NODE(&fd->mn.hlist);
	spin_lock_init(&fd->rb_lock);
	spin_lock_init(&fd->tid_lock);
	spin_lock_init(&fd->invalid_lock);
	fd->mn.ops = &mn_opts;
	fd->tid_rb_root = RB_ROOT;

	if (!uctxt->subctxt_cnt || !fd->subctxt) {
		exp_tid_group_init(&uctxt->tid_group_list);
		exp_tid_group_init(&uctxt->tid_used_list);
		exp_tid_group_init(&uctxt->tid_full_list);

		tidbase = uctxt->expected_base;
		for (i = 0; i < uctxt->expected_count /
			     dd->rcv_entries.group_size; i++) {
			struct tid_group *grp;

			grp = kzalloc(sizeof(*grp), GFP_KERNEL);
			if (!grp) {
				/*
				 * If we fail here, the groups already
				 * allocated will be freed by the close
				 * call.
				 */
				ret = -ENOMEM;
				goto done;
			}
			grp->size = dd->rcv_entries.group_size;
			grp->base = tidbase;
			tid_group_add_tail(grp, &uctxt->tid_group_list);
			tidbase += dd->rcv_entries.group_size;
		}
	}

	if (!HFI1_CAP_IS_USET(TID_UNMAP)) {
		fd->invalid_tid_idx = 0;
		fd->invalid_tids = kzalloc(uctxt->expected_count *
					   sizeof(u32), GFP_KERNEL);
		if (!fd->invalid_tids) {
			ret = -ENOMEM;
			goto done;
		} else {
			/*
			 * Register MMU notifier callbacks. If the registration
			 * fails, continue but turn off the TID caching for
			 * all user contexts.
			 */
			ret = mmu_notifier_register(&fd->mn, current->mm);
			if (ret) {
				dd_dev_info(dd,
					    "Failed MMU notifier registration %d\n",
					    ret);
				HFI1_CAP_USET(TID_UNMAP);
				ret = 0;
			}
		}
	}

	if (HFI1_CAP_IS_USET(TID_UNMAP))
		fd->mmu_rb_insert = mmu_rb_insert_by_entry;
	else
		fd->mmu_rb_insert = mmu_rb_insert_by_addr;

	/*
	 * PSM does not have a good way to separate, count, and
	 * effectively enforce a limit on RcvArray entries used by
	 * subctxts (when context sharing is used) when TID caching
	 * is enabled. To help with that, we calculate a per-process
	 * RcvArray entry share and enforce that.
	 * If TID caching is not in use, PSM deals with usage on its
	 * own. In that case, we allow any subctxt to take all of the
	 * entries.
	 *
	 * Make sure that we set the tid counts only after successful
	 * init.
	 */
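	/*
	 * For example (illustrative numbers): with expected_count = 2048
	 * and subctxt_cnt = 3, subctxts 0 and 1 each get a tid_limit of
	 * 683 entries and subctxt 2 gets the remaining 682.
	 */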
	if (uctxt->subctxt_cnt && !HFI1_CAP_IS_USET(TID_UNMAP)) {
		u16 remainder;

		fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
		remainder = uctxt->expected_count % uctxt->subctxt_cnt;
		if (remainder && fd->subctxt < remainder)
			fd->tid_limit++;
	} else {
		fd->tid_limit = uctxt->expected_count;
	}
done:
	return ret;
}

int hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct tid_group *grp, *gptr;

	/*
	 * The notifier would have been removed when the process's mm
	 * was freed.
	 */
	if (current->mm && !HFI1_CAP_IS_USET(TID_UNMAP))
		mmu_notifier_unregister(&fd->mn, current->mm);

	kfree(fd->invalid_tids);

	if (!uctxt->cnt) {
		if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
			unlock_exp_tids(uctxt, &uctxt->tid_full_list,
					&fd->tid_rb_root);
		if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
			unlock_exp_tids(uctxt, &uctxt->tid_used_list,
					&fd->tid_rb_root);
		list_for_each_entry_safe(grp, gptr, &uctxt->tid_group_list.list,
					 list) {
			list_del_init(&grp->list);
			kfree(grp);
		}
		spin_lock(&fd->rb_lock);
		if (!RB_EMPTY_ROOT(&fd->tid_rb_root)) {
			struct rb_node *node;
			struct mmu_rb_node *rbnode;

			while ((node = rb_first(&fd->tid_rb_root))) {
				rbnode = rb_entry(node, struct mmu_rb_node,
						  rbnode);
				rb_erase(&rbnode->rbnode, &fd->tid_rb_root);
				kfree(rbnode);
			}
		}
		spin_unlock(&fd->rb_lock);
		hfi1_clear_tids(uctxt);
	}
	return 0;
}

/*
 * Write an "empty" RcvArray entry.
 * This function exists so the TID registration code can use it
 * to write to unused/unneeded entries and still take advantage
 * of the WC performance improvements. The HFI will ignore this
 * write to the RcvArray entry.
 */
static inline void rcv_array_wc_fill(struct hfi1_devdata *dd, u32 index)
{
	/*
	 * Doing the WC fill writes only makes sense if the device is
	 * present and the RcvArray has been mapped as WC memory.
	 */
	if ((dd->flags & HFI1_PRESENT) && dd->rcvarray_wc)
		writeq(0, dd->rcvarray_wc + (index * 8));
}

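/*
 * The three entry points below are placeholders and currently fail with
 * -EINVAL; the RcvArray programming helpers further down (marked
 * __maybe_unused) are not wired up to them yet. The expected flow, once
 * they are, is roughly: split the user buffer's pinned pages into
 * physically contiguous chunks with find_phys_blocks(), then program
 * those chunks into RcvArray entries group by group with
 * program_rcvarray().
 */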
int hfi1_user_exp_rcv_setup(struct file *fp, struct hfi1_tid_info *tinfo)
{
	return -EINVAL;
}

int hfi1_user_exp_rcv_clear(struct file *fp, struct hfi1_tid_info *tinfo)
{
	return -EINVAL;
}

int hfi1_user_exp_rcv_invalid(struct file *fp, struct hfi1_tid_info *tinfo)
{
	return -EINVAL;
}

static u32 find_phys_blocks(struct page **pages, unsigned npages,
			    struct tid_pageset *list)
{
	unsigned pagecount, pageidx, setcount = 0, i;
	unsigned long pfn, this_pfn;

	if (!npages)
		return 0;

	/*
	 * Look for sets of physically contiguous pages in the user buffer.
	 * This will allow us to optimize Expected RcvArray entry usage by
	 * using the bigger supported sizes.
	 */
	pfn = page_to_pfn(pages[0]);
	for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
		this_pfn = i < npages ? page_to_pfn(pages[i]) : 0;

		/*
		 * If the PFNs are not sequential, the pages are not
		 * physically contiguous.
		 */
		if (this_pfn != ++pfn) {
			/*
			 * At this point we have to loop over the set of
			 * physically contiguous pages and break them down
			 * into sizes supported by the HW.
			 * There are two main constraints:
			 *   1. The max buffer size is MAX_EXPECTED_BUFFER.
			 *      If the total set size is bigger than that,
			 *      program only a MAX_EXPECTED_BUFFER chunk.
			 *   2. The buffer size has to be a power of two. If
			 *      it is not, round down to the closest power of
			 *      2 and program that size.
			 */
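			/*
			 * For example (assuming 4 KiB pages and a large
			 * enough MAX_EXPECTED_BUFFER): a run of 7
			 * contiguous pages (28 KiB) gets split into sets
			 * of 4 pages (16 KiB), 2 pages (8 KiB), and
			 * 1 page (4 KiB).
			 */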
			while (pagecount) {
				int maxpages = pagecount;
				u32 bufsize = pagecount * PAGE_SIZE;

				if (bufsize > MAX_EXPECTED_BUFFER)
					maxpages =
						MAX_EXPECTED_BUFFER >>
						PAGE_SHIFT;
				else if (!is_power_of_2(bufsize))
					maxpages =
						rounddown_pow_of_two(bufsize) >>
						PAGE_SHIFT;

				list[setcount].idx = pageidx;
				list[setcount].count = maxpages;
				pagecount -= maxpages;
				pageidx += maxpages;
				setcount++;
			}
			pageidx = i;
			pagecount = 1;
			pfn = this_pfn;
		} else {
			pagecount++;
		}
	}
	return setcount;
}

/**
 * program_rcvarray() - program an RcvArray group with receive buffers
 * @fp: file pointer
 * @vaddr: starting user virtual address
 * @grp: RcvArray group
 * @sets: array of struct tid_pageset holding information on physically
 *        contiguous chunks from the user buffer
 * @start: starting index into sets array
 * @count: number of struct tid_pageset's to program
 * @pages: an array of struct page * for the user buffer
 * @tidlist: the array of u32 elements where the information about the
 *           programmed RcvArray entries is to be encoded.
 * @tididx: starting offset into tidlist
 * @pmapped: (output parameter) number of pages programmed into the RcvArray
 *           entries.
 *
 * This function will program up to 'count' RcvArray entries from the
 * group 'grp'. To make best use of write-combining writes, the function will
 * perform writes to the unused RcvArray entries, which will be ignored by the
 * HW. Each RcvArray entry will be programmed with a physically contiguous
 * buffer chunk from the user's virtual buffer.
 *
 * Return:
 * -EINVAL if the requested count is larger than the size of the group,
 * -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or
 * number of RcvArray entries programmed.
 */
static int program_rcvarray(struct file *fp, unsigned long vaddr,
			    struct tid_group *grp,
			    struct tid_pageset *sets,
			    unsigned start, u16 count, struct page **pages,
			    u32 *tidlist, unsigned *tididx, unsigned *pmapped)
{
	struct hfi1_filedata *fd = fp->private_data;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	u16 idx;
	u32 tidinfo = 0, rcventry, useidx = 0;
	int mapped = 0;

	/* Count should never be larger than the group size */
	if (count > grp->size)
		return -EINVAL;

	/* Find the first unused entry in the group */
	for (idx = 0; idx < grp->size; idx++) {
		if (!(grp->map & (1 << idx))) {
			useidx = idx;
			break;
		}
		rcv_array_wc_fill(dd, grp->base + idx);
	}

	idx = 0;
	while (idx < count) {
		u16 npages, pageidx, setidx = start + idx;
		int ret = 0;

		/*
		 * If this entry in the group is used, move to the next one.
		 * If we go past the end of the group, exit the loop.
		 */
		if (useidx >= grp->size) {
			break;
		} else if (grp->map & (1 << useidx)) {
			rcv_array_wc_fill(dd, grp->base + useidx);
			useidx++;
			continue;
		}

		rcventry = grp->base + useidx;
		npages = sets[setidx].count;
		pageidx = sets[setidx].idx;

		ret = set_rcvarray_entry(fp, vaddr + (pageidx * PAGE_SIZE),
					 rcventry, grp, pages + pageidx,
					 npages);
		if (ret)
			return ret;
		mapped += npages;

		tidinfo = rcventry2tidinfo(rcventry - uctxt->expected_base) |
			EXP_TID_SET(LEN, npages);
		tidlist[(*tididx)++] = tidinfo;
		grp->used++;
		grp->map |= 1 << useidx++;
		idx++;
	}

	/* Fill the rest of the group with "blank" writes */
	for (; useidx < grp->size; useidx++)
		rcv_array_wc_fill(dd, grp->base + useidx);
	*pmapped = mapped;
	return idx;
}

static int set_rcvarray_entry(struct file *fp, unsigned long vaddr,
			      u32 rcventry, struct tid_group *grp,
			      struct page **pages, unsigned npages)
{
	int ret;
	struct hfi1_filedata *fd = fp->private_data;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct mmu_rb_node *node;
	struct hfi1_devdata *dd = uctxt->dd;
	struct rb_root *root = &fd->tid_rb_root;
	dma_addr_t phys;

	/*
	 * Allocate the node first so we can handle a potential
	 * failure before we've programmed anything.
	 */
	node = kzalloc(sizeof(*node) + (sizeof(struct page *) * npages),
		       GFP_KERNEL);
	if (!node)
		return -ENOMEM;

	phys = pci_map_single(dd->pcidev,
			      __va(page_to_phys(pages[0])),
			      npages * PAGE_SIZE, PCI_DMA_FROMDEVICE);
	if (dma_mapping_error(&dd->pcidev->dev, phys)) {
		dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n",
			   phys);
		kfree(node);
		return -EFAULT;
	}

	node->virt = vaddr;
	node->phys = page_to_phys(pages[0]);
	node->len = npages * PAGE_SIZE;
	node->npages = npages;
	node->rcventry = rcventry;
	node->dma_addr = phys;
	node->grp = grp;
	node->freed = false;
	memcpy(node->pages, pages, sizeof(struct page *) * npages);

	spin_lock(&fd->rb_lock);
	ret = fd->mmu_rb_insert(root, node);
	spin_unlock(&fd->rb_lock);

	if (ret) {
		hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
			  node->rcventry, node->virt, node->phys, ret);
		pci_unmap_single(dd->pcidev, phys, npages * PAGE_SIZE,
				 PCI_DMA_FROMDEVICE);
		kfree(node);
		return -EFAULT;
	}
	hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
	return 0;
}

static int unprogram_rcvarray(struct file *fp, u32 tidinfo,
			      struct tid_group **grp)
{
	struct hfi1_filedata *fd = fp->private_data;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	struct mmu_rb_node *node;
	u8 tidctrl = EXP_TID_GET(tidinfo, CTRL);
	u32 tidbase = uctxt->expected_base,
		tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry;

	if (tididx >= uctxt->expected_count) {
		dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
			   tididx, uctxt->ctxt);
		return -EINVAL;
	}

	if (tidctrl == 0x3)
		return -EINVAL;

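	/*
	 * tidctrl is 1 for the even entry of the pair and 2 for the odd
	 * one (see rcventry2tidinfo()), so the entry index within the
	 * pair is simply tidctrl - 1.
	 */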
	rcventry = tidbase + tididx + (tidctrl - 1);

	spin_lock(&fd->rb_lock);
	node = mmu_rb_search_by_entry(&fd->tid_rb_root, rcventry);
	if (!node) {
		spin_unlock(&fd->rb_lock);
		return -EBADF;
	}
	rb_erase(&node->rbnode, &fd->tid_rb_root);
	spin_unlock(&fd->rb_lock);
	if (grp)
		*grp = node->grp;
	clear_tid_node(fd, fd->subctxt, node);
	return 0;
}

static void clear_tid_node(struct hfi1_filedata *fd, u16 subctxt,
			   struct mmu_rb_node *node)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;

	hfi1_put_tid(dd, node->rcventry, PT_INVALID, 0, 0);
	/*
	 * Make sure device has seen the write before we unpin the
	 * pages.
	 */
	flush_wc();

	pci_unmap_single(dd->pcidev, node->dma_addr, node->len,
			 PCI_DMA_FROMDEVICE);
	hfi1_release_user_pages(node->pages, node->npages, true);

	node->grp->used--;
	node->grp->map &= ~(1 << (node->rcventry - node->grp->base));

	if (node->grp->used == node->grp->size - 1)
		tid_group_move(node->grp, &uctxt->tid_full_list,
			       &uctxt->tid_used_list);
	else if (!node->grp->used)
		tid_group_move(node->grp, &uctxt->tid_used_list,
			       &uctxt->tid_group_list);
	kfree(node);
}

static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
			    struct exp_tid_set *set, struct rb_root *root)
{
	struct tid_group *grp, *ptr;
	struct hfi1_filedata *fd = container_of(root, struct hfi1_filedata,
						tid_rb_root);
	int i;

	list_for_each_entry_safe(grp, ptr, &set->list, list) {
		list_del_init(&grp->list);

		spin_lock(&fd->rb_lock);
		for (i = 0; i < grp->size; i++) {
			if (grp->map & (1 << i)) {
				u16 rcventry = grp->base + i;
				struct mmu_rb_node *node;

				node = mmu_rb_search_by_entry(root, rcventry);
				if (!node)
					continue;
				rb_erase(&node->rbnode, root);
				clear_tid_node(fd, -1, node);
			}
		}
		spin_unlock(&fd->rb_lock);
	}
}

static inline void mmu_notifier_page(struct mmu_notifier *mn,
				     struct mm_struct *mm, unsigned long addr)
{
	mmu_notifier_mem_invalidate(mn, addr, addr + PAGE_SIZE,
				    MMU_INVALIDATE_PAGE);
}

static inline void mmu_notifier_range_start(struct mmu_notifier *mn,
					    struct mm_struct *mm,
					    unsigned long start,
					    unsigned long end)
{
	mmu_notifier_mem_invalidate(mn, start, end, MMU_INVALIDATE_RANGE);
}

static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn,
					unsigned long start, unsigned long end,
					enum mmu_call_types type)
{
	struct hfi1_filedata *fd = container_of(mn, struct hfi1_filedata, mn);
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct rb_root *root = &fd->tid_rb_root;
	struct mmu_rb_node *node;
	unsigned long addr = start;

	spin_lock(&fd->rb_lock);
	while (addr < end) {
		node = mmu_rb_search_by_addr(root, addr);

		if (!node) {
			/*
			 * Didn't find a node at this address. However, the
			 * range could be bigger than what we have registered
			 * so we have to keep looking.
			 */
			addr += PAGE_SIZE;
			continue;
		}

		/*
		 * The next address to be looked up is computed based
		 * on the node's starting address. This is due to the
		 * fact that the range where we start might be in the
		 * middle of the node's buffer, so simply incrementing
		 * the address by the node's size would result in a
		 * bad address.
		 */
		addr = node->virt + (node->npages * PAGE_SIZE);
		if (node->freed)
			continue;

		node->freed = true;

		spin_lock(&fd->invalid_lock);
		if (fd->invalid_tid_idx < uctxt->expected_count) {
			fd->invalid_tids[fd->invalid_tid_idx] =
				rcventry2tidinfo(node->rcventry -
						 uctxt->expected_base);
			fd->invalid_tids[fd->invalid_tid_idx] |=
				EXP_TID_SET(LEN, node->npages);
			if (!fd->invalid_tid_idx) {
				unsigned long *ev;

				/*
				 * hfi1_set_uevent_bits() sets a user event flag
				 * for all processes. Because calling into the
				 * driver to process TID cache invalidations is
				 * expensive and TID cache invalidations are
				 * handled on a per-process basis, we can
				 * optimize this to set the flag only for the
				 * process in question.
				 */
				ev = uctxt->dd->events +
					(((uctxt->ctxt -
					   uctxt->dd->first_user_ctxt) *
					  HFI1_MAX_SHARED_CTXTS) + fd->subctxt);
				set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
			}
			fd->invalid_tid_idx++;
		}
		spin_unlock(&fd->invalid_lock);
	}
	spin_unlock(&fd->rb_lock);
}

static inline int mmu_addr_cmp(struct mmu_rb_node *node, unsigned long addr,
			       unsigned long len)
{
	if ((addr + len) <= node->virt)
		return -1;
	else if (addr >= node->virt && addr < (node->virt + node->len))
		return 0;
	else
		return 1;
}

static inline int mmu_entry_cmp(struct mmu_rb_node *node, u32 entry)
{
	if (entry < node->rcventry)
		return -1;
	else if (entry > node->rcventry)
		return 1;
	else
		return 0;
}

static struct mmu_rb_node *mmu_rb_search_by_addr(struct rb_root *root,
						 unsigned long addr)
{
	struct rb_node *node = root->rb_node;

	while (node) {
		struct mmu_rb_node *mnode =
			container_of(node, struct mmu_rb_node, rbnode);
		/*
		 * When searching, use at least one page length for size. The
		 * MMU notifier will not give us anything less than that. We
		 * also don't need anything more than a page because we are
		 * guaranteed to have non-overlapping buffers in the tree.
		 */
		int result = mmu_addr_cmp(mnode, addr, PAGE_SIZE);

		if (result < 0)
			node = node->rb_left;
		else if (result > 0)
			node = node->rb_right;
		else
			return mnode;
	}
	return NULL;
}

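/*
 * Unlike the by-address lookup above, searching by RcvArray entry has to
 * walk the whole tree: when nodes are inserted by virtual address, they
 * are not ordered by rcventry.
 */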
static inline struct mmu_rb_node *mmu_rb_search_by_entry(struct rb_root *root,
							 u32 index)
{
	struct mmu_rb_node *rbnode;
	struct rb_node *node;

	if (root && !RB_EMPTY_ROOT(root))
		for (node = rb_first(root); node; node = rb_next(node)) {
			rbnode = rb_entry(node, struct mmu_rb_node, rbnode);
			if (rbnode->rcventry == index)
				return rbnode;
		}
	return NULL;
}

static int mmu_rb_insert_by_entry(struct rb_root *root,
				  struct mmu_rb_node *node)
{
	struct rb_node **new = &root->rb_node, *parent = NULL;

	while (*new) {
		struct mmu_rb_node *this =
			container_of(*new, struct mmu_rb_node, rbnode);
		int result = mmu_entry_cmp(this, node->rcventry);

		parent = *new;
		if (result < 0)
			new = &((*new)->rb_left);
		else if (result > 0)
			new = &((*new)->rb_right);
		else
			return 1;
	}

	rb_link_node(&node->rbnode, parent, new);
	rb_insert_color(&node->rbnode, root);
	return 0;
}

static int mmu_rb_insert_by_addr(struct rb_root *root, struct mmu_rb_node *node)
{
	struct rb_node **new = &root->rb_node, *parent = NULL;

	/* Figure out where to put the new node */
	while (*new) {
		struct mmu_rb_node *this =
			container_of(*new, struct mmu_rb_node, rbnode);
		int result = mmu_addr_cmp(this, node->virt, node->len);

		parent = *new;
		if (result < 0)
			new = &((*new)->rb_left);
		else if (result > 0)
			new = &((*new)->rb_right);
		else
			return 1;
	}

	/* Add new node and rebalance tree. */
	rb_link_node(&node->rbnode, parent, new);
	rb_insert_color(&node->rbnode, root);

	return 0;
}