/*
 * Copyright(c) 2015, 2016 Intel Corporation.
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *  - Neither the name of Intel Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <linux/topology.h>
#include <linux/cpumask.h>
#include <linux/module.h>

#include "hfi.h"
#include "affinity.h"
#include "sdma.h"
#include "trace.h"

struct hfi1_affinity_node_list node_affinity = {
	.list = LIST_HEAD_INIT(node_affinity.list),
	.lock = __SPIN_LOCK_UNLOCKED(&node_affinity.lock),
};

/* Name of IRQ types, indexed by enum irq_type */
static const char * const irq_type_names[] = {
	"SDMA",
	"RCVCTXT",
	"GENERAL",
	"OTHER",
};

/* Per NUMA node count of HFI devices */
static unsigned int *hfi1_per_node_cntr;

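/*
 * A cpu_mask_set tracks a pool of CPUs: 'mask' is the full pool, 'used'
 * marks the CPUs already handed out, and 'gen' counts how many times the
 * pool has been exhausted and recycled (i.e. how many times CPUs are
 * being overloaded).  This is a descriptive note based on how the sets
 * are used below; the struct itself is defined in affinity.h.
 */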
static inline void init_cpu_mask_set(struct cpu_mask_set *set)
{
	cpumask_clear(&set->mask);
	cpumask_clear(&set->used);
	set->gen = 0;
}

/* Initialize non-HT cpu cores mask */
void init_real_cpu_mask(void)
{
	int possible, curr_cpu, i, ht;

	cpumask_clear(&node_affinity.real_cpu_mask);

	/* Start with cpu online mask as the real cpu mask */
	cpumask_copy(&node_affinity.real_cpu_mask, cpu_online_mask);

	/*
	 * Remove HT cores from the real cpu mask. Do this in two steps below.
	 */
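	/*
	 * Illustrative example (hypothetical topology): with 16 online CPUs
	 * and two HT siblings per core, possible = 16 and ht = 2, so step 1
	 * below keeps the first 16 / 2 = 8 enumerated CPUs as the "real"
	 * cores and step 2 clears the remaining 8 sibling threads.
	 */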
	possible = cpumask_weight(&node_affinity.real_cpu_mask);
	ht = cpumask_weight(topology_sibling_cpumask(
				cpumask_first(&node_affinity.real_cpu_mask)));
	/*
	 * Step 1.  Skip over the first N HT siblings and use them as the
	 * "real" cores.  Assumes that HT cores are not enumerated in
	 * succession (except in the single core case).
	 */
	curr_cpu = cpumask_first(&node_affinity.real_cpu_mask);
	for (i = 0; i < possible / ht; i++)
		curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
	/*
	 * Step 2.  Remove the remaining HT siblings.  Use cpumask_next() to
	 * skip any gaps.
	 */
	for (; i < possible; i++) {
		cpumask_clear_cpu(curr_cpu, &node_affinity.real_cpu_mask);
		curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
	}
}

int node_affinity_init(void)
{
	int node;
	struct pci_dev *dev = NULL;
	const struct pci_device_id *ids = hfi1_pci_tbl;

	cpumask_clear(&node_affinity.proc.used);
	cpumask_copy(&node_affinity.proc.mask, cpu_online_mask);

	node_affinity.proc.gen = 0;
	node_affinity.num_core_siblings =
		cpumask_weight(topology_sibling_cpumask(
			cpumask_first(&node_affinity.proc.mask)
			));
	node_affinity.num_online_nodes = num_online_nodes();
	node_affinity.num_online_cpus = num_online_cpus();

	/*
	 * The real cpu mask is part of the affinity struct but it has to be
	 * initialized early. It is needed to calculate the number of user
	 * contexts in set_up_context_variables().
	 */
	init_real_cpu_mask();

	hfi1_per_node_cntr = kcalloc(num_possible_nodes(),
				     sizeof(*hfi1_per_node_cntr), GFP_KERNEL);
	if (!hfi1_per_node_cntr)
		return -ENOMEM;

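	/*
	 * Count the HFI devices per NUMA node: for every (vendor, device)
	 * pair in hfi1_pci_tbl, walk all matching PCI devices and bump the
	 * counter of the node their bus sits on (falling back to the
	 * current node when the bus has no node assigned).
	 */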
	while (ids->vendor) {
		dev = NULL;
		while ((dev = pci_get_device(ids->vendor, ids->device, dev))) {
			node = pcibus_to_node(dev->bus);
			if (node < 0)
				node = numa_node_id();

			hfi1_per_node_cntr[node]++;
		}
		ids++;
	}

	return 0;
}

void node_affinity_destroy(void)
{
	struct list_head *pos, *q;
	struct hfi1_affinity_node *entry;

	spin_lock(&node_affinity.lock);
	list_for_each_safe(pos, q, &node_affinity.list) {
		entry = list_entry(pos, struct hfi1_affinity_node,
				   list);
		list_del(pos);
		kfree(entry);
	}
	spin_unlock(&node_affinity.lock);
	kfree(hfi1_per_node_cntr);
}

static struct hfi1_affinity_node *node_affinity_allocate(int node)
{
	struct hfi1_affinity_node *entry;

	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
	if (!entry)
		return NULL;
	entry->node = node;
	INIT_LIST_HEAD(&entry->list);

	return entry;
}

/*
 * Append an entry to the global affinity list.
 * It *must* be called with node_affinity.lock held.
 */
static void node_affinity_add_tail(struct hfi1_affinity_node *entry)
{
	list_add_tail(&entry->list, &node_affinity.list);
}

/* It must be called with node_affinity.lock held */
static struct hfi1_affinity_node *node_affinity_lookup(int node)
{
	struct list_head *pos;
	struct hfi1_affinity_node *entry;

	list_for_each(pos, &node_affinity.list) {
		entry = list_entry(pos, struct hfi1_affinity_node, list);
		if (entry->node == node)
			return entry;
	}

	return NULL;
}

/*
 * Interrupt affinity.
 *
 * Non-receive interrupts get a default mask that starts as the set of
 * "real" (non-HT) CPUs on the device's NUMA node, with the CPU reserved
 * for the general/control context and the CPUs assigned to each receive
 * context removed.
 *
 * Receive interrupts are handed node-relative CPUs one at a time,
 * wrapping back to the first node-relative CPU as necessary.
 */
int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
{
	int node = pcibus_to_node(dd->pcidev->bus);
	struct hfi1_affinity_node *entry;
	const struct cpumask *local_mask;
	int curr_cpu, possible, i;

	if (node < 0)
		node = numa_node_id();
	dd->node = node;

	local_mask = cpumask_of_node(dd->node);
	if (cpumask_first(local_mask) >= nr_cpu_ids)
		local_mask = topology_core_cpumask(0);

	spin_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);
	spin_unlock(&node_affinity.lock);

	/*
	 * If this is the first time this NUMA node's affinity is used,
	 * create an entry in the global affinity structure and initialize it.
	 */
	if (!entry) {
		entry = node_affinity_allocate(node);
		if (!entry) {
			dd_dev_err(dd,
				   "Unable to allocate global affinity node\n");
			return -ENOMEM;
		}
		init_cpu_mask_set(&entry->def_intr);
		init_cpu_mask_set(&entry->rcv_intr);
		cpumask_clear(&entry->general_intr_mask);
		/* Use the "real" cpu mask of this node as the default */
		cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask,
			    local_mask);

		/* fill in the receive list */
		possible = cpumask_weight(&entry->def_intr.mask);
		curr_cpu = cpumask_first(&entry->def_intr.mask);

		if (possible == 1) {
			/* only one CPU, everyone will use it */
			cpumask_set_cpu(curr_cpu, &entry->rcv_intr.mask);
			cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
		} else {
			/*
			 * The general/control context will be the first CPU in
			 * the default list, so it is removed from the default
			 * list and added to the general interrupt list.
			 */
			cpumask_clear_cpu(curr_cpu, &entry->def_intr.mask);
			cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
			curr_cpu = cpumask_next(curr_cpu,
						&entry->def_intr.mask);

			/*
			 * Remove the remaining kernel receive queues from
			 * the default list and add them to the receive list.
			 */
			for (i = 0;
			     i < (dd->n_krcv_queues - 1) *
				  hfi1_per_node_cntr[dd->node];
			     i++) {
				cpumask_clear_cpu(curr_cpu,
						  &entry->def_intr.mask);
				cpumask_set_cpu(curr_cpu,
						&entry->rcv_intr.mask);
				curr_cpu = cpumask_next(curr_cpu,
							&entry->def_intr.mask);
				if (curr_cpu >= nr_cpu_ids)
					break;
			}

			/*
			 * If there ends up being 0 CPU cores leftover for SDMA
			 * engines, use the same CPU cores as general/control
			 * context.
			 */
			if (cpumask_weight(&entry->def_intr.mask) == 0)
				cpumask_copy(&entry->def_intr.mask,
					     &entry->general_intr_mask);
		}

		spin_lock(&node_affinity.lock);
		node_affinity_add_tail(entry);
		spin_unlock(&node_affinity.lock);
	}

	return 0;
}

int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
{
	int ret;
	cpumask_var_t diff;
	struct hfi1_affinity_node *entry;
	struct cpu_mask_set *set = NULL;
	struct sdma_engine *sde = NULL;
	struct hfi1_ctxtdata *rcd = NULL;
	char extra[64];
	int cpu = -1;

	extra[0] = '\0';
	cpumask_clear(&msix->mask);

	ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
	if (!ret)
		return -ENOMEM;

	spin_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);
	spin_unlock(&node_affinity.lock);

	switch (msix->type) {
	case IRQ_SDMA:
		sde = (struct sdma_engine *)msix->arg;
		scnprintf(extra, 64, "engine %u", sde->this_idx);
		set = &entry->def_intr;
		break;
	case IRQ_GENERAL:
		cpu = cpumask_first(&entry->general_intr_mask);
		break;
	case IRQ_RCVCTXT:
		rcd = (struct hfi1_ctxtdata *)msix->arg;
		if (rcd->ctxt == HFI1_CTRL_CTXT)
			cpu = cpumask_first(&entry->general_intr_mask);
		else
			set = &entry->rcv_intr;
		scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
		break;
	default:
		dd_dev_err(dd, "Invalid IRQ type %d\n", msix->type);
		/* don't leak the cpumask allocated above */
		free_cpumask_var(diff);
		return -EINVAL;
	}

	/*
	 * The general and control contexts are placed on a particular
	 * CPU, which is set above.  Skip accounting for it.  Everything else
	 * finds its CPU here.
	 */
	if (cpu == -1 && set) {
		spin_lock(&node_affinity.lock);
		if (cpumask_equal(&set->mask, &set->used)) {
			/*
			 * We've used up all the CPUs, bump up the generation
			 * and reset the 'used' map
			 */
			set->gen++;
			cpumask_clear(&set->used);
		}
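		/*
		 * Hand out the first CPU in the set that is not yet used.
		 * Successive calls thus round-robin through the pool,
		 * overloading CPUs only once every CPU in the set has been
		 * handed out.
		 */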
		cpumask_andnot(diff, &set->mask, &set->used);
		cpu = cpumask_first(diff);
		cpumask_set_cpu(cpu, &set->used);
		spin_unlock(&node_affinity.lock);
	}

	switch (msix->type) {
	case IRQ_SDMA:
		sde->cpu = cpu;
		break;
	case IRQ_GENERAL:
	case IRQ_RCVCTXT:
	case IRQ_OTHER:
		break;
	}

	cpumask_set_cpu(cpu, &msix->mask);
	dd_dev_info(dd, "IRQ vector: %u, type %s %s -> cpu: %d\n",
		    msix->msix.vector, irq_type_names[msix->type],
		    extra, cpu);
	irq_set_affinity_hint(msix->msix.vector, &msix->mask);

	free_cpumask_var(diff);
	return 0;
}

void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
			   struct hfi1_msix_entry *msix)
{
	struct cpu_mask_set *set = NULL;
	struct hfi1_ctxtdata *rcd;
	struct hfi1_affinity_node *entry;

	spin_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);
	spin_unlock(&node_affinity.lock);

	switch (msix->type) {
	case IRQ_SDMA:
		set = &entry->def_intr;
		break;
	case IRQ_GENERAL:
		/* Don't do accounting for general contexts */
		break;
	case IRQ_RCVCTXT:
		rcd = (struct hfi1_ctxtdata *)msix->arg;
		/* Don't do accounting for control contexts */
		if (rcd->ctxt != HFI1_CTRL_CTXT)
			set = &entry->rcv_intr;
		break;
	default:
		return;
	}

	if (set) {
		spin_lock(&node_affinity.lock);
		cpumask_andnot(&set->used, &set->used, &msix->mask);
		if (cpumask_empty(&set->used) && set->gen) {
			set->gen--;
			cpumask_copy(&set->used, &set->mask);
		}
		spin_unlock(&node_affinity.lock);
	}

	irq_set_affinity_hint(msix->msix.vector, NULL);
	cpumask_clear(&msix->mask);
}

/* This should be called with node_affinity.lock held */
static void find_hw_thread_mask(uint hw_thread_no, cpumask_var_t hw_thread_mask,
				struct hfi1_affinity_node_list *affinity)
{
	int possible, curr_cpu, i;
	uint num_cores_per_socket = node_affinity.num_online_cpus /
					affinity->num_core_siblings /
					node_affinity.num_online_nodes;

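	/*
	 * Illustrative arithmetic (hypothetical topology): with 64 online
	 * CPUs, 2 siblings per core and 2 online NUMA nodes,
	 * num_cores_per_socket = 64 / 2 / 2 = 16 physical cores per socket.
	 */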
	cpumask_copy(hw_thread_mask, &affinity->proc.mask);
	if (affinity->num_core_siblings > 0) {
		/* Removing other siblings not needed for now */
		possible = cpumask_weight(hw_thread_mask);
		curr_cpu = cpumask_first(hw_thread_mask);
		for (i = 0;
		     i < num_cores_per_socket * node_affinity.num_online_nodes;
		     i++)
			curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);

		for (; i < possible; i++) {
			cpumask_clear_cpu(curr_cpu, hw_thread_mask);
			curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);
		}

		/* Identifying correct HW threads within physical cores */
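		/*
		 * Shifting the first-sibling mask left by
		 * (cores per socket * nodes * hw_thread_no) maps it onto the
		 * hw_thread_no-th sibling set.  This assumes sibling threads
		 * are enumerated in consecutive blocks, e.g. on a
		 * hypothetical 2-socket, 8-core, 2-thread system, CPUs 0-15
		 * are thread 0 and CPUs 16-31 are thread 1 of each core.
		 */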
		cpumask_shift_left(hw_thread_mask, hw_thread_mask,
				   num_cores_per_socket *
				   node_affinity.num_online_nodes *
				   hw_thread_no);
	}
}

int hfi1_get_proc_affinity(int node)
{
	int cpu = -1, ret, i;
	struct hfi1_affinity_node *entry;
	cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
	const struct cpumask *node_mask,
		*proc_mask = tsk_cpus_allowed(current);
	struct hfi1_affinity_node_list *affinity = &node_affinity;
	struct cpu_mask_set *set = &affinity->proc;

	/*
	 * check whether process/context affinity has already
	 * been set
	 */
	if (cpumask_weight(proc_mask) == 1) {
		hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
			  current->pid, current->comm,
			  cpumask_pr_args(proc_mask));
		/*
		 * Mark the pre-set CPU as used. This is atomic so we don't
		 * need the lock
		 */
		cpu = cpumask_first(proc_mask);
		cpumask_set_cpu(cpu, &set->used);
		goto done;
	} else if (cpumask_weight(proc_mask) < cpumask_weight(&set->mask)) {
		hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
			  current->pid, current->comm,
			  cpumask_pr_args(proc_mask));
		goto done;
	}

	/*
	 * The process does not have a preset CPU affinity, so find one to
	 * recommend using the following algorithm:
	 *
	 * For each user process that is opening a context on HFI Y:
	 *  a) If all cores are filled, reinitialize the bitmask
	 *  b) Fill real cores first, then HT cores (first set of HT
	 *     cores on all physical cores, then the second set of HT
	 *     cores, and so on) in the following order:
	 *
	 *     1. Same NUMA node as HFI Y and not running an IRQ
	 *        handler
	 *     2. Same NUMA node as HFI Y and running an IRQ handler
	 *     3. Different NUMA node to HFI Y and not running an IRQ
	 *        handler
	 *     4. Different NUMA node to HFI Y and running an IRQ
	 *        handler
	 *  c) Mark core as filled in the bitmask. As user processes are
	 *     done, clear cores from the bitmask.
	 */

	ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
	if (!ret)
		goto done;
	ret = zalloc_cpumask_var(&hw_thread_mask, GFP_KERNEL);
	if (!ret)
		goto free_diff;
	ret = zalloc_cpumask_var(&available_mask, GFP_KERNEL);
	if (!ret)
		goto free_hw_thread_mask;
	ret = zalloc_cpumask_var(&intrs_mask, GFP_KERNEL);
	if (!ret)
		goto free_available_mask;

	spin_lock(&affinity->lock);
	/*
	 * If we've used all available HW threads, clear the mask and start
	 * overloading.
	 */
	if (cpumask_equal(&set->mask, &set->used)) {
		set->gen++;
		cpumask_clear(&set->used);
	}

	/*
	 * If the NUMA node has CPUs used by interrupt handlers, include them
	 * in the interrupt handler mask.
	 */
	entry = node_affinity_lookup(node);
	if (entry) {
		cpumask_copy(intrs_mask, (entry->def_intr.gen ?
					  &entry->def_intr.mask :
					  &entry->def_intr.used));
		cpumask_or(intrs_mask, intrs_mask, (entry->rcv_intr.gen ?
						    &entry->rcv_intr.mask :
						    &entry->rcv_intr.used));
		cpumask_or(intrs_mask, intrs_mask, &entry->general_intr_mask);
	}
	hfi1_cdbg(PROC, "CPUs used by interrupts: %*pbl",
		  cpumask_pr_args(intrs_mask));

	cpumask_copy(hw_thread_mask, &set->mask);

	/*
	 * If HT cores are enabled, identify which HW threads within the
	 * physical cores should be used.
	 */
	if (affinity->num_core_siblings > 0) {
		for (i = 0; i < affinity->num_core_siblings; i++) {
			find_hw_thread_mask(i, hw_thread_mask, affinity);

			/*
			 * If there's at least one available core for this HW
			 * thread number, stop looking for a core.
			 *
			 * diff will be non-empty at least once in this loop,
			 * because the used mask is reset above (when
			 * set->mask == set->used) before the loop runs.
			 */
			cpumask_andnot(diff, hw_thread_mask, &set->used);
			if (!cpumask_empty(diff))
				break;
		}
	}
	hfi1_cdbg(PROC, "Same available HW thread on all physical CPUs: %*pbl",
		  cpumask_pr_args(hw_thread_mask));

	node_mask = cpumask_of_node(node);
	hfi1_cdbg(PROC, "Device on NUMA %u, CPUs %*pbl", node,
		  cpumask_pr_args(node_mask));

	/* Get cpumask of available CPUs on preferred NUMA */
	cpumask_and(available_mask, hw_thread_mask, node_mask);
	cpumask_andnot(available_mask, available_mask, &set->used);
	hfi1_cdbg(PROC, "Available CPUs on NUMA %u: %*pbl", node,
		  cpumask_pr_args(available_mask));

	/*
	 * At first, we don't want to place processes on the same
	 * CPUs as interrupt handlers.  Then, CPUs running interrupt
	 * handlers are used.
	 *
	 * 1) If diff is not empty, then there are CPUs not running
	 *    interrupt handlers available, so diff gets copied
	 *    over to available_mask.
	 * 2) If diff is empty, then all CPUs not running interrupt
	 *    handlers are taken, so available_mask contains all
	 *    available CPUs running interrupt handlers.
	 * 3) If available_mask is empty, then all CPUs on the
	 *    preferred NUMA node are taken, so other NUMA nodes are
	 *    used for process assignments using the same method as
	 *    the preferred NUMA node.
	 */
	cpumask_andnot(diff, available_mask, intrs_mask);
	if (!cpumask_empty(diff))
		cpumask_copy(available_mask, diff);

	/* If we don't have CPUs on the preferred node, use other NUMA nodes */
	if (cpumask_empty(available_mask)) {
		cpumask_andnot(available_mask, hw_thread_mask, &set->used);
		/* Excluding preferred NUMA cores */
		cpumask_andnot(available_mask, available_mask, node_mask);
		hfi1_cdbg(PROC,
			  "Preferred NUMA node cores are taken, cores available in other NUMA nodes: %*pbl",
			  cpumask_pr_args(available_mask));

		/*
		 * At first, we don't want to place processes on the same
		 * CPUs as interrupt handlers.
		 */
		cpumask_andnot(diff, available_mask, intrs_mask);
		if (!cpumask_empty(diff))
			cpumask_copy(available_mask, diff);
	}
	hfi1_cdbg(PROC, "Possible CPUs for process: %*pbl",
		  cpumask_pr_args(available_mask));

	cpu = cpumask_first(available_mask);
	if (cpu >= nr_cpu_ids) /* empty */
		cpu = -1;
	else
		cpumask_set_cpu(cpu, &set->used);
	spin_unlock(&affinity->lock);
	hfi1_cdbg(PROC, "Process assigned to CPU %d", cpu);

	free_cpumask_var(intrs_mask);
free_available_mask:
	free_cpumask_var(available_mask);
free_hw_thread_mask:
	free_cpumask_var(hw_thread_mask);
free_diff:
	free_cpumask_var(diff);
done:
	return cpu;
}

void hfi1_put_proc_affinity(int cpu)
{
	struct hfi1_affinity_node_list *affinity = &node_affinity;
	struct cpu_mask_set *set = &affinity->proc;

	if (cpu < 0)
		return;
	spin_lock(&affinity->lock);
	cpumask_clear_cpu(cpu, &set->used);
	hfi1_cdbg(PROC, "Returning CPU %d for future process assignment", cpu);
	if (cpumask_empty(&set->used) && set->gen) {
		set->gen--;
		cpumask_copy(&set->used, &set->mask);
	}
	spin_unlock(&affinity->lock);
}

/* Prevents concurrent reads and writes of the sdma_affinity attrib */
static DEFINE_MUTEX(sdma_affinity_mutex);

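/*
 * Store handler for the driver's sdma_affinity attribute.  'buf' is
 * expected to hold a CPU list in the format accepted by cpulist_parse(),
 * e.g. "0,2,4-7" (an illustrative value; any non-empty list of online
 * CPUs is valid).  All SDMA interrupts are then re-spread across the
 * parsed mask.
 */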
int hfi1_set_sdma_affinity(struct hfi1_devdata *dd, const char *buf,
			   size_t count)
{
	struct hfi1_affinity_node *entry;
	cpumask_var_t mask;
	int ret, i;

	spin_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);
	spin_unlock(&node_affinity.lock);

	if (!entry)
		return -EINVAL;

	ret = zalloc_cpumask_var(&mask, GFP_KERNEL);
	if (!ret)
		return -ENOMEM;

	ret = cpulist_parse(buf, mask);
	if (ret)
		goto out;

	if (!cpumask_subset(mask, cpu_online_mask) || cpumask_empty(mask)) {
		dd_dev_warn(dd, "Invalid CPU mask\n");
		ret = -EINVAL;
		goto out;
	}

	mutex_lock(&sdma_affinity_mutex);
	/* reset the SDMA interrupt affinity details */
	init_cpu_mask_set(&entry->def_intr);
	cpumask_copy(&entry->def_intr.mask, mask);
	/*
	 * Reassign the affinity for each SDMA interrupt.
	 */
	for (i = 0; i < dd->num_msix_entries; i++) {
		struct hfi1_msix_entry *msix;

		msix = &dd->msix_entries[i];
		if (msix->type != IRQ_SDMA)
			continue;

		ret = hfi1_get_irq_affinity(dd, msix);

		if (ret)
			break;
	}
	mutex_unlock(&sdma_affinity_mutex);
out:
	free_cpumask_var(mask);
	return ret ? ret : strnlen(buf, PAGE_SIZE);
}

int hfi1_get_sdma_affinity(struct hfi1_devdata *dd, char *buf)
{
	struct hfi1_affinity_node *entry;

	spin_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);
	spin_unlock(&node_affinity.lock);

	if (!entry)
		return -EINVAL;

	mutex_lock(&sdma_affinity_mutex);
	cpumap_print_to_pagebuf(true, buf, &entry->def_intr.mask);
	mutex_unlock(&sdma_affinity_mutex);
	return strnlen(buf, PAGE_SIZE);
}