Commit | Line | Data |
---|---|---|
957558c9 | 1 | /* |
05d6ac1d | 2 | * Copyright(c) 2015, 2016 Intel Corporation. |
957558c9 MH |
3 | * |
4 | * This file is provided under a dual BSD/GPLv2 license. When using or | |
5 | * redistributing this file, you may do so under either license. | |
6 | * | |
7 | * GPL LICENSE SUMMARY | |
8 | * | |
957558c9 MH |
9 | * This program is free software; you can redistribute it and/or modify |
10 | * it under the terms of version 2 of the GNU General Public License as | |
11 | * published by the Free Software Foundation. | |
12 | * | |
13 | * This program is distributed in the hope that it will be useful, but | |
14 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
16 | * General Public License for more details. | |
17 | * | |
18 | * BSD LICENSE | |
19 | * | |
957558c9 MH |
20 | * Redistribution and use in source and binary forms, with or without |
21 | * modification, are permitted provided that the following conditions | |
22 | * are met: | |
23 | * | |
24 | * - Redistributions of source code must retain the above copyright | |
25 | * notice, this list of conditions and the following disclaimer. | |
26 | * - Redistributions in binary form must reproduce the above copyright | |
27 | * notice, this list of conditions and the following disclaimer in | |
28 | * the documentation and/or other materials provided with the | |
29 | * distribution. | |
30 | * - Neither the name of Intel Corporation nor the names of its | |
31 | * contributors may be used to endorse or promote products derived | |
32 | * from this software without specific prior written permission. | |
33 | * | |
34 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
35 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
36 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
37 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
38 | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
39 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
40 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
41 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
42 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
43 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
44 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
45 | * | |
46 | */ | |
47 | #include <linux/topology.h> | |
48 | #include <linux/cpumask.h> | |
49 | #include <linux/module.h> | |
b14db1f0 | 50 | #include <linux/cpumask.h> |
957558c9 MH |
51 | |
52 | #include "hfi.h" | |
53 | #include "affinity.h" | |
54 | #include "sdma.h" | |
55 | #include "trace.h" | |
56 | ||
4197344b DD |
/*
 * Global per-NUMA-node affinity registry shared by all HFI devices.
 * Nodes (struct hfi1_affinity_node) are chained on .list and the list
 * plus the per-node cpu_mask_sets are protected by .lock.
 */
struct hfi1_affinity_node_list node_affinity = {
	.list = LIST_HEAD_INIT(node_affinity.list),
	.lock = __SPIN_LOCK_UNLOCKED(&node_affinity.lock),
};
61 | ||
957558c9 MH |
/*
 * Name of IRQ types, indexed by enum irq_type.
 * Used only for the dd_dev_info() log line in hfi1_get_irq_affinity();
 * order must stay in sync with the enum (declared in affinity.h —
 * TODO confirm the enum order matches).
 */
static const char * const irq_type_names[] = {
	"SDMA",
	"RCVCTXT",
	"GENERAL",
	"OTHER",
};
69 | ||
d6373019 SS |
/*
 * Per NUMA node count of HFI devices.
 * Indexed by NUMA node id; allocated in node_affinity_init() via kcalloc
 * (sized by num_possible_nodes()) and freed in node_affinity_destroy().
 */
static unsigned int *hfi1_per_node_cntr;
72 | ||
957558c9 MH |
73 | static inline void init_cpu_mask_set(struct cpu_mask_set *set) |
74 | { | |
75 | cpumask_clear(&set->mask); | |
76 | cpumask_clear(&set->used); | |
77 | set->gen = 0; | |
78 | } | |
79 | ||
/*
 * Initialize non-HT cpu cores mask.
 *
 * Builds node_affinity.real_cpu_mask: the set of online CPUs with
 * hyper-threaded siblings removed, i.e. one logical CPU per physical
 * core.  Assumes sibling threads are enumerated after all "real" cores
 * (except in the single-core case) — see Step 1 below.
 */
void init_real_cpu_mask(void)
{
	int possible, curr_cpu, i, ht;

	cpumask_clear(&node_affinity.real_cpu_mask);

	/* Start with cpu online mask as the real cpu mask */
	cpumask_copy(&node_affinity.real_cpu_mask, cpu_online_mask);

	/*
	 * Remove HT cores from the real cpu mask. Do this in two steps below.
	 */
	possible = cpumask_weight(&node_affinity.real_cpu_mask);
	/* ht = number of HW threads per core, sampled from the first CPU */
	ht = cpumask_weight(topology_sibling_cpumask(
				cpumask_first(&node_affinity.real_cpu_mask)));
	/*
	 * Step 1. Skip over the first N HT siblings and use them as the
	 * "real" cores. Assumes that HT cores are not enumerated in
	 * succession (except in the single core case).
	 */
	curr_cpu = cpumask_first(&node_affinity.real_cpu_mask);
	for (i = 0; i < possible / ht; i++)
		curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
	/*
	 * Step 2. Remove the remaining HT siblings. Use cpumask_next() to
	 * skip any gaps.
	 */
	for (; i < possible; i++) {
		cpumask_clear_cpu(curr_cpu, &node_affinity.real_cpu_mask);
		curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
	}
}
0852d241 | 113 | |
/*
 * One-time module init for the global affinity state.
 *
 * Seeds node_affinity.proc from the online CPU mask, records core-sibling
 * and online node/CPU counts, builds the non-HT "real" CPU mask, and
 * counts how many HFI devices sit on each NUMA node by walking the PCI
 * bus for every ID in hfi1_pci_tbl.
 *
 * Returns 0 on success, -ENOMEM if the per-node counter array cannot be
 * allocated.
 */
int node_affinity_init(void)
{
	int node;
	struct pci_dev *dev = NULL;
	const struct pci_device_id *ids = hfi1_pci_tbl;

	cpumask_clear(&node_affinity.proc.used);
	cpumask_copy(&node_affinity.proc.mask, cpu_online_mask);

	node_affinity.proc.gen = 0;
	/* Threads per core, sampled from the first CPU in the proc mask */
	node_affinity.num_core_siblings =
		cpumask_weight(topology_sibling_cpumask(
			cpumask_first(&node_affinity.proc.mask)
			));
	node_affinity.num_online_nodes = num_online_nodes();
	node_affinity.num_online_cpus = num_online_cpus();

	/*
	 * The real cpu mask is part of the affinity struct but it has to be
	 * initialized early. It is needed to calculate the number of user
	 * contexts in set_up_context_variables().
	 */
	init_real_cpu_mask();

	hfi1_per_node_cntr = kcalloc(num_possible_nodes(),
				     sizeof(*hfi1_per_node_cntr), GFP_KERNEL);
	if (!hfi1_per_node_cntr)
		return -ENOMEM;

	/*
	 * Count HFI devices per NUMA node.  pci_get_device() drops the
	 * reference on the previously returned device, so iterating this
	 * way does not leak device references.
	 */
	while (ids->vendor) {
		dev = NULL;
		while ((dev = pci_get_device(ids->vendor, ids->device, dev))) {
			node = pcibus_to_node(dev->bus);
			if (node < 0)
				node = numa_node_id();

			hfi1_per_node_cntr[node]++;
		}
		ids++;
	}

	return 0;
}
157 | ||
158 | void node_affinity_destroy(void) | |
159 | { | |
160 | struct list_head *pos, *q; | |
161 | struct hfi1_affinity_node *entry; | |
162 | ||
163 | spin_lock(&node_affinity.lock); | |
164 | list_for_each_safe(pos, q, &node_affinity.list) { | |
165 | entry = list_entry(pos, struct hfi1_affinity_node, | |
166 | list); | |
167 | list_del(pos); | |
168 | kfree(entry); | |
169 | } | |
170 | spin_unlock(&node_affinity.lock); | |
d6373019 | 171 | kfree(hfi1_per_node_cntr); |
4197344b DD |
172 | } |
173 | ||
174 | static struct hfi1_affinity_node *node_affinity_allocate(int node) | |
175 | { | |
176 | struct hfi1_affinity_node *entry; | |
177 | ||
178 | entry = kzalloc(sizeof(*entry), GFP_KERNEL); | |
179 | if (!entry) | |
180 | return NULL; | |
181 | entry->node = node; | |
182 | INIT_LIST_HEAD(&entry->list); | |
183 | ||
184 | return entry; | |
185 | } | |
186 | ||
/*
 * It appends an entry to the list.
 * It *must* be called with node_affinity.lock held.
 * Ownership of @entry passes to the global list; it is freed in
 * node_affinity_destroy().
 */
static void node_affinity_add_tail(struct hfi1_affinity_node *entry)
{
	list_add_tail(&entry->list, &node_affinity.list);
}
195 | ||
196 | /* It must be called with node_affinity.lock held */ | |
197 | static struct hfi1_affinity_node *node_affinity_lookup(int node) | |
198 | { | |
199 | struct list_head *pos; | |
200 | struct hfi1_affinity_node *entry; | |
201 | ||
202 | list_for_each(pos, &node_affinity.list) { | |
203 | entry = list_entry(pos, struct hfi1_affinity_node, list); | |
204 | if (entry->node == node) | |
205 | return entry; | |
206 | } | |
207 | ||
208 | return NULL; | |
0852d241 JJ |
209 | } |
210 | ||
957558c9 MH |
211 | /* |
212 | * Interrupt affinity. | |
213 | * | |
214 | * non-rcv avail gets a default mask that | |
215 | * starts as possible cpus with threads reset | |
216 | * and each rcv avail reset. | |
217 | * | |
218 | * rcv avail gets node relative 1 wrapping back | |
219 | * to the node relative 1 as necessary. | |
220 | * | |
221 | */ | |
4197344b | 222 | int hfi1_dev_affinity_init(struct hfi1_devdata *dd) |
957558c9 MH |
223 | { |
224 | int node = pcibus_to_node(dd->pcidev->bus); | |
4197344b | 225 | struct hfi1_affinity_node *entry; |
957558c9 | 226 | const struct cpumask *local_mask; |
0852d241 | 227 | int curr_cpu, possible, i; |
957558c9 MH |
228 | |
229 | if (node < 0) | |
230 | node = numa_node_id(); | |
231 | dd->node = node; | |
232 | ||
957558c9 MH |
233 | local_mask = cpumask_of_node(dd->node); |
234 | if (cpumask_first(local_mask) >= nr_cpu_ids) | |
235 | local_mask = topology_core_cpumask(0); | |
4197344b DD |
236 | |
237 | spin_lock(&node_affinity.lock); | |
238 | entry = node_affinity_lookup(dd->node); | |
239 | spin_unlock(&node_affinity.lock); | |
240 | ||
241 | /* | |
242 | * If this is the first time this NUMA node's affinity is used, | |
243 | * create an entry in the global affinity structure and initialize it. | |
244 | */ | |
245 | if (!entry) { | |
246 | entry = node_affinity_allocate(node); | |
247 | if (!entry) { | |
248 | dd_dev_err(dd, | |
249 | "Unable to allocate global affinity node\n"); | |
250 | return -ENOMEM; | |
957558c9 | 251 | } |
4197344b DD |
252 | init_cpu_mask_set(&entry->def_intr); |
253 | init_cpu_mask_set(&entry->rcv_intr); | |
d6373019 | 254 | cpumask_clear(&entry->general_intr_mask); |
4197344b DD |
255 | /* Use the "real" cpu mask of this node as the default */ |
256 | cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask, | |
257 | local_mask); | |
258 | ||
259 | /* fill in the receive list */ | |
260 | possible = cpumask_weight(&entry->def_intr.mask); | |
261 | curr_cpu = cpumask_first(&entry->def_intr.mask); | |
262 | ||
263 | if (possible == 1) { | |
264 | /* only one CPU, everyone will use it */ | |
265 | cpumask_set_cpu(curr_cpu, &entry->rcv_intr.mask); | |
d6373019 | 266 | cpumask_set_cpu(curr_cpu, &entry->general_intr_mask); |
4197344b DD |
267 | } else { |
268 | /* | |
d6373019 SS |
269 | * The general/control context will be the first CPU in |
270 | * the default list, so it is removed from the default | |
271 | * list and added to the general interrupt list. | |
4197344b | 272 | */ |
d6373019 SS |
273 | cpumask_clear_cpu(curr_cpu, &entry->def_intr.mask); |
274 | cpumask_set_cpu(curr_cpu, &entry->general_intr_mask); | |
4197344b DD |
275 | curr_cpu = cpumask_next(curr_cpu, |
276 | &entry->def_intr.mask); | |
957558c9 | 277 | |
4197344b DD |
278 | /* |
279 | * Remove the remaining kernel receive queues from | |
280 | * the default list and add them to the receive list. | |
281 | */ | |
d6373019 SS |
282 | for (i = 0; |
283 | i < (dd->n_krcv_queues - 1) * | |
284 | hfi1_per_node_cntr[dd->node]; | |
285 | i++) { | |
4197344b DD |
286 | cpumask_clear_cpu(curr_cpu, |
287 | &entry->def_intr.mask); | |
288 | cpumask_set_cpu(curr_cpu, | |
289 | &entry->rcv_intr.mask); | |
290 | curr_cpu = cpumask_next(curr_cpu, | |
291 | &entry->def_intr.mask); | |
292 | if (curr_cpu >= nr_cpu_ids) | |
293 | break; | |
294 | } | |
d6373019 SS |
295 | |
296 | /* | |
297 | * If there ends up being 0 CPU cores leftover for SDMA | |
298 | * engines, use the same CPU cores as general/control | |
299 | * context. | |
300 | */ | |
301 | if (cpumask_weight(&entry->def_intr.mask) == 0) | |
302 | cpumask_copy(&entry->def_intr.mask, | |
303 | &entry->general_intr_mask); | |
4197344b | 304 | } |
957558c9 | 305 | |
4197344b DD |
306 | spin_lock(&node_affinity.lock); |
307 | node_affinity_add_tail(entry); | |
308 | spin_unlock(&node_affinity.lock); | |
309 | } | |
310 | ||
311 | return 0; | |
957558c9 MH |
312 | } |
313 | ||
314 | int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix) | |
315 | { | |
316 | int ret; | |
317 | cpumask_var_t diff; | |
4197344b | 318 | struct hfi1_affinity_node *entry; |
d6373019 | 319 | struct cpu_mask_set *set = NULL; |
957558c9 MH |
320 | struct sdma_engine *sde = NULL; |
321 | struct hfi1_ctxtdata *rcd = NULL; | |
322 | char extra[64]; | |
323 | int cpu = -1; | |
324 | ||
325 | extra[0] = '\0'; | |
326 | cpumask_clear(&msix->mask); | |
327 | ||
328 | ret = zalloc_cpumask_var(&diff, GFP_KERNEL); | |
329 | if (!ret) | |
330 | return -ENOMEM; | |
331 | ||
4197344b DD |
332 | spin_lock(&node_affinity.lock); |
333 | entry = node_affinity_lookup(dd->node); | |
334 | spin_unlock(&node_affinity.lock); | |
335 | ||
957558c9 MH |
336 | switch (msix->type) { |
337 | case IRQ_SDMA: | |
338 | sde = (struct sdma_engine *)msix->arg; | |
339 | scnprintf(extra, 64, "engine %u", sde->this_idx); | |
4197344b | 340 | set = &entry->def_intr; |
957558c9 | 341 | break; |
d6373019 SS |
342 | case IRQ_GENERAL: |
343 | cpu = cpumask_first(&entry->general_intr_mask); | |
344 | break; | |
957558c9 MH |
345 | case IRQ_RCVCTXT: |
346 | rcd = (struct hfi1_ctxtdata *)msix->arg; | |
d6373019 SS |
347 | if (rcd->ctxt == HFI1_CTRL_CTXT) |
348 | cpu = cpumask_first(&entry->general_intr_mask); | |
349 | else | |
4197344b | 350 | set = &entry->rcv_intr; |
957558c9 MH |
351 | scnprintf(extra, 64, "ctxt %u", rcd->ctxt); |
352 | break; | |
353 | default: | |
354 | dd_dev_err(dd, "Invalid IRQ type %d\n", msix->type); | |
355 | return -EINVAL; | |
356 | } | |
357 | ||
358 | /* | |
d6373019 SS |
359 | * The general and control contexts are placed on a particular |
360 | * CPU, which is set above. Skip accounting for it. Everything else | |
361 | * finds its CPU here. | |
957558c9 | 362 | */ |
4197344b DD |
363 | if (cpu == -1 && set) { |
364 | spin_lock(&node_affinity.lock); | |
957558c9 MH |
365 | if (cpumask_equal(&set->mask, &set->used)) { |
366 | /* | |
367 | * We've used up all the CPUs, bump up the generation | |
368 | * and reset the 'used' map | |
369 | */ | |
370 | set->gen++; | |
371 | cpumask_clear(&set->used); | |
372 | } | |
373 | cpumask_andnot(diff, &set->mask, &set->used); | |
374 | cpu = cpumask_first(diff); | |
375 | cpumask_set_cpu(cpu, &set->used); | |
4197344b | 376 | spin_unlock(&node_affinity.lock); |
957558c9 MH |
377 | } |
378 | ||
379 | switch (msix->type) { | |
380 | case IRQ_SDMA: | |
381 | sde->cpu = cpu; | |
382 | break; | |
383 | case IRQ_GENERAL: | |
384 | case IRQ_RCVCTXT: | |
385 | case IRQ_OTHER: | |
386 | break; | |
387 | } | |
388 | ||
389 | cpumask_set_cpu(cpu, &msix->mask); | |
390 | dd_dev_info(dd, "IRQ vector: %u, type %s %s -> cpu: %d\n", | |
391 | msix->msix.vector, irq_type_names[msix->type], | |
392 | extra, cpu); | |
393 | irq_set_affinity_hint(msix->msix.vector, &msix->mask); | |
394 | ||
395 | free_cpumask_var(diff); | |
396 | return 0; | |
397 | } | |
398 | ||
399 | void hfi1_put_irq_affinity(struct hfi1_devdata *dd, | |
400 | struct hfi1_msix_entry *msix) | |
401 | { | |
402 | struct cpu_mask_set *set = NULL; | |
403 | struct hfi1_ctxtdata *rcd; | |
4197344b DD |
404 | struct hfi1_affinity_node *entry; |
405 | ||
406 | spin_lock(&node_affinity.lock); | |
407 | entry = node_affinity_lookup(dd->node); | |
408 | spin_unlock(&node_affinity.lock); | |
957558c9 MH |
409 | |
410 | switch (msix->type) { | |
411 | case IRQ_SDMA: | |
4197344b | 412 | set = &entry->def_intr; |
957558c9 | 413 | break; |
d6373019 | 414 | case IRQ_GENERAL: |
b094a36f | 415 | /* Don't do accounting for general contexts */ |
d6373019 | 416 | break; |
957558c9 MH |
417 | case IRQ_RCVCTXT: |
418 | rcd = (struct hfi1_ctxtdata *)msix->arg; | |
d6373019 | 419 | /* Don't do accounting for control contexts */ |
957558c9 | 420 | if (rcd->ctxt != HFI1_CTRL_CTXT) |
4197344b | 421 | set = &entry->rcv_intr; |
957558c9 MH |
422 | break; |
423 | default: | |
424 | return; | |
425 | } | |
426 | ||
427 | if (set) { | |
4197344b | 428 | spin_lock(&node_affinity.lock); |
957558c9 MH |
429 | cpumask_andnot(&set->used, &set->used, &msix->mask); |
430 | if (cpumask_empty(&set->used) && set->gen) { | |
431 | set->gen--; | |
432 | cpumask_copy(&set->used, &set->mask); | |
433 | } | |
4197344b | 434 | spin_unlock(&node_affinity.lock); |
957558c9 MH |
435 | } |
436 | ||
437 | irq_set_affinity_hint(msix->msix.vector, NULL); | |
438 | cpumask_clear(&msix->mask); | |
439 | } | |
440 | ||
b094a36f SS |
441 | /* This should be called with node_affinity.lock held */ |
442 | static void find_hw_thread_mask(uint hw_thread_no, cpumask_var_t hw_thread_mask, | |
443 | struct hfi1_affinity_node_list *affinity) | |
444 | { | |
445 | int possible, curr_cpu, i; | |
446 | uint num_cores_per_socket = node_affinity.num_online_cpus / | |
447 | affinity->num_core_siblings / | |
448 | node_affinity.num_online_nodes; | |
449 | ||
450 | cpumask_copy(hw_thread_mask, &affinity->proc.mask); | |
451 | if (affinity->num_core_siblings > 0) { | |
452 | /* Removing other siblings not needed for now */ | |
453 | possible = cpumask_weight(hw_thread_mask); | |
454 | curr_cpu = cpumask_first(hw_thread_mask); | |
455 | for (i = 0; | |
456 | i < num_cores_per_socket * node_affinity.num_online_nodes; | |
457 | i++) | |
458 | curr_cpu = cpumask_next(curr_cpu, hw_thread_mask); | |
459 | ||
460 | for (; i < possible; i++) { | |
461 | cpumask_clear_cpu(curr_cpu, hw_thread_mask); | |
462 | curr_cpu = cpumask_next(curr_cpu, hw_thread_mask); | |
463 | } | |
464 | ||
465 | /* Identifying correct HW threads within physical cores */ | |
466 | cpumask_shift_left(hw_thread_mask, hw_thread_mask, | |
467 | num_cores_per_socket * | |
468 | node_affinity.num_online_nodes * | |
469 | hw_thread_no); | |
470 | } | |
471 | } | |
472 | ||
/*
 * Recommend a CPU for a user process opening a context on the device
 * whose NUMA node is @node.
 *
 * If the process already has a single-CPU affinity, that CPU is marked
 * used and returned.  If it has a restricted (but multi-CPU) affinity,
 * -1 is returned and nothing is recommended.  Otherwise a CPU is chosen
 * per the algorithm documented inline below and marked used in
 * node_affinity.proc; callers return it via hfi1_put_proc_affinity().
 *
 * Returns the chosen CPU number, or -1 if no recommendation is made.
 */
int hfi1_get_proc_affinity(int node)
{
	int cpu = -1, ret, i;
	struct hfi1_affinity_node *entry;
	cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
	const struct cpumask *node_mask,
		*proc_mask = tsk_cpus_allowed(current);
	struct hfi1_affinity_node_list *affinity = &node_affinity;
	struct cpu_mask_set *set = &affinity->proc;

	/*
	 * check whether process/context affinity has already
	 * been set
	 */
	if (cpumask_weight(proc_mask) == 1) {
		hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
			  current->pid, current->comm,
			  cpumask_pr_args(proc_mask));
		/*
		 * Mark the pre-set CPU as used. This is atomic so we don't
		 * need the lock
		 */
		cpu = cpumask_first(proc_mask);
		cpumask_set_cpu(cpu, &set->used);
		goto done;
	} else if (cpumask_weight(proc_mask) < cpumask_weight(&set->mask)) {
		hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
			  current->pid, current->comm,
			  cpumask_pr_args(proc_mask));
		goto done;
	}

	/*
	 * The process does not have a preset CPU affinity so find one to
	 * recommend using the following algorithm:
	 *
	 * For each user process that is opening a context on HFI Y:
	 *  a) If all cores are filled, reinitialize the bitmask
	 *  b) Fill real cores first, then HT cores (First set of HT
	 *     cores on all physical cores, then second set of HT core,
	 *     and, so on) in the following order:
	 *
	 *     1. Same NUMA node as HFI Y and not running an IRQ
	 *        handler
	 *     2. Same NUMA node as HFI Y and running an IRQ handler
	 *     3. Different NUMA node to HFI Y and not running an IRQ
	 *        handler
	 *     4. Different NUMA node to HFI Y and running an IRQ
	 *        handler
	 *  c) Mark core as filled in the bitmask. As user processes are
	 *     done, clear cores from the bitmask.
	 */

	ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
	if (!ret)
		goto done;
	ret = zalloc_cpumask_var(&hw_thread_mask, GFP_KERNEL);
	if (!ret)
		goto free_diff;
	ret = zalloc_cpumask_var(&available_mask, GFP_KERNEL);
	if (!ret)
		goto free_hw_thread_mask;
	ret = zalloc_cpumask_var(&intrs_mask, GFP_KERNEL);
	if (!ret)
		goto free_available_mask;

	spin_lock(&affinity->lock);
	/*
	 * If we've used all available HW threads, clear the mask and start
	 * overloading.
	 */
	if (cpumask_equal(&set->mask, &set->used)) {
		set->gen++;
		cpumask_clear(&set->used);
	}

	/*
	 * If NUMA node has CPUs used by interrupt handlers, include them in the
	 * interrupt handler mask.
	 */
	entry = node_affinity_lookup(node);
	if (entry) {
		/*
		 * gen != 0 means the set wrapped: all CPUs in .mask have
		 * hosted an IRQ at some point, so use the whole mask;
		 * otherwise only the currently-used CPUs count.
		 */
		cpumask_copy(intrs_mask, (entry->def_intr.gen ?
					  &entry->def_intr.mask :
					  &entry->def_intr.used));
		cpumask_or(intrs_mask, intrs_mask, (entry->rcv_intr.gen ?
						    &entry->rcv_intr.mask :
						    &entry->rcv_intr.used));
		cpumask_or(intrs_mask, intrs_mask, &entry->general_intr_mask);
	}
	hfi1_cdbg(PROC, "CPUs used by interrupts: %*pbl",
		  cpumask_pr_args(intrs_mask));

	cpumask_copy(hw_thread_mask, &set->mask);

	/*
	 * If HT cores are enabled, identify which HW threads within the
	 * physical cores should be used.
	 */
	if (affinity->num_core_siblings > 0) {
		for (i = 0; i < affinity->num_core_siblings; i++) {
			find_hw_thread_mask(i, hw_thread_mask, affinity);

			/*
			 * If there's at least one available core for this HW
			 * thread number, stop looking for a core.
			 *
			 * diff will always be not empty at least once in this
			 * loop as the used mask gets reset when
			 * (set->mask == set->used) before this loop.
			 */
			cpumask_andnot(diff, hw_thread_mask, &set->used);
			if (!cpumask_empty(diff))
				break;
		}
	}
	hfi1_cdbg(PROC, "Same available HW thread on all physical CPUs: %*pbl",
		  cpumask_pr_args(hw_thread_mask));

	node_mask = cpumask_of_node(node);
	hfi1_cdbg(PROC, "Device on NUMA %u, CPUs %*pbl", node,
		  cpumask_pr_args(node_mask));

	/* Get cpumask of available CPUs on preferred NUMA */
	cpumask_and(available_mask, hw_thread_mask, node_mask);
	cpumask_andnot(available_mask, available_mask, &set->used);
	hfi1_cdbg(PROC, "Available CPUs on NUMA %u: %*pbl", node,
		  cpumask_pr_args(available_mask));

	/*
	 * At first, we don't want to place processes on the same
	 * CPUs as interrupt handlers. Then, CPUs running interrupt
	 * handlers are used.
	 *
	 * 1) If diff is not empty, then there are CPUs not running
	 *    non-interrupt handlers available, so diff gets copied
	 *    over to available_mask.
	 * 2) If diff is empty, then all CPUs not running interrupt
	 *    handlers are taken, so available_mask contains all
	 *    available CPUs running interrupt handlers.
	 * 3) If available_mask is empty, then all CPUs on the
	 *    preferred NUMA node are taken, so other NUMA nodes are
	 *    used for process assignments using the same method as
	 *    the preferred NUMA node.
	 */
	cpumask_andnot(diff, available_mask, intrs_mask);
	if (!cpumask_empty(diff))
		cpumask_copy(available_mask, diff);

	/* If we don't have CPUs on the preferred node, use other NUMA nodes */
	if (cpumask_empty(available_mask)) {
		cpumask_andnot(available_mask, hw_thread_mask, &set->used);
		/* Excluding preferred NUMA cores */
		cpumask_andnot(available_mask, available_mask, node_mask);
		hfi1_cdbg(PROC,
			  "Preferred NUMA node cores are taken, cores available in other NUMA nodes: %*pbl",
			  cpumask_pr_args(available_mask));

		/*
		 * At first, we don't want to place processes on the same
		 * CPUs as interrupt handlers.
		 */
		cpumask_andnot(diff, available_mask, intrs_mask);
		if (!cpumask_empty(diff))
			cpumask_copy(available_mask, diff);
	}
	hfi1_cdbg(PROC, "Possible CPUs for process: %*pbl",
		  cpumask_pr_args(available_mask));

	cpu = cpumask_first(available_mask);
	if (cpu >= nr_cpu_ids) /* empty */
		cpu = -1;
	else
		cpumask_set_cpu(cpu, &set->used);
	spin_unlock(&affinity->lock);
	hfi1_cdbg(PROC, "Process assigned to CPU %d", cpu);

	free_cpumask_var(intrs_mask);
free_available_mask:
	free_cpumask_var(available_mask);
free_hw_thread_mask:
	free_cpumask_var(hw_thread_mask);
free_diff:
	free_cpumask_var(diff);
done:
	return cpu;
}
660 | ||
b094a36f | 661 | void hfi1_put_proc_affinity(int cpu) |
957558c9 | 662 | { |
b094a36f SS |
663 | struct hfi1_affinity_node_list *affinity = &node_affinity; |
664 | struct cpu_mask_set *set = &affinity->proc; | |
957558c9 MH |
665 | |
666 | if (cpu < 0) | |
667 | return; | |
b094a36f | 668 | spin_lock(&affinity->lock); |
957558c9 | 669 | cpumask_clear_cpu(cpu, &set->used); |
b094a36f | 670 | hfi1_cdbg(PROC, "Returning CPU %d for future process assignment", cpu); |
957558c9 MH |
671 | if (cpumask_empty(&set->used) && set->gen) { |
672 | set->gen--; | |
673 | cpumask_copy(&set->used, &set->mask); | |
674 | } | |
b094a36f | 675 | spin_unlock(&affinity->lock); |
957558c9 | 676 | } |
b14db1f0 TS |
677 | |
/*
 * Prevents concurrent reads and writes of the sdma_affinity attrib.
 * Serializes hfi1_set_sdma_affinity() against hfi1_get_sdma_affinity()
 * (the sysfs store/show pair below).
 */
static DEFINE_MUTEX(sdma_affinity_mutex);
680 | ||
681 | int hfi1_set_sdma_affinity(struct hfi1_devdata *dd, const char *buf, | |
682 | size_t count) | |
683 | { | |
684 | struct hfi1_affinity_node *entry; | |
685 | struct cpumask mask; | |
686 | int ret, i; | |
687 | ||
688 | spin_lock(&node_affinity.lock); | |
689 | entry = node_affinity_lookup(dd->node); | |
690 | spin_unlock(&node_affinity.lock); | |
691 | ||
692 | if (!entry) | |
693 | return -EINVAL; | |
694 | ||
695 | ret = cpulist_parse(buf, &mask); | |
696 | if (ret) | |
697 | return ret; | |
698 | ||
699 | if (!cpumask_subset(&mask, cpu_online_mask) || cpumask_empty(&mask)) { | |
700 | dd_dev_warn(dd, "Invalid CPU mask\n"); | |
701 | return -EINVAL; | |
702 | } | |
703 | ||
704 | mutex_lock(&sdma_affinity_mutex); | |
705 | /* reset the SDMA interrupt affinity details */ | |
706 | init_cpu_mask_set(&entry->def_intr); | |
707 | cpumask_copy(&entry->def_intr.mask, &mask); | |
708 | /* | |
709 | * Reassign the affinity for each SDMA interrupt. | |
710 | */ | |
711 | for (i = 0; i < dd->num_msix_entries; i++) { | |
712 | struct hfi1_msix_entry *msix; | |
713 | ||
714 | msix = &dd->msix_entries[i]; | |
715 | if (msix->type != IRQ_SDMA) | |
716 | continue; | |
717 | ||
718 | ret = hfi1_get_irq_affinity(dd, msix); | |
719 | ||
720 | if (ret) | |
721 | break; | |
722 | } | |
723 | ||
724 | mutex_unlock(&sdma_affinity_mutex); | |
725 | return ret ? ret : strnlen(buf, PAGE_SIZE); | |
726 | } | |
727 | ||
728 | int hfi1_get_sdma_affinity(struct hfi1_devdata *dd, char *buf) | |
729 | { | |
730 | struct hfi1_affinity_node *entry; | |
731 | ||
732 | spin_lock(&node_affinity.lock); | |
733 | entry = node_affinity_lookup(dd->node); | |
734 | spin_unlock(&node_affinity.lock); | |
735 | ||
736 | if (!entry) | |
737 | return -EINVAL; | |
738 | ||
739 | mutex_lock(&sdma_affinity_mutex); | |
740 | cpumap_print_to_pagebuf(true, buf, &entry->def_intr.mask); | |
741 | mutex_unlock(&sdma_affinity_mutex); | |
742 | return strnlen(buf, PAGE_SIZE); | |
743 | } |