Commit | Line | Data |
---|---|---|
e126ba97 | 1 | /* |
302bdf68 | 2 | * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. |
e126ba97 EC |
3 | * |
4 | * This software is available to you under a choice of one of two | |
5 | * licenses. You may choose to be licensed under the terms of the GNU | |
6 | * General Public License (GPL) Version 2, available from the file | |
7 | * COPYING in the main directory of this source tree, or the | |
8 | * OpenIB.org BSD license below: | |
9 | * | |
10 | * Redistribution and use in source and binary forms, with or | |
11 | * without modification, are permitted provided that the following | |
12 | * conditions are met: | |
13 | * | |
14 | * - Redistributions of source code must retain the above | |
15 | * copyright notice, this list of conditions and the following | |
16 | * disclaimer. | |
17 | * | |
18 | * - Redistributions in binary form must reproduce the above | |
19 | * copyright notice, this list of conditions and the following | |
20 | * disclaimer in the documentation and/or other materials | |
21 | * provided with the distribution. | |
22 | * | |
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
30 | * SOFTWARE. | |
31 | */ | |
32 | ||
33 | #include <linux/kernel.h> | |
34 | #include <linux/module.h> | |
35 | #include <linux/random.h> | |
36 | #include <linux/vmalloc.h> | |
37 | #include <linux/mlx5/driver.h> | |
38 | #include <linux/mlx5/cmd.h> | |
39 | #include "mlx5_core.h" | |
40 | ||
41 | enum { | |
42 | MLX5_HEALTH_POLL_INTERVAL = 2 * HZ, | |
43 | MAX_MISSES = 3, | |
44 | }; | |
45 | ||
/* Syndrome codes found in the health buffer; decoded by hsynd_str(). */
enum {
	MLX5_HEALTH_SYNDR_FW_ERR		= 0x1,	/* firmware internal error */
	MLX5_HEALTH_SYNDR_IRISC_ERR		= 0x7,	/* irisc not responding */
	MLX5_HEALTH_SYNDR_CRC_ERR		= 0x9,	/* firmware CRC error */
	MLX5_HEALTH_SYNDR_FETCH_PCI_ERR		= 0xa,	/* ICM fetch PCI error */
	MLX5_HEALTH_SYNDR_HW_FTL_ERR		= 0xb,	/* HW fatal error */
	MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR	= 0xc,	/* async EQ buffer overrun */
	MLX5_HEALTH_SYNDR_EQ_ERR		= 0xd,	/* EQ error */
	MLX5_HEALTH_SYNDR_FFSER_ERR		= 0xf,	/* FFSER error */
};
56 | ||
57 | static DEFINE_SPINLOCK(health_lock); | |
e126ba97 EC |
58 | static LIST_HEAD(health_list); |
59 | static struct work_struct health_work; | |
60 | ||
e126ba97 EC |
61 | static void health_care(struct work_struct *work) |
62 | { | |
63 | struct mlx5_core_health *health, *n; | |
64 | struct mlx5_core_dev *dev; | |
65 | struct mlx5_priv *priv; | |
66 | LIST_HEAD(tlist); | |
67 | ||
68 | spin_lock_irq(&health_lock); | |
69 | list_splice_init(&health_list, &tlist); | |
70 | ||
71 | spin_unlock_irq(&health_lock); | |
72 | ||
73 | list_for_each_entry_safe(health, n, &tlist, list) { | |
74 | priv = container_of(health, struct mlx5_priv, health); | |
75 | dev = container_of(priv, struct mlx5_core_dev, priv); | |
76 | mlx5_core_warn(dev, "handling bad device here\n"); | |
7d46daba | 77 | /* nothing yet */ |
e126ba97 | 78 | spin_lock_irq(&health_lock); |
e126ba97 EC |
79 | list_del_init(&health->list); |
80 | spin_unlock_irq(&health_lock); | |
81 | } | |
82 | } | |
83 | ||
84 | static const char *hsynd_str(u8 synd) | |
85 | { | |
86 | switch (synd) { | |
87 | case MLX5_HEALTH_SYNDR_FW_ERR: | |
88 | return "firmware internal error"; | |
89 | case MLX5_HEALTH_SYNDR_IRISC_ERR: | |
90 | return "irisc not responding"; | |
91 | case MLX5_HEALTH_SYNDR_CRC_ERR: | |
92 | return "firmware CRC error"; | |
93 | case MLX5_HEALTH_SYNDR_FETCH_PCI_ERR: | |
94 | return "ICM fetch PCI error"; | |
95 | case MLX5_HEALTH_SYNDR_HW_FTL_ERR: | |
96 | return "HW fatal error\n"; | |
97 | case MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR: | |
98 | return "async EQ buffer overrun"; | |
99 | case MLX5_HEALTH_SYNDR_EQ_ERR: | |
100 | return "EQ error"; | |
101 | case MLX5_HEALTH_SYNDR_FFSER_ERR: | |
102 | return "FFSER error"; | |
103 | default: | |
104 | return "unrecognized error"; | |
105 | } | |
106 | } | |
107 | ||
582c016e RD |
108 | static u16 read_be16(__be16 __iomem *p) |
109 | { | |
110 | return swab16(readl((__force u16 __iomem *) p)); | |
111 | } | |
112 | ||
113 | static u32 read_be32(__be32 __iomem *p) | |
114 | { | |
115 | return swab32(readl((__force u32 __iomem *) p)); | |
116 | } | |
117 | ||
e126ba97 EC |
118 | static void print_health_info(struct mlx5_core_dev *dev) |
119 | { | |
120 | struct mlx5_core_health *health = &dev->priv.health; | |
121 | struct health_buffer __iomem *h = health->health; | |
122 | int i; | |
123 | ||
124 | for (i = 0; i < ARRAY_SIZE(h->assert_var); i++) | |
582c016e RD |
125 | pr_info("assert_var[%d] 0x%08x\n", i, read_be32(h->assert_var + i)); |
126 | ||
127 | pr_info("assert_exit_ptr 0x%08x\n", read_be32(&h->assert_exit_ptr)); | |
128 | pr_info("assert_callra 0x%08x\n", read_be32(&h->assert_callra)); | |
129 | pr_info("fw_ver 0x%08x\n", read_be32(&h->fw_ver)); | |
130 | pr_info("hw_id 0x%08x\n", read_be32(&h->hw_id)); | |
131 | pr_info("irisc_index %d\n", readb(&h->irisc_index)); | |
132 | pr_info("synd 0x%x: %s\n", readb(&h->synd), hsynd_str(readb(&h->synd))); | |
133 | pr_info("ext_sync 0x%04x\n", read_be16(&h->ext_sync)); | |
e126ba97 EC |
134 | } |
135 | ||
136 | static void poll_health(unsigned long data) | |
137 | { | |
138 | struct mlx5_core_dev *dev = (struct mlx5_core_dev *)data; | |
139 | struct mlx5_core_health *health = &dev->priv.health; | |
140 | unsigned long next; | |
141 | u32 count; | |
142 | ||
143 | count = ioread32be(health->health_counter); | |
144 | if (count == health->prev) | |
145 | ++health->miss_counter; | |
146 | else | |
147 | health->miss_counter = 0; | |
148 | ||
149 | health->prev = count; | |
150 | if (health->miss_counter == MAX_MISSES) { | |
151 | mlx5_core_err(dev, "device's health compromised\n"); | |
152 | print_health_info(dev); | |
153 | spin_lock_irq(&health_lock); | |
154 | list_add_tail(&health->list, &health_list); | |
155 | spin_unlock_irq(&health_lock); | |
156 | ||
157 | queue_work(mlx5_core_wq, &health_work); | |
158 | } else { | |
159 | get_random_bytes(&next, sizeof(next)); | |
160 | next %= HZ; | |
161 | next += jiffies + MLX5_HEALTH_POLL_INTERVAL; | |
162 | mod_timer(&health->timer, next); | |
163 | } | |
164 | } | |
165 | ||
166 | void mlx5_start_health_poll(struct mlx5_core_dev *dev) | |
167 | { | |
168 | struct mlx5_core_health *health = &dev->priv.health; | |
169 | ||
170 | INIT_LIST_HEAD(&health->list); | |
171 | init_timer(&health->timer); | |
172 | health->health = &dev->iseg->health; | |
173 | health->health_counter = &dev->iseg->health_counter; | |
174 | ||
175 | health->timer.data = (unsigned long)dev; | |
176 | health->timer.function = poll_health; | |
177 | health->timer.expires = round_jiffies(jiffies + MLX5_HEALTH_POLL_INTERVAL); | |
178 | add_timer(&health->timer); | |
179 | } | |
180 | ||
181 | void mlx5_stop_health_poll(struct mlx5_core_dev *dev) | |
182 | { | |
183 | struct mlx5_core_health *health = &dev->priv.health; | |
184 | ||
185 | del_timer_sync(&health->timer); | |
186 | ||
187 | spin_lock_irq(&health_lock); | |
188 | if (!list_empty(&health->list)) | |
189 | list_del_init(&health->list); | |
190 | spin_unlock_irq(&health_lock); | |
191 | } | |
192 | ||
/* Counterpart of mlx5_health_init(); currently has nothing to release. */
void mlx5_health_cleanup(void)
{
	/* intentionally empty */
}
196 | ||
197 | void __init mlx5_health_init(void) | |
198 | { | |
199 | INIT_WORK(&health_work, health_care); | |
200 | } |