Commit | Line | Data |
---|---|---|
ea4d26ae JK |
1 | #ifndef _ASM_X86_XOR_AVX_H |
2 | #define _ASM_X86_XOR_AVX_H | |
3 | ||
4 | /* | |
5 | * Optimized RAID-5 checksumming functions for AVX | |
6 | * | |
7 | * Copyright (C) 2012 Intel Corporation | |
8 | * Author: Jim Kukunas <james.t.kukunas@linux.intel.com> | |
9 | * | |
10 | * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines | |
11 | * | |
12 | * This program is free software; you can redistribute it and/or | |
13 | * modify it under the terms of the GNU General Public License | |
14 | * as published by the Free Software Foundation; version 2 | |
15 | * of the License. | |
16 | */ | |
17 | ||
18 | #ifdef CONFIG_AS_AVX | |
19 | ||
20 | #include <linux/compiler.h> | |
df6b35f4 | 21 | #include <asm/fpu/api.h> |
ea4d26ae | 22 | |
ea4d26ae JK |
/*
 * BLOCK4(i) - expand four consecutive BLOCK() steps.
 *
 * @i is an index in 32-byte units.  The four steps cover byte offsets
 * 32 * i .. 32 * (i + 3) and are tagged with register numbers 0..3.
 *
 * BLOCK(offset, reg) is not defined here; each user defines it before
 * expanding these macros.  The invocations below are not separated by
 * ';', so BLOCK() has to bring its own statement terminator.
 *
 * @i is parenthesized so that an expression argument such as (a + b)
 * still produces the intended offsets.
 */
#define BLOCK4(i) \
		BLOCK(32 * (i), 0) \
		BLOCK(32 * ((i) + 1), 1) \
		BLOCK(32 * ((i) + 2), 2) \
		BLOCK(32 * ((i) + 3), 3)

/*
 * BLOCK16() - expand sixteen BLOCK() steps (four BLOCK4() groups),
 * i.e. byte offsets 0 .. 480 in steps of 32: 512 bytes per expansion.
 */
#define BLOCK16() \
		BLOCK4(0) \
		BLOCK4(4) \
		BLOCK4(8) \
		BLOCK4(12)
34 | ||
35 | static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1) | |
36 | { | |
841e3604 | 37 | unsigned long lines = bytes >> 9; |
ea4d26ae | 38 | |
841e3604 | 39 | kernel_fpu_begin(); |
ea4d26ae JK |
40 | |
41 | while (lines--) { | |
42 | #undef BLOCK | |
43 | #define BLOCK(i, reg) \ | |
44 | do { \ | |
45 | asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \ | |
46 | asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ | |
47 | "m" (p0[i / sizeof(*p0)])); \ | |
48 | asm volatile("vmovdqa %%ymm" #reg ", %0" : \ | |
49 | "=m" (p0[i / sizeof(*p0)])); \ | |
50 | } while (0); | |
51 | ||
52 | BLOCK16() | |
53 | ||
54 | p0 = (unsigned long *)((uintptr_t)p0 + 512); | |
55 | p1 = (unsigned long *)((uintptr_t)p1 + 512); | |
56 | } | |
57 | ||
841e3604 | 58 | kernel_fpu_end(); |
ea4d26ae JK |
59 | } |
60 | ||
61 | static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1, | |
62 | unsigned long *p2) | |
63 | { | |
841e3604 | 64 | unsigned long lines = bytes >> 9; |
ea4d26ae | 65 | |
841e3604 | 66 | kernel_fpu_begin(); |
ea4d26ae JK |
67 | |
68 | while (lines--) { | |
69 | #undef BLOCK | |
70 | #define BLOCK(i, reg) \ | |
71 | do { \ | |
72 | asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \ | |
73 | asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ | |
74 | "m" (p1[i / sizeof(*p1)])); \ | |
75 | asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ | |
76 | "m" (p0[i / sizeof(*p0)])); \ | |
77 | asm volatile("vmovdqa %%ymm" #reg ", %0" : \ | |
78 | "=m" (p0[i / sizeof(*p0)])); \ | |
79 | } while (0); | |
80 | ||
81 | BLOCK16() | |
82 | ||
83 | p0 = (unsigned long *)((uintptr_t)p0 + 512); | |
84 | p1 = (unsigned long *)((uintptr_t)p1 + 512); | |
85 | p2 = (unsigned long *)((uintptr_t)p2 + 512); | |
86 | } | |
87 | ||
841e3604 | 88 | kernel_fpu_end(); |
ea4d26ae JK |
89 | } |
90 | ||
91 | static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1, | |
92 | unsigned long *p2, unsigned long *p3) | |
93 | { | |
841e3604 | 94 | unsigned long lines = bytes >> 9; |
ea4d26ae | 95 | |
841e3604 | 96 | kernel_fpu_begin(); |
ea4d26ae JK |
97 | |
98 | while (lines--) { | |
99 | #undef BLOCK | |
100 | #define BLOCK(i, reg) \ | |
101 | do { \ | |
102 | asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \ | |
103 | asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ | |
104 | "m" (p2[i / sizeof(*p2)])); \ | |
105 | asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ | |
106 | "m" (p1[i / sizeof(*p1)])); \ | |
107 | asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ | |
108 | "m" (p0[i / sizeof(*p0)])); \ | |
109 | asm volatile("vmovdqa %%ymm" #reg ", %0" : \ | |
110 | "=m" (p0[i / sizeof(*p0)])); \ | |
111 | } while (0); | |
112 | ||
113 | BLOCK16(); | |
114 | ||
115 | p0 = (unsigned long *)((uintptr_t)p0 + 512); | |
116 | p1 = (unsigned long *)((uintptr_t)p1 + 512); | |
117 | p2 = (unsigned long *)((uintptr_t)p2 + 512); | |
118 | p3 = (unsigned long *)((uintptr_t)p3 + 512); | |
119 | } | |
120 | ||
841e3604 | 121 | kernel_fpu_end(); |
ea4d26ae JK |
122 | } |
123 | ||
124 | static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1, | |
125 | unsigned long *p2, unsigned long *p3, unsigned long *p4) | |
126 | { | |
841e3604 | 127 | unsigned long lines = bytes >> 9; |
ea4d26ae | 128 | |
841e3604 | 129 | kernel_fpu_begin(); |
ea4d26ae JK |
130 | |
131 | while (lines--) { | |
132 | #undef BLOCK | |
133 | #define BLOCK(i, reg) \ | |
134 | do { \ | |
135 | asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \ | |
136 | asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ | |
137 | "m" (p3[i / sizeof(*p3)])); \ | |
138 | asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ | |
139 | "m" (p2[i / sizeof(*p2)])); \ | |
140 | asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ | |
141 | "m" (p1[i / sizeof(*p1)])); \ | |
142 | asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ | |
143 | "m" (p0[i / sizeof(*p0)])); \ | |
144 | asm volatile("vmovdqa %%ymm" #reg ", %0" : \ | |
145 | "=m" (p0[i / sizeof(*p0)])); \ | |
146 | } while (0); | |
147 | ||
148 | BLOCK16() | |
149 | ||
150 | p0 = (unsigned long *)((uintptr_t)p0 + 512); | |
151 | p1 = (unsigned long *)((uintptr_t)p1 + 512); | |
152 | p2 = (unsigned long *)((uintptr_t)p2 + 512); | |
153 | p3 = (unsigned long *)((uintptr_t)p3 + 512); | |
154 | p4 = (unsigned long *)((uintptr_t)p4 + 512); | |
155 | } | |
156 | ||
841e3604 | 157 | kernel_fpu_end(); |
ea4d26ae JK |
158 | } |
159 | ||
160 | static struct xor_block_template xor_block_avx = { | |
161 | .name = "avx", | |
162 | .do_2 = xor_avx_2, | |
163 | .do_3 = xor_avx_3, | |
164 | .do_4 = xor_avx_4, | |
165 | .do_5 = xor_avx_5, | |
166 | }; | |
167 | ||
/*
 * AVX_XOR_SPEED - pass the AVX template to xor_speed() when both
 * cpu_has_avx and cpu_has_osxsave are set; otherwise do nothing.
 * Expands to a single statement; the user supplies the trailing ';'.
 */
#define AVX_XOR_SPEED \
do { \
	if (cpu_has_avx && cpu_has_osxsave) \
		xor_speed(&xor_block_avx); \
} while (0)

/*
 * AVX_SELECT - evaluate to &xor_block_avx when both cpu_has_avx and
 * cpu_has_osxsave are set, otherwise to FASTEST.
 *
 * FASTEST is parenthesized so that any expression may be passed, in line
 * with the definition used when CONFIG_AS_AVX is not set.
 */
#define AVX_SELECT(FASTEST) \
	((cpu_has_avx && cpu_has_osxsave) ? &xor_block_avx : (FASTEST))
ea4d26ae JK |
176 | |
177 | #else | |
178 | ||
/*
 * CONFIG_AS_AVX is not set: there is no AVX template to benchmark, so
 * AVX_XOR_SPEED is an empty statement.  It uses the do { } while (0) form
 * (rather than a bare {}) so that "AVX_XOR_SPEED;" stays a single
 * statement, e.g. as the body of an if that has an else.
 */
#define AVX_XOR_SPEED do { } while (0)

/* Without the AVX template the selection is simply the caller's FASTEST. */
#define AVX_SELECT(FASTEST) (FASTEST)
182 | ||
183 | #endif | |
184 | #endif |