Commit | Line | Data |
---|---|---|
6a8ce1ef TC |
1 | /* |
2 | * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64) | |
3 | * | |
4 | * The white paper on CRC32C calculations with PCLMULQDQ instruction can be | |
5 | * downloaded from: | |
6 | * http://download.intel.com/design/intarch/papers/323405.pdf | |
7 | * | |
8 | * Copyright (C) 2012 Intel Corporation. | |
9 | * | |
10 | * Authors: | |
11 | * Wajdi Feghali <wajdi.k.feghali@intel.com> | |
12 | * James Guilford <james.guilford@intel.com> | |
13 | * David Cote <david.m.cote@intel.com> | |
14 | * Tim Chen <tim.c.chen@linux.intel.com> | |
15 | * | |
16 | * This software is available to you under a choice of one of two | |
17 | * licenses. You may choose to be licensed under the terms of the GNU | |
18 | * General Public License (GPL) Version 2, available from the file | |
19 | * COPYING in the main directory of this source tree, or the | |
20 | * OpenIB.org BSD license below: | |
21 | * | |
22 | * Redistribution and use in source and binary forms, with or | |
23 | * without modification, are permitted provided that the following | |
24 | * conditions are met: | |
25 | * | |
26 | * - Redistributions of source code must retain the above | |
27 | * copyright notice, this list of conditions and the following | |
28 | * disclaimer. | |
29 | * | |
30 | * - Redistributions in binary form must reproduce the above | |
31 | * copyright notice, this list of conditions and the following | |
32 | * disclaimer in the documentation and/or other materials | |
33 | * provided with the distribution. | |
34 | * | |
35 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
36 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
37 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
38 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
39 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
40 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
41 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
42 | * SOFTWARE. | |
43 | */ | |
44 | ||
45 | ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction. Helper macros: LABEL prefix n emits the label prefix+n (e.g. crc_128:); JMPTBL_ENTRY i emits a 16-bit word holding crc_i - crc_array; JNC_LESS_THAN j emits jnc less_than_j | |
46 | ||
47 | .macro LABEL prefix n | |
48 | \prefix\n\(): | |
49 | .endm | |
50 | ||
51 | .macro JMPTBL_ENTRY i | |
52 | .word crc_\i - crc_array | |
53 | .endm | |
54 | ||
55 | .macro JNC_LESS_THAN j | |
56 | jnc less_than_\j | |
57 | .endm | |
58 | ||
59 | # Define threshold where buffers are considered "small" and routed to more | |
60 | # efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so | |
61 | # SMALL_SIZE can be no larger than 255 (enforced by the .if/.error check below). | |
62 | ||
63 | #define SMALL_SIZE 200 | |
64 | ||
65 | .if (SMALL_SIZE > 255) | |
66 | .error "SMALL_SIZE must be < 256" | |
67 | .endif | |
68 | ||
69 | # unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init); buffer/len/crc_init arrive in %rdi/%rsi/%rdx and the CRC is returned in %rax. NOTE(review): len is declared int but is compared as the full 64-bit %rsi - presumably callers pass it zero-extended; confirm | |
70 | ||
71 | .global crc_pcl | |
72 | crc_pcl: | |
73 | #define bufp %rdi | |
74 | #define bufp_dw %edi | |
75 | #define bufp_w %di | |
76 | #define bufp_b %dil | |
77 | #define bufptmp %rcx | |
78 | #define block_0 %rcx | |
79 | #define block_1 %rdx | |
80 | #define block_2 %r11 | |
81 | #define len %rsi | |
82 | #define len_dw %esi | |
83 | #define len_w %si | |
84 | #define len_b %sil | |
85 | #define crc_init_arg %rdx | |
86 | #define tmp %rbx | |
87 | #define crc_init %r8 | |
88 | #define crc_init_dw %r8d | |
89 | #define crc1 %r9 | |
90 | #define crc2 %r10 | |
91 | ||
92 | pushq %rbx | |
93 | pushq %rdi | |
94 | pushq %rsi | |
95 | ||
96 | ## Move crc_init for Linux to a different register: copy it out of %rdx (crc_init_arg) into %r8, because %rdx is reused below as block_1 | |
97 | mov crc_init_arg, crc_init | |
98 | ||
99 | ################################################################ | |
100 | ## 1) ALIGN: consume leading bytes until the buffer pointer is 8-byte aligned | |
101 | ################################################################ | |
102 | ||
103 | mov bufp, bufptmp # bufptmp (rcx) = buf; bufp (rdi) is reused below as the misalignment count | |
104 | neg bufp | |
105 | and $7, bufp # bufp = (-buf) & 7 = number of bytes up to | |
106 | # the next 8-byte boundary | |
107 | je proc_block # Skip if aligned | |
108 | ||
109 | ## If len is less than 8 and we're unaligned, we need to jump | |
110 | ## to special code to avoid reading beyond the end of the buffer | |
111 | cmp $8, len | |
112 | jae do_align | |
113 | # less_than_8 expects length in upper 3 bits of len_dw | |
114 | # less_than_8_post_shl1 expects length = carryflag * 4 + len_dw[31:30] | |
115 | shl $32-3+1, len_dw | |
116 | jmp less_than_8_post_shl1 | |
117 | ||
118 | do_align: | |
119 | #### Calculate CRC of unaligned bytes of the buffer (if any) | |
120 | movq (bufptmp), tmp # load a quadword from the buffer (len >= 8 here, so the 8-byte read stays in bounds) | |
121 | add bufp, bufptmp # align buffer pointer for quadword | |
122 | # processing | |
123 | sub bufp, len # update buffer length | |
124 | align_loop: | |
125 | crc32b %bl, crc_init_dw # compute crc32 of 1-byte | |
126 | shr $8, tmp # get next byte | |
127 | dec bufp | |
128 | jne align_loop | |
129 | ||
130 | proc_block: | |
131 | ||
132 | ################################################################ | |
133 | ## 2) PROCESS BLOCKS: | |
134 | ################################################################ | |
135 | ||
136 | ## compute num of bytes to be processed | |
137 | movq len, tmp # save num bytes in tmp | |
138 | ||
139 | cmpq $128*24, len | |
140 | jae full_block | |
141 | ||
142 | continue_block: | |
143 | cmpq $SMALL_SIZE, len | |
144 | jb small | |
145 | ||
146 | ## len < 128*24 | |
147 | movq $2731, %rax # 2731 = ceil(2^16 / 24) | |
148 | mul len_dw | |
149 | shrq $16, %rax | |
150 | ||
151 | ## eax contains floor(bytes / 24) = num 24-byte chunks to do | |
152 | ||
153 | ## process rax 24-byte chunks (128 >= rax >= 0) | |
154 | ||
155 | ## compute end address of each block | |
156 | ## block 0 (base addr + RAX * 8) | |
157 | ## block 1 (base addr + RAX * 16) | |
158 | ## block 2 (base addr + RAX * 24) | |
159 | lea (bufptmp, %rax, 8), block_0 | |
160 | lea (block_0, %rax, 8), block_1 | |
161 | lea (block_1, %rax, 8), block_2 | |
162 | ||
163 | xor crc1, crc1 | |
164 | xor crc2, crc2 | |
165 | ||
166 | ## branch into array: jump_table[rax] is the 16-bit offset of crc_<rax> from crc_array, so the target is crc_array + that offset | |
167 | lea jump_table(%rip), bufp | |
168 | movzxw (bufp, %rax, 2), len | |
169 | offset=crc_array-jump_table | |
170 | lea offset(bufp, len, 1), bufp | |
171 | jmp *bufp | |
172 | ||
173 | ################################################################ | |
174 | ## 2a) PROCESS FULL BLOCKS: | |
175 | ################################################################ | |
176 | full_block: | |
177 | movq $128,%rax | |
178 | lea 128*8*2(block_0), block_1 | |
179 | lea 128*8*3(block_0), block_2 | |
180 | add $128*8*1, block_0 | |
181 | ||
182 | xor crc1,crc1 | |
183 | xor crc2,crc2 | |
184 | ||
185 | # Fall through into top of crc array (crc_128) | |
186 | ||
187 | ################################################################ | |
188 | ## 3) CRC Array: unrolled body with three separate crc32q accumulators (crc_init, crc1, crc2); entering at crc_n runs the last n steps | |
189 | ################################################################ | |
190 | ||
191 | crc_array: | |
192 | i=128 | |
193 | .rept 128-1 | |
194 | .altmacro | |
195 | LABEL crc_ %i | |
196 | .noaltmacro | |
197 | crc32q -i*8(block_0), crc_init | |
198 | crc32q -i*8(block_1), crc1 | |
199 | crc32q -i*8(block_2), crc2 | |
200 | i=(i-1) | |
201 | .endr | |
202 | ||
203 | .altmacro | |
204 | LABEL crc_ %i | |
205 | .noaltmacro | |
206 | crc32q -i*8(block_0), crc_init | |
207 | crc32q -i*8(block_1), crc1 | |
208 | # SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet, the last quadword of block 2 is xor-ed into the combined value in step 4 | |
209 | ||
210 | mov block_2, block_0 | |
211 | ||
212 | ################################################################ | |
213 | ## 4) Combine three results: | |
214 | ################################################################ | |
215 | ||
216 | lea (K_table-16)(%rip), bufp # first entry is for idx 1 | |
217 | shlq $3, %rax # rax *= 8 | |
218 | subq %rax, tmp # tmp -= rax*8 | |
219 | shlq $1, %rax | |
220 | subq %rax, tmp # tmp -= rax*16 | |
221 | # (total tmp -= rax*24) | |
222 | addq %rax, bufp | |
223 | ||
224 | movdqa (bufp), %xmm0 # 2 consts: K1:K2 | |
225 | ||
226 | movq crc_init, %xmm1 # CRC for block 1 | |
227 | pclmulqdq $0x00,%xmm0,%xmm1 # Multiply by K2 | |
228 | ||
229 | movq crc1, %xmm2 # CRC for block 2 | |
230 | pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1 | |
231 | ||
232 | pxor %xmm2,%xmm1 | |
233 | movq %xmm1, %rax | |
234 | xor -i*8(block_2), %rax | |
235 | mov crc2, crc_init | |
236 | crc32 %rax, crc_init | |
237 | ||
238 | ################################################################ | |
239 | ## 5) Check for end: | |
240 | ################################################################ | |
241 | ||
242 | LABEL crc_ 0 | |
243 | mov tmp, len | |
244 | cmp $128*24, tmp | |
245 | jae full_block | |
246 | cmp $24, tmp | |
247 | jae continue_block | |
248 | ||
249 | less_than_24: | |
250 | shl $32-4, len_dw # less_than_16 expects length | |
251 | # in upper 4 bits of len_dw | |
252 | jnc less_than_16 | |
253 | crc32q (bufptmp), crc_init | |
254 | crc32q 8(bufptmp), crc_init | |
255 | jz do_return | |
256 | add $16, bufptmp | |
257 | # len is less than 8 if we got here | |
258 | # less_than_8 expects length in upper 3 bits of len_dw | |
259 | # less_than_8_post_shl1 expects length = carryflag * 4 + len_dw[31:30] | |
260 | shl $2, len_dw | |
261 | jmp less_than_8_post_shl1 | |
262 | ||
263 | ####################################################################### | |
264 | ## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full) | |
265 | ####################################################################### | |
266 | small: | |
267 | shl $32-8, len_dw # Prepare len_dw for less_than_256 | |
268 | j=256 | |
269 | .rept 5 # j = {256, 128, 64, 32, 16} | |
270 | .altmacro | |
271 | LABEL less_than_ %j # less_than_j: Length should be in | |
272 | # upper lg(j) bits of len_dw | |
273 | j=(j/2) | |
274 | shl $1, len_dw # Get next MSB | |
275 | JNC_LESS_THAN %j | |
276 | .noaltmacro | |
277 | i=0 | |
278 | .rept (j/8) | |
279 | crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data | |
280 | i=i+8 | |
281 | .endr | |
282 | jz do_return # Return if remaining length is zero | |
283 | add $j, bufptmp # Advance buf | |
284 | .endr | |
285 | ||
286 | less_than_8: # Length should be stored in | |
287 | # upper 3 bits of len_dw | |
288 | shl $1, len_dw | |
289 | less_than_8_post_shl1: | |
290 | jnc less_than_4 | |
291 | crc32l (bufptmp), crc_init_dw # CRC of 4 bytes | |
292 | jz do_return # return if remaining data is zero | |
293 | add $4, bufptmp | |
294 | less_than_4: # Length should be stored in | |
295 | # upper 2 bits of len_dw | |
296 | shl $1, len_dw | |
297 | jnc less_than_2 | |
298 | crc32w (bufptmp), crc_init_dw # CRC of 2 bytes | |
299 | jz do_return # return if remaining data is zero | |
300 | add $2, bufptmp | |
301 | less_than_2: # Length should be stored in the MSB | |
302 | # of len_dw | |
303 | shl $1, len_dw | |
304 | jnc less_than_1 | |
305 | crc32b (bufptmp), crc_init_dw # CRC of 1 byte | |
306 | less_than_1: # Length should be zero | |
307 | do_return: | |
308 | movq crc_init, %rax | |
309 | popq %rsi | |
310 | popq %rdi | |
311 | popq %rbx | |
312 | ret | |
313 | ||
314 | ################################################################ | |
315 | ## jump table: 129 entries (i = 0..128) x 2 bytes each; entry i is the 16-bit offset crc_i - crc_array, which continue_block loads with movzxw and adds to the address of crc_array before the indirect jmp | |
316 | ################################################################ | |
317 | .align 4 | |
318 | jump_table: | |
319 | i=0 | |
320 | .rept 129 | |
321 | .altmacro | |
322 | JMPTBL_ENTRY %i | |
323 | .noaltmacro | |
324 | i=i+1 | |
325 | .endr | |
326 | ################################################################ | |
327 | ## PCLMULQDQ tables | |
328 | ## Table is 128 entries x 2 quad words each; the entry for n chunks (n = 1..128) sits at K_table + (n-1)*16. The constants are only ever read, so they are placed in .rodata rather than writable .data | |
329 | ################################################################ | |
330 | .section .rodata | |
331 | .align 64 | |
332 | K_table: | |
333 | .quad 0x14cd00bd6,0x105ec76f0 | |
334 | .quad 0x0ba4fc28e,0x14cd00bd6 | |
335 | .quad 0x1d82c63da,0x0f20c0dfe | |
336 | .quad 0x09e4addf8,0x0ba4fc28e | |
337 | .quad 0x039d3b296,0x1384aa63a | |
338 | .quad 0x102f9b8a2,0x1d82c63da | |
339 | .quad 0x14237f5e6,0x01c291d04 | |
340 | .quad 0x00d3b6092,0x09e4addf8 | |
341 | .quad 0x0c96cfdc0,0x0740eef02 | |
342 | .quad 0x18266e456,0x039d3b296 | |
343 | .quad 0x0daece73e,0x0083a6eec | |
344 | .quad 0x0ab7aff2a,0x102f9b8a2 | |
345 | .quad 0x1248ea574,0x1c1733996 | |
346 | .quad 0x083348832,0x14237f5e6 | |
347 | .quad 0x12c743124,0x02ad91c30 | |
348 | .quad 0x0b9e02b86,0x00d3b6092 | |
349 | .quad 0x018b33a4e,0x06992cea2 | |
350 | .quad 0x1b331e26a,0x0c96cfdc0 | |
351 | .quad 0x17d35ba46,0x07e908048 | |
352 | .quad 0x1bf2e8b8a,0x18266e456 | |
353 | .quad 0x1a3e0968a,0x11ed1f9d8 | |
354 | .quad 0x0ce7f39f4,0x0daece73e | |
355 | .quad 0x061d82e56,0x0f1d0f55e | |
356 | .quad 0x0d270f1a2,0x0ab7aff2a | |
357 | .quad 0x1c3f5f66c,0x0a87ab8a8 | |
358 | .quad 0x12ed0daac,0x1248ea574 | |
359 | .quad 0x065863b64,0x08462d800 | |
360 | .quad 0x11eef4f8e,0x083348832 | |
361 | .quad 0x1ee54f54c,0x071d111a8 | |
362 | .quad 0x0b3e32c28,0x12c743124 | |
363 | .quad 0x0064f7f26,0x0ffd852c6 | |
364 | .quad 0x0dd7e3b0c,0x0b9e02b86 | |
365 | .quad 0x0f285651c,0x0dcb17aa4 | |
366 | .quad 0x010746f3c,0x018b33a4e | |
367 | .quad 0x1c24afea4,0x0f37c5aee | |
368 | .quad 0x0271d9844,0x1b331e26a | |
369 | .quad 0x08e766a0c,0x06051d5a2 | |
370 | .quad 0x093a5f730,0x17d35ba46 | |
371 | .quad 0x06cb08e5c,0x11d5ca20e | |
372 | .quad 0x06b749fb2,0x1bf2e8b8a | |
373 | .quad 0x1167f94f2,0x021f3d99c | |
374 | .quad 0x0cec3662e,0x1a3e0968a | |
375 | .quad 0x19329634a,0x08f158014 | |
376 | .quad 0x0e6fc4e6a,0x0ce7f39f4 | |
377 | .quad 0x08227bb8a,0x1a5e82106 | |
378 | .quad 0x0b0cd4768,0x061d82e56 | |
379 | .quad 0x13c2b89c4,0x188815ab2 | |
380 | .quad 0x0d7a4825c,0x0d270f1a2 | |
381 | .quad 0x10f5ff2ba,0x105405f3e | |
382 | .quad 0x00167d312,0x1c3f5f66c | |
383 | .quad 0x0f6076544,0x0e9adf796 | |
384 | .quad 0x026f6a60a,0x12ed0daac | |
385 | .quad 0x1a2adb74e,0x096638b34 | |
386 | .quad 0x19d34af3a,0x065863b64 | |
387 | .quad 0x049c3cc9c,0x1e50585a0 | |
388 | .quad 0x068bce87a,0x11eef4f8e | |
389 | .quad 0x1524fa6c6,0x19f1c69dc | |
390 | .quad 0x16cba8aca,0x1ee54f54c | |
391 | .quad 0x042d98888,0x12913343e | |
392 | .quad 0x1329d9f7e,0x0b3e32c28 | |
393 | .quad 0x1b1c69528,0x088f25a3a | |
394 | .quad 0x02178513a,0x0064f7f26 | |
395 | .quad 0x0e0ac139e,0x04e36f0b0 | |
396 | .quad 0x0170076fa,0x0dd7e3b0c | |
397 | .quad 0x141a1a2e2,0x0bd6f81f8 | |
398 | .quad 0x16ad828b4,0x0f285651c | |
399 | .quad 0x041d17b64,0x19425cbba | |
400 | .quad 0x1fae1cc66,0x010746f3c | |
401 | .quad 0x1a75b4b00,0x18db37e8a | |
402 | .quad 0x0f872e54c,0x1c24afea4 | |
403 | .quad 0x01e41e9fc,0x04c144932 | |
404 | .quad 0x086d8e4d2,0x0271d9844 | |
405 | .quad 0x160f7af7a,0x052148f02 | |
406 | .quad 0x05bb8f1bc,0x08e766a0c | |
407 | .quad 0x0a90fd27a,0x0a3c6f37a | |
408 | .quad 0x0b3af077a,0x093a5f730 | |
409 | .quad 0x04984d782,0x1d22c238e | |
410 | .quad 0x0ca6ef3ac,0x06cb08e5c | |
411 | .quad 0x0234e0b26,0x063ded06a | |
412 | .quad 0x1d88abd4a,0x06b749fb2 | |
413 | .quad 0x04597456a,0x04d56973c | |
414 | .quad 0x0e9e28eb4,0x1167f94f2 | |
415 | .quad 0x07b3ff57a,0x19385bf2e | |
416 | .quad 0x0c9c8b782,0x0cec3662e | |
417 | .quad 0x13a9cba9e,0x0e417f38a | |
418 | .quad 0x093e106a4,0x19329634a | |
419 | .quad 0x167001a9c,0x14e727980 | |
420 | .quad 0x1ddffc5d4,0x0e6fc4e6a | |
421 | .quad 0x00df04680,0x0d104b8fc | |
422 | .quad 0x02342001e,0x08227bb8a | |
423 | .quad 0x00a2a8d7e,0x05b397730 | |
424 | .quad 0x168763fa6,0x0b0cd4768 | |
425 | .quad 0x1ed5a407a,0x0e78eb416 | |
426 | .quad 0x0d2c3ed1a,0x13c2b89c4 | |
427 | .quad 0x0995a5724,0x1641378f0 | |
428 | .quad 0x19b1afbc4,0x0d7a4825c | |
429 | .quad 0x109ffedc0,0x08d96551c | |
430 | .quad 0x0f2271e60,0x10f5ff2ba | |
431 | .quad 0x00b0bf8ca,0x00bf80dd2 | |
432 | .quad 0x123888b7a,0x00167d312 | |
433 | .quad 0x1e888f7dc,0x18dcddd1c | |
434 | .quad 0x002ee03b2,0x0f6076544 | |
435 | .quad 0x183e8d8fe,0x06a45d2b2 | |
436 | .quad 0x133d7a042,0x026f6a60a | |
437 | .quad 0x116b0f50c,0x1dd3e10e8 | |
438 | .quad 0x05fabe670,0x1a2adb74e | |
439 | .quad 0x130004488,0x0de87806c | |
440 | .quad 0x000bcf5f6,0x19d34af3a | |
441 | .quad 0x18f0c7078,0x014338754 | |
442 | .quad 0x017f27698,0x049c3cc9c | |
443 | .quad 0x058ca5f00,0x15e3e77ee | |
444 | .quad 0x1af900c24,0x068bce87a | |
445 | .quad 0x0b5cfca28,0x0dd07448e | |
446 | .quad 0x0ded288f8,0x1524fa6c6 | |
447 | .quad 0x059f229bc,0x1d8048348 | |
448 | .quad 0x06d390dec,0x16cba8aca | |
449 | .quad 0x037170390,0x0a3e3e02c | |
450 | .quad 0x06353c1cc,0x042d98888 | |
451 | .quad 0x0c4584f5c,0x0d73c7bea | |
452 | .quad 0x1f16a3418,0x1329d9f7e | |
453 | .quad 0x0531377e2,0x185137662 | |
454 | .quad 0x1d8d9ca7c,0x1b1c69528 | |
455 | .quad 0x0b25b29f2,0x18a08b5bc | |
456 | .quad 0x19fb2a8b0,0x02178513a | |
457 | .quad 0x1a08fe6ac,0x1da758ae0 | |
458 | .quad 0x045cddf4e,0x0e0ac139e | |
459 | .quad 0x1a91647f2,0x169cf9eb0 | |
460 | .quad 0x1a0f717c4,0x0170076fa |