[deliverable/linux.git] / arch / x86 / include / asm / xor_avx.h

#ifndef _ASM_X86_XOR_AVX_H
#define _ASM_X86_XOR_AVX_H

/*
 * Optimized RAID-5 checksumming functions for AVX
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
 *
 * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; version 2
 * of the License.
 */

#ifdef CONFIG_AS_AVX

#include <linux/compiler.h>
#include <asm/fpu/api.h>

/*
 * BLOCK4(i) - expand BLOCK four times, for the vectors with indices
 * i, i + 1, i + 2 and i + 3, using register numbers 0, 1, 2 and 3.
 *
 * BLOCK(offset, reg) is supplied by the code that expands this macro; the
 * offset passed to it is 32 times the vector index. The argument i is
 * parenthesized so that an expression argument such as BLOCK4(a + b) still
 * produces the intended offsets.
 */
#define BLOCK4(i) \
		BLOCK(32 * (i), 0) \
		BLOCK(32 * ((i) + 1), 1) \
		BLOCK(32 * ((i) + 2), 2) \
		BLOCK(32 * ((i) + 3), 3)

/*
 * BLOCK16() - expand BLOCK4 for the starting indices 0, 4, 8 and 12, i.e.
 * sixteen BLOCK expansions in total.
 */
#define BLOCK16() \
		BLOCK4(0) BLOCK4(4) \
		BLOCK4(8) BLOCK4(12)

/*
 * BLOCK(i, reg) as used by xor_avx_2: load the vector at byte offset i of
 * p1 into %ymm<reg>, XOR the vector at the same offset of p0 into it, and
 * store the register back to p0 at that offset. The trailing semicolon is
 * part of the macro because BLOCK4() strings the expansions together
 * without separators.
 */
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg \
		: : "m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg \
		: : "m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" \
		: "=m" (p0[i / sizeof(*p0)])); \
} while (0);

/*
 * xor_avx_2 - XOR the buffer at p1 into the buffer at p0.
 *
 * bytes: buffer length; only bytes >> 9 whole 512-byte chunks are handled
 * p0:    read, and overwritten with p0 ^ p1
 * p1:    only read
 *
 * Every loop pass expands BLOCK16() and then moves both pointers forward
 * by 512 bytes. The whole loop runs between kernel_fpu_begin() and
 * kernel_fpu_end().
 *
 * NOTE(review): vmovdqa is the aligned move form, so both buffers are
 * presumably expected to be 32-byte aligned - confirm against the callers.
 */
static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
{
	unsigned long chunks = bytes >> 9;

	kernel_fpu_begin();

	for (; chunks; chunks--) {
		BLOCK16()

		/* Step each pointer forward by one 512-byte chunk. */
		p0 += 512 / sizeof(*p0);
		p1 += 512 / sizeof(*p1);
	}

	kernel_fpu_end();
}

/*
 * BLOCK(i, reg) as used by xor_avx_3: load the vector at byte offset i of
 * p2 into %ymm<reg>, XOR in the vectors at the same offset of p1 and then
 * p0, and store the register back to p0 at that offset. The trailing
 * semicolon is part of the macro because BLOCK4() strings the expansions
 * together without separators.
 */
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg \
		: : "m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg \
		: : "m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg \
		: : "m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" \
		: "=m" (p0[i / sizeof(*p0)])); \
} while (0);

/*
 * xor_avx_3 - XOR the buffers at p1 and p2 into the buffer at p0.
 *
 * bytes:  buffer length; only bytes >> 9 whole 512-byte chunks are handled
 * p0:     read, and overwritten with p0 ^ p1 ^ p2
 * p1, p2: only read
 *
 * Every loop pass expands BLOCK16() and then moves all pointers forward
 * by 512 bytes. The whole loop runs between kernel_fpu_begin() and
 * kernel_fpu_end().
 *
 * NOTE(review): vmovdqa is the aligned move form, so the buffers are
 * presumably expected to be 32-byte aligned - confirm against the callers.
 */
static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2)
{
	unsigned long chunks = bytes >> 9;

	kernel_fpu_begin();

	for (; chunks; chunks--) {
		BLOCK16()

		/* Step each pointer forward by one 512-byte chunk. */
		p0 += 512 / sizeof(*p0);
		p1 += 512 / sizeof(*p1);
		p2 += 512 / sizeof(*p2);
	}

	kernel_fpu_end();
}

static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2, unsigned long *p3)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16();

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
	}

	kernel_fpu_end();
}

/*
 * BLOCK(i, reg) as used by xor_avx_5: load the vector at byte offset i of
 * p4 into %ymm<reg>, XOR in the vectors at the same offset of p3, p2, p1
 * and then p0, and store the register back to p0 at that offset. The
 * trailing semicolon is part of the macro because BLOCK4() strings the
 * expansions together without separators.
 */
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg \
		: : "m" (p4[i / sizeof(*p4)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg \
		: : "m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg \
		: : "m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg \
		: : "m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg \
		: : "m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" \
		: "=m" (p0[i / sizeof(*p0)])); \
} while (0);

/*
 * xor_avx_5 - XOR the buffers at p1, p2, p3 and p4 into the buffer at p0.
 *
 * bytes:          buffer length; only bytes >> 9 whole 512-byte chunks
 *                 are handled
 * p0:             read, and overwritten with p0 ^ p1 ^ p2 ^ p3 ^ p4
 * p1, p2, p3, p4: only read
 *
 * Every loop pass expands BLOCK16() and then moves all pointers forward
 * by 512 bytes. The whole loop runs between kernel_fpu_begin() and
 * kernel_fpu_end().
 *
 * NOTE(review): vmovdqa is the aligned move form, so the buffers are
 * presumably expected to be 32-byte aligned - confirm against the callers.
 */
static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2, unsigned long *p3, unsigned long *p4)
{
	unsigned long chunks = bytes >> 9;

	kernel_fpu_begin();

	for (; chunks; chunks--) {
		BLOCK16()

		/* Step each pointer forward by one 512-byte chunk. */
		p0 += 512 / sizeof(*p0);
		p1 += 512 / sizeof(*p1);
		p2 += 512 / sizeof(*p2);
		p3 += 512 / sizeof(*p3);
		p4 += 512 / sizeof(*p4);
	}

	kernel_fpu_end();
}

/*
 * XOR template named "avx": wires the 2-, 3-, 4- and 5-buffer slots to the
 * xor_avx_2() .. xor_avx_5() routines defined in this header.
 */
static struct xor_block_template xor_block_avx = {
	.name	= "avx",
	.do_2	= xor_avx_2,
	.do_3	= xor_avx_3,
	.do_4	= xor_avx_4,
	.do_5	= xor_avx_5,
};

/*
 * AVX_XOR_SPEED - pass the AVX template to xor_speed(), but only when both
 * cpu_has_avx and cpu_has_osxsave are true. Expands to a single statement.
 */
#define AVX_XOR_SPEED \
do { \
	if (cpu_has_avx) { \
		if (cpu_has_osxsave) \
			xor_speed(&xor_block_avx); \
	} \
} while (0)

/*
 * AVX_SELECT(FASTEST) - evaluate to &xor_block_avx when both cpu_has_avx
 * and cpu_has_osxsave are true, otherwise to FASTEST.
 *
 * FASTEST is parenthesized (as in the non-AVX variant of this macro) so
 * that an argument containing a low-precedence operator is still taken as
 * a whole for the "else" arm of the conditional.
 */
#define AVX_SELECT(FASTEST) \
	(cpu_has_avx && cpu_has_osxsave ? &xor_block_avx : (FASTEST))

#else

/*
 * CONFIG_AS_AVX is not defined: there is no AVX template to benchmark.
 * Expand to an empty do/while so that "AVX_XOR_SPEED;" is exactly one
 * statement, like the AVX variant of this macro, and stays valid as the
 * body of an if/else.
 */
#define AVX_XOR_SPEED do { } while (0)

/* CONFIG_AS_AVX is not defined: the selection is simply FASTEST, unchanged. */
#define AVX_SELECT(FASTEST) (FASTEST)

#endif
#endif
Commit	Line	Data
ea4d26ae JK	1	#ifndef _ASM_X86_XOR_AVX_H
	2	#define _ASM_X86_XOR_AVX_H
	3
	4	/*
	5	* Optimized RAID-5 checksumming functions for AVX
	6	*
	7	* Copyright (C) 2012 Intel Corporation
	8	* Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
	9	*
	10	* Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
	11	*
	12	* This program is free software; you can redistribute it and/or
	13	* modify it under the terms of the GNU General Public License
	14	* as published by the Free Software Foundation; version 2
	15	* of the License.
	16	*/
	17
	18	#ifdef CONFIG_AS_AVX
	19
	20	#include <linux/compiler.h>
df6b35f4	21	#include <asm/fpu/api.h>
ea4d26ae	22
ea4d26ae JK	23	#define BLOCK4(i) \
	24	BLOCK(32 * i, 0) \
	25	BLOCK(32 * (i + 1), 1) \
	26	BLOCK(32 * (i + 2), 2) \
	27	BLOCK(32 * (i + 3), 3)
	28
	29	#define BLOCK16() \
	30	BLOCK4(0) \
	31	BLOCK4(4) \
	32	BLOCK4(8) \
	33	BLOCK4(12)
	34
	35	static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
	36	{
841e3604	37	unsigned long lines = bytes >> 9;
ea4d26ae	38
841e3604	39	kernel_fpu_begin();
ea4d26ae JK	40
	41	while (lines--) {
	42	#undef BLOCK
	43	#define BLOCK(i, reg) \
	44	do { \
	45	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
	46	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
	47	"m" (p0[i / sizeof(*p0)])); \
	48	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
	49	"=m" (p0[i / sizeof(*p0)])); \
	50	} while (0);
	51
	52	BLOCK16()
	53
	54	p0 = (unsigned long *)((uintptr_t)p0 + 512);
	55	p1 = (unsigned long *)((uintptr_t)p1 + 512);
	56	}
	57
841e3604	58	kernel_fpu_end();
ea4d26ae JK	59	}
	60
	61	static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	62	unsigned long *p2)
	63	{
841e3604	64	unsigned long lines = bytes >> 9;
ea4d26ae	65
841e3604	66	kernel_fpu_begin();
ea4d26ae JK	67
	68	while (lines--) {
	69	#undef BLOCK
	70	#define BLOCK(i, reg) \
	71	do { \
	72	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
	73	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
	74	"m" (p1[i / sizeof(*p1)])); \
	75	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
	76	"m" (p0[i / sizeof(*p0)])); \
	77	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
	78	"=m" (p0[i / sizeof(*p0)])); \
	79	} while (0);
	80
	81	BLOCK16()
	82
	83	p0 = (unsigned long *)((uintptr_t)p0 + 512);
	84	p1 = (unsigned long *)((uintptr_t)p1 + 512);
	85	p2 = (unsigned long *)((uintptr_t)p2 + 512);
	86	}
	87
841e3604	88	kernel_fpu_end();
ea4d26ae JK	89	}
	90
	91	static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	92	unsigned long *p2, unsigned long *p3)
	93	{
841e3604	94	unsigned long lines = bytes >> 9;
ea4d26ae	95
841e3604	96	kernel_fpu_begin();
ea4d26ae JK	97
	98	while (lines--) {
	99	#undef BLOCK
	100	#define BLOCK(i, reg) \
	101	do { \
	102	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
	103	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
	104	"m" (p2[i / sizeof(*p2)])); \
	105	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
	106	"m" (p1[i / sizeof(*p1)])); \
	107	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
	108	"m" (p0[i / sizeof(*p0)])); \
	109	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
	110	"=m" (p0[i / sizeof(*p0)])); \
	111	} while (0);
	112
	113	BLOCK16();
	114
	115	p0 = (unsigned long *)((uintptr_t)p0 + 512);
	116	p1 = (unsigned long *)((uintptr_t)p1 + 512);
	117	p2 = (unsigned long *)((uintptr_t)p2 + 512);
	118	p3 = (unsigned long *)((uintptr_t)p3 + 512);
	119	}
	120
841e3604	121	kernel_fpu_end();
ea4d26ae JK	122	}
	123
	124	static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	125	unsigned long *p2, unsigned long *p3, unsigned long *p4)
	126	{
841e3604	127	unsigned long lines = bytes >> 9;
ea4d26ae	128
841e3604	129	kernel_fpu_begin();
ea4d26ae JK	130
	131	while (lines--) {
	132	#undef BLOCK
	133	#define BLOCK(i, reg) \
	134	do { \
	135	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
	136	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
	137	"m" (p3[i / sizeof(*p3)])); \
	138	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
	139	"m" (p2[i / sizeof(*p2)])); \
	140	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
	141	"m" (p1[i / sizeof(*p1)])); \
	142	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
	143	"m" (p0[i / sizeof(*p0)])); \
	144	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
	145	"=m" (p0[i / sizeof(*p0)])); \
	146	} while (0);
	147
	148	BLOCK16()
	149
	150	p0 = (unsigned long *)((uintptr_t)p0 + 512);
	151	p1 = (unsigned long *)((uintptr_t)p1 + 512);
	152	p2 = (unsigned long *)((uintptr_t)p2 + 512);
	153	p3 = (unsigned long *)((uintptr_t)p3 + 512);
	154	p4 = (unsigned long *)((uintptr_t)p4 + 512);
	155	}
	156
841e3604	157	kernel_fpu_end();
ea4d26ae JK	158	}
	159
	160	static struct xor_block_template xor_block_avx = {
	161	.name = "avx",
	162	.do_2 = xor_avx_2,
	163	.do_3 = xor_avx_3,
	164	.do_4 = xor_avx_4,
	165	.do_5 = xor_avx_5,
	166	};
	167
	168	#define AVX_XOR_SPEED \
	169	do { \
edb6f294	170	if (cpu_has_avx && cpu_has_osxsave) \
ea4d26ae JK	171	xor_speed(&xor_block_avx); \
	172	} while (0)
	173
	174	#define AVX_SELECT(FASTEST) \
edb6f294	175	(cpu_has_avx && cpu_has_osxsave ? &xor_block_avx : FASTEST)
ea4d26ae JK	176
	177	#else
	178
	179	#define AVX_XOR_SPEED {}
	180
	181	#define AVX_SELECT(FASTEST) (FASTEST)
	182
	183	#endif
	184	#endif