/* Copyright 2002 Andi Kleen, SuSE Labs */

#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>
1da177e4 | 8 | /* |
2f19e06a FY |
9 | * ISO C memset - set a memory block to a byte value. This function uses fast |
10 | * string to get better performance than the original function. The code is | |
11 | * simpler and shorter than the orignal function as well. | |
1da177e4 LT |
12 | * |
13 | * rdi destination | |
14 | * rsi value (char) | |
15 | * rdx count (bytes) | |
16 | * | |
17 | * rax original destination | |
18 | */ | |
7269e881 JB |
19 | .section .altinstr_replacement, "ax", @progbits |
20 | .Lmemset_c: | |
8d379dad | 21 | movq %rdi,%r9 |
5d7244e7 JB |
22 | movq %rdx,%rcx |
23 | andl $7,%edx | |
24 | shrq $3,%rcx | |
8d379dad JB |
25 | /* expand byte value */ |
26 | movzbl %sil,%esi | |
27 | movabs $0x0101010101010101,%rax | |
5d7244e7 | 28 | imulq %rsi,%rax |
8d379dad | 29 | rep stosq |
5d7244e7 | 30 | movl %edx,%ecx |
8d379dad JB |
31 | rep stosb |
32 | movq %r9,%rax | |
33 | ret | |
7269e881 JB |
34 | .Lmemset_e: |
35 | .previous | |
8d379dad | 36 | |
2f19e06a FY |
37 | /* |
38 | * ISO C memset - set a memory block to a byte value. This function uses | |
39 | * enhanced rep stosb to override the fast string function. | |
40 | * The code is simpler and shorter than the fast string function as well. | |
41 | * | |
42 | * rdi destination | |
43 | * rsi value (char) | |
44 | * rdx count (bytes) | |
45 | * | |
46 | * rax original destination | |
47 | */ | |
48 | .section .altinstr_replacement, "ax", @progbits | |
49 | .Lmemset_c_e: | |
50 | movq %rdi,%r9 | |
51 | movb %sil,%al | |
5d7244e7 | 52 | movq %rdx,%rcx |
2f19e06a FY |
53 | rep stosb |
54 | movq %r9,%rax | |
55 | ret | |
56 | .Lmemset_e_e: | |
57 | .previous | |
58 | ||
8d379dad JB |
59 | ENTRY(memset) |
60 | ENTRY(__memset) | |
61 | CFI_STARTPROC | |
7bcd3f34 | 62 | movq %rdi,%r10 |
7bcd3f34 AK |
63 | |
64 | /* expand byte value */ | |
65 | movzbl %sil,%ecx | |
66 | movabs $0x0101010101010101,%rax | |
5d7244e7 | 67 | imulq %rcx,%rax |
7bcd3f34 AK |
68 | |
69 | /* align dst */ | |
70 | movl %edi,%r9d | |
71 | andl $7,%r9d | |
72 | jnz .Lbad_alignment | |
8d379dad | 73 | CFI_REMEMBER_STATE |
7bcd3f34 AK |
74 | .Lafter_bad_alignment: |
75 | ||
5d7244e7 JB |
76 | movq %rdx,%rcx |
77 | shrq $6,%rcx | |
7bcd3f34 AK |
78 | jz .Lhandle_tail |
79 | ||
80 | .p2align 4 | |
81 | .Lloop_64: | |
5d7244e7 | 82 | decq %rcx |
7bcd3f34 AK |
83 | movq %rax,(%rdi) |
84 | movq %rax,8(%rdi) | |
85 | movq %rax,16(%rdi) | |
86 | movq %rax,24(%rdi) | |
87 | movq %rax,32(%rdi) | |
88 | movq %rax,40(%rdi) | |
89 | movq %rax,48(%rdi) | |
90 | movq %rax,56(%rdi) | |
91 | leaq 64(%rdi),%rdi | |
92 | jnz .Lloop_64 | |
93 | ||
94 | /* Handle tail in loops. The loops should be faster than hard | |
95 | to predict jump tables. */ | |
96 | .p2align 4 | |
97 | .Lhandle_tail: | |
5d7244e7 | 98 | movl %edx,%ecx |
7bcd3f34 AK |
99 | andl $63&(~7),%ecx |
100 | jz .Lhandle_7 | |
101 | shrl $3,%ecx | |
102 | .p2align 4 | |
103 | .Lloop_8: | |
104 | decl %ecx | |
105 | movq %rax,(%rdi) | |
106 | leaq 8(%rdi),%rdi | |
107 | jnz .Lloop_8 | |
108 | ||
109 | .Lhandle_7: | |
5d7244e7 | 110 | andl $7,%edx |
7bcd3f34 AK |
111 | jz .Lende |
112 | .p2align 4 | |
113 | .Lloop_1: | |
5d7244e7 | 114 | decl %edx |
7bcd3f34 AK |
115 | movb %al,(%rdi) |
116 | leaq 1(%rdi),%rdi | |
117 | jnz .Lloop_1 | |
118 | ||
119 | .Lende: | |
120 | movq %r10,%rax | |
121 | ret | |
122 | ||
8d379dad | 123 | CFI_RESTORE_STATE |
7bcd3f34 | 124 | .Lbad_alignment: |
5d7244e7 | 125 | cmpq $7,%rdx |
7bcd3f34 AK |
126 | jbe .Lhandle_7 |
127 | movq %rax,(%rdi) /* unaligned store */ | |
128 | movq $8,%r8 | |
129 | subq %r9,%r8 | |
130 | addq %r8,%rdi | |
5d7244e7 | 131 | subq %r8,%rdx |
7bcd3f34 | 132 | jmp .Lafter_bad_alignment |
8d379dad JB |
133 | .Lfinal: |
134 | CFI_ENDPROC | |
135 | ENDPROC(memset) | |
136 | ENDPROC(__memset) | |
7bcd3f34 | 137 | |
2f19e06a FY |
138 | /* Some CPUs support enhanced REP MOVSB/STOSB feature. |
139 | * It is recommended to use this when possible. | |
140 | * | |
141 | * If enhanced REP MOVSB/STOSB feature is not available, use fast string | |
142 | * instructions. | |
143 | * | |
144 | * Otherwise, use original memset function. | |
145 | * | |
146 | * In .altinstructions section, ERMS feature is placed after REG_GOOD | |
147 | * feature to implement the right patch order. | |
148 | */ | |
7bcd3f34 | 149 | .section .altinstructions,"a" |
2f19e06a FY |
150 | altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\ |
151 | .Lfinal-memset,.Lmemset_e-.Lmemset_c | |
152 | altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \ | |
153 | .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e | |
7bcd3f34 | 154 | .previous |