arch/arm64/lib/memmove.S
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by Linaro
 * and re-licensed under GPLv2 for the Linux kernel. The original code can
 * be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

/*
 * Move a buffer from src to dst (alignment handled by the hardware).
 * If dst is below src or the buffers do not overlap, tail-call memcpy
 * (a forward copy is safe); otherwise copy backwards from the end of
 * the buffers.
 *
 * Parameters:
 *      x0 - dest
 *      x1 - src
 *      x2 - n
 * Returns:
 *      x0 - dest
 */
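
/*
 * For orientation: the strategy implemented below is roughly equivalent to
 * the C sketch that follows. This is illustrative only ("memmove_sketch" is
 * a hypothetical name); the real routine copies in aligned 8/16/64-byte
 * chunks rather than byte by byte.
 *
 *      #include <string.h>
 *
 *      void *memmove_sketch(void *dest, const void *src, size_t n)
 *      {
 *              char *d = dest;
 *              const char *s = src;
 *
 *              if (d < s || d >= s + n)        // forward copy is safe
 *                      return memcpy(dest, src, n);
 *              while (n--)                     // otherwise copy backwards
 *                      d[n] = s[n];
 *              return dest;
 *      }
 */
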
dstin   .req    x0
src     .req    x1
count   .req    x2
tmp1    .req    x3
tmp1w   .req    w3
tmp2    .req    x4
tmp2w   .req    w4
tmp3    .req    x5
tmp3w   .req    w5
dst     .req    x6

A_l     .req    x7
A_h     .req    x8
B_l     .req    x9
B_h     .req    x10
C_l     .req    x11
C_h     .req    x12
D_l     .req    x13
D_h     .req    x14

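/* The A_*..D_* register pairs above carry the 64 bytes of in-flight data in the bulk copy loop. */
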
ENTRY(memmove)
        cmp     dstin, src
        b.lo    memcpy
        add     tmp1, src, count
        cmp     dstin, tmp1
        b.hs    memcpy          /* No overlap. */

        add     dst, dstin, count
        add     src, src, count
        cmp     count, #16
        b.lo    .Ltail15        /* Probably unaligned accesses. */

        ands    tmp2, src, #15  /* Bytes to reach alignment. */
        b.eq    .LSrcAligned
        sub     count, count, tmp2
        /*
         * Copy the trailing tmp2 bytes first so that src becomes 16-byte
         * aligned. The cost of these extra instructions is acceptable, and
         * it means the subsequent accesses use aligned addresses.
         */
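        /*
         * Each tbz below tests one bit of tmp2 and copies 1, 2, 4 or 8
         * bytes accordingly, totalling tmp2 bytes.
         */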
        tbz     tmp2, #0, 1f
        ldrb    tmp1w, [src, #-1]!
        strb    tmp1w, [dst, #-1]!
1:
        tbz     tmp2, #1, 2f
        ldrh    tmp1w, [src, #-2]!
        strh    tmp1w, [dst, #-2]!
2:
        tbz     tmp2, #2, 3f
        ldr     tmp1w, [src, #-4]!
        str     tmp1w, [dst, #-4]!
3:
        tbz     tmp2, #3, .LSrcAligned
        ldr     tmp1, [src, #-8]!
        str     tmp1, [dst, #-8]!

.LSrcAligned:
        cmp     count, #64
        b.ge    .Lcpy_over64

        /*
         * Deal with small copies quickly by dropping straight into the
         * exit block.
         */
.Ltail63:
        /*
         * Copy up to 48 bytes of data. At this point we only need the
         * bottom 6 bits of count to be accurate.
         */
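        /*
         * count[5:4] selects 48, 32 or 16 bytes; the ldp/stp pairs below
         * fall through accordingly before .Ltail15 handles the rest.
         */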
        ands    tmp1, count, #0x30
        b.eq    .Ltail15
        cmp     tmp1w, #0x20
        b.eq    1f
        b.lt    2f
        ldp     A_l, A_h, [src, #-16]!
        stp     A_l, A_h, [dst, #-16]!
1:
        ldp     A_l, A_h, [src, #-16]!
        stp     A_l, A_h, [dst, #-16]!
2:
        ldp     A_l, A_h, [src, #-16]!
        stp     A_l, A_h, [dst, #-16]!

.Ltail15:
        tbz     count, #3, 1f
        ldr     tmp1, [src, #-8]!
        str     tmp1, [dst, #-8]!
1:
        tbz     count, #2, 2f
        ldr     tmp1w, [src, #-4]!
        str     tmp1w, [dst, #-4]!
2:
        tbz     count, #1, 3f
        ldrh    tmp1w, [src, #-2]!
        strh    tmp1w, [dst, #-2]!
3:
        tbz     count, #0, .Lexitfunc
        ldrb    tmp1w, [src, #-1]
        strb    tmp1w, [dst, #-1]

.Lexitfunc:
        ret

.Lcpy_over64:
        subs    count, count, #128
        b.ge    .Lcpy_body_large
        /*
         * Less than 128 bytes to copy, so handle 64 bytes here and then jump
         * to the tail.
         */
        ldp     A_l, A_h, [src, #-16]
        stp     A_l, A_h, [dst, #-16]
        ldp     B_l, B_h, [src, #-32]
        ldp     C_l, C_h, [src, #-48]
        stp     B_l, B_h, [dst, #-32]
        stp     C_l, C_h, [dst, #-48]
        ldp     D_l, D_h, [src, #-64]!
        stp     D_l, D_h, [dst, #-64]!
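        /*
         * The pre-index writeback above leaves src and dst 64 bytes
         * lower, ready for the tail copy.
         */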

        tst     count, #0x3f
        b.ne    .Ltail63
        ret

        /*
         * Critical loop. Start at a new cache line boundary. Assuming
         * 64 bytes per line this ensures the entire loop is in one line.
         */
        .p2align        L1_CACHE_SHIFT
.Lcpy_body_large:
        /* Pre-load 64 bytes of data. */
        ldp     A_l, A_h, [src, #-16]
        ldp     B_l, B_h, [src, #-32]
        ldp     C_l, C_h, [src, #-48]
        ldp     D_l, D_h, [src, #-64]!
1:
        /*
         * Interleave the load of the next 64-byte block with the store of
         * the previously loaded 64 bytes of data.
         */
        stp     A_l, A_h, [dst, #-16]
        ldp     A_l, A_h, [src, #-16]
        stp     B_l, B_h, [dst, #-32]
        ldp     B_l, B_h, [src, #-32]
        stp     C_l, C_h, [dst, #-48]
        ldp     C_l, C_h, [src, #-48]
        stp     D_l, D_h, [dst, #-64]!
        ldp     D_l, D_h, [src, #-64]!
        subs    count, count, #64
        b.ge    1b
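        /*
         * The loop exits with the last 64 bytes still held in A..D;
         * store them now.
         */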
        stp     A_l, A_h, [dst, #-16]
        stp     B_l, B_h, [dst, #-32]
        stp     C_l, C_h, [dst, #-48]
        stp     D_l, D_h, [dst, #-64]!

        tst     count, #0x3f
        b.ne    .Ltail63
        ret
ENDPROC(memmove)