| 1 | /* Overlay manager for SPU. |
| 2 | |
| 3 | Copyright (C) 2006-2020 Free Software Foundation, Inc. |
| 4 | |
| 5 | This file is part of the GNU Binutils. |
| 6 | |
| 7 | This program is free software; you can redistribute it and/or modify |
| 8 | it under the terms of the GNU General Public License as published by |
| 9 | the Free Software Foundation; either version 3 of the License, or |
| 10 | (at your option) any later version. |
| 11 | |
| 12 | This program is distributed in the hope that it will be useful, |
| 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 15 | GNU General Public License for more details. |
| 16 | |
| 17 | You should have received a copy of the GNU General Public License |
| 18 | along with this program; if not, write to the Free Software |
| 19 | Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, |
| 20 | MA 02110-1301, USA. */ |
| 21 | |
| 22 | /* MFC DMA definitions. MFC_GET_CMD is the command written to $MFC_Cmd, MFC_MAX_DMA_SIZE caps the size of each transfer written to $MFC_Size, MFC_TAG_UPDATE_ALL is written to $MFC_WrTagUpdate, and MFC_TAG_ID is the tag written to $MFC_TagId and used to build the tag mask. */ |
| 23 | #define MFC_GET_CMD 0x40 |
| 24 | #define MFC_MAX_DMA_SIZE 0x4000 |
| 25 | #define MFC_TAG_UPDATE_ALL 2 |
| 26 | #define MFC_TAG_ID 0 |
| 27 | |
| 28 | /* Register usage. Every alias below names one of the physical registers $71-$79; one register carries several aliases, one for each value it holds at different points in the code. reserved1-reserved5 ($75-$79) are used without being saved. save1-save3 ($74-$72) are stored at negative offsets from $sp before being overwritten and are reloaded from there afterwards; save4 ($71) is treated the same way but is only touched when OVLY_IRQ_SAVE is defined. */ |
| 29 | #define reserved1 $75 |
| 30 | #define parm $75 |
| 31 | #define tab1 reserved1 |
| 32 | #define tab2 reserved1 |
| 33 | #define vma reserved1 |
| 34 | #define oldvma reserved1 |
| 35 | #define newmask reserved1 |
| 36 | #define map reserved1 |
| 37 | |
| 38 | #define reserved2 $76 |
| 39 | #define off1 reserved2 |
| 40 | #define off2 reserved2 |
| 41 | #define present1 reserved2 |
| 42 | #define present2 reserved2 |
| 43 | #define sz reserved2 |
| 44 | #define cmp reserved2 |
| 45 | #define add64 reserved2 |
| 46 | #define cgbits reserved2 |
| 47 | #define off3 reserved2 |
| 48 | #define off4 reserved2 |
| 49 | #define addr4 reserved2 |
| 50 | #define off5 reserved2 |
| 51 | #define tagstat reserved2 |
| 52 | |
| 53 | #define reserved3 $77 |
| 54 | #define size1 reserved3 |
| 55 | #define size2 reserved3 |
| 56 | #define rv3 reserved3 |
| 57 | #define ealo reserved3 |
| 58 | #define cmd reserved3 |
| 59 | #define off64 reserved3 |
| 60 | #define tab3 reserved3 |
| 61 | #define tab4 reserved3 |
| 62 | #define tab5 reserved3 |
| 63 | |
| 64 | #define reserved4 $78 |
| 65 | #define ovl reserved4 |
| 66 | #define rv2 reserved4 |
| 67 | #define rv5 reserved4 |
| 68 | #define cgshuf reserved4 |
| 69 | #define newovl reserved4 |
| 70 | #define irqtmp1 reserved4 |
| 71 | #define irqtmp2 reserved4 |
| 72 | |
| 73 | #define reserved5 $79 |
| 74 | #define target reserved5 |
| 75 | |
| 76 | #define save1 $74 |
| 77 | #define rv4 save1 |
| 78 | #define rv7 save1 |
| 79 | #define tagid save1 |
| 80 | #define maxsize save1 |
| 81 | #define pbyte save1 |
| 82 | #define pbit save1 |
| 83 | |
| 84 | #define save2 $73 |
| 85 | #define cur save2 |
| 86 | #define rv6 save2 |
| 87 | #define osize save2 |
| 88 | #define zovl save2 |
| 89 | #define oldovl save2 |
| 90 | #define newvma save2 |
| 91 | |
| 92 | #define save3 $72 |
| 93 | #define rv1 save3 |
| 94 | #define ea64 save3 |
| 95 | #define buf3 save3 |
| 96 | #define genwi save3 |
| 97 | #define newmap save3 |
| 98 | #define oldmask save3 |
| 99 | |
| 100 | #define save4 $71 |
| 101 | #define irq_stat save4 |
| 102 | |
| 103 | .text |
| 104 | .align 4 |
| 105 | .type __rv_pattern, @object |
| 106 | .size __rv_pattern, 16 |
| 107 | __rv_pattern: /* shufb control used by __ovly_load: takes word 0 of each of the two sources for result words 0 and 1, and zeroes result words 2 and 3. */ |
| 108 | .word 0x00010203, 0x10111213, 0x80808080, 0x80808080 |
| 109 | |
| 110 | .type __cg_pattern, @object |
| 111 | .size __cg_pattern, 16 |
| 112 | __cg_pattern: /* shufb control used by do_load: moves word 1 of the source into result word 0 and zeroes the rest, placing the low-word carry where the following addx picks it up. */ |
| 113 | .word 0x04050607, 0x80808080, 0x80808080, 0x80808080 |
| 114 | |
| 115 | .type __ovly_current, @object |
| 116 | .size __ovly_current, 16 |
| 117 | __ovly_current: /* One quadword, written with stqr by __ovly_return and __ovly_load; slot 0 holds the index of the overlay most recently requested. */ |
| 118 | .space 16 |
| 119 | |
| 120 | /* |
| 121 | * __ovly_return - stub for returning from overlay functions. |
| 122 | * |
| 123 | * On entry the four slots of $lr are: |
| 124 | * __ovly_return, prev ovl index, caller return addr, undefined. |
| 125 | * |
| 126 | * Load the previous overlay (through do_load, and only when the low bit of its _ovly_table .size word is clear) and jump to the caller return address. |
| 127 | * Updates __ovly_current. Stores save1-save3 (and save4 when OVLY_IRQ_SAVE is defined) below $sp; this routine does not modify them itself, but do_load does and reloads them before branching to the target. |
| 128 | * NOTE(review): the trailing "# p,l c" notes on the instructions look like pipeline, latency and issue cycle -- confirm before relying on them. */ |
| 129 | .align 4 |
| 130 | .global __ovly_return |
| 131 | .type __ovly_return, @function |
| 132 | __ovly_return: |
| 133 | ila tab1, _ovly_table - 16 # 0,2 0 -- table base biased by one 16-byte entry, so index 1 is the first entry |
| 134 | shlqbyi ovl, $lr, 4 # 1,4 0 -- ovl = slot 1 of $lr (prev ovl index) |
| 135 | #nop |
| 136 | shlqbyi target, $lr, 8 # 1,4 1 -- target = slot 2 of $lr (caller return addr) |
| 137 | #nop; lnop |
| 138 | #nop; lnop |
| 139 | shli off1, ovl, 4 # 0,4 4 -- off1 = ovl * 16 |
| 140 | #lnop |
| 141 | #nop |
| 142 | hbr ovly_ret9, target # 1,15 5 -- branch hint for the bi at ovly_ret9 |
| 143 | #nop; lnop |
| 144 | #nop; lnop |
| 145 | #nop |
| 146 | lqx vma, tab1, off1 # 1,6 8 -- vma = _ovly_table entry for ovl |
| 147 | #ifdef OVLY_IRQ_SAVE |
| 148 | nop |
| 149 | stqd save4, -64($sp) # 1,6 9 -- save $71, used as irq_stat in do_load |
| 150 | #else |
| 151 | #nop; lnop |
| 152 | #endif |
| 153 | #nop; lnop |
| 154 | #nop; lnop |
| 155 | #nop; lnop |
| 156 | #nop; lnop |
| 157 | #nop |
| 158 | rotqbyi size1, vma, 4 # 1,4 14 -- bring the .size word into slot 0 |
| 159 | #nop |
| 160 | stqd save3, -48($sp) # 1,6 15 -- save $72 |
| 161 | #nop |
| 162 | stqd save2, -32($sp) # 1,6 16 -- save $73 |
| 163 | #nop |
| 164 | stqd save1, -16($sp) # 1,6 17 -- save $74 |
| 165 | andi present1, size1, 1 # 0,2 18 -- present1 = low bit of .size |
| 166 | stqr ovl, __ovly_current # 1,6 18 -- record ovl as the current overlay |
| 167 | #nop; lnop |
| 168 | #nop |
| 169 | brz present1, do_load # 1,4 20 -- bit clear: overlay must be loaded first |
| 170 | ovly_ret9: |
| 171 | #nop |
| 172 | bi target # 1,4 21 -- bit set: jump straight to the caller return address |
| 173 | |
| 174 | /* |
| 175 | * __ovly_load - copy an overlay partition to local store. |
| 176 | * |
| 177 | * On entry $75 points to a word consisting of the overlay index in |
| 178 | * the top 14 bits, and the target address in the bottom 18 bits. (This describes the OVL_STUB_SIZE == 8 variant; the other variant reads ovl ($78) and target ($79) without setting them, so they must already be loaded on entry.) |
| 179 | * |
| 180 | * Sets up $lr to return via __ovly_return. If $lr is already set |
| 181 | * to return via __ovly_return, don't change it. In that case we |
| 182 | * have a tail call from one overlay function to another. |
| 183 | * Updates __ovly_current. The copy itself is done by do_load, which is entered only when the low bit of the overlay's _ovly_table .size word is clear; otherwise control goes directly to target. |
| 184 | */ |
| 185 | .align 3 |
| 186 | .global __ovly_load |
| 187 | .type __ovly_load, @function |
| 188 | __ovly_load: |
| 189 | #if OVL_STUB_SIZE == 8 |
| 190 | ######## |
| 191 | #nop |
| 192 | lqd target, 0(parm) # 1,6 -11 -- load the quadword containing the word parm points to |
| 193 | #nop; lnop |
| 194 | #nop; lnop |
| 195 | #nop; lnop |
| 196 | #nop; lnop |
| 197 | #nop; lnop |
| 198 | #nop |
| 199 | rotqby target, target, parm # 1,4 -5 -- rotate that word into slot 0 |
| 200 | ila tab2, _ovly_table - 16 # 0,2 -4 -- table base biased by one 16-byte entry |
| 201 | stqd save3, -48($sp) # 1,6 -4 -- save $72 |
| 202 | #nop |
| 203 | stqd save2, -32($sp) # 1,6 -3 -- save $73 |
| 204 | #nop |
| 205 | stqd save1, -16($sp) # 1,6 -2 -- save $74 |
| 206 | rotmi ovl, target, -18 # 0,4 -1 -- ovl = top 14 bits of the word |
| 207 | hbr ovly_load9, target # 1,15 -1 -- branch hint for the bi at ovly_load9 |
| 208 | ila rv1, __ovly_return # 0,2 0 -- rv1 = address of __ovly_return |
| 209 | #lnop |
| 210 | #nop; lnop |
| 211 | #nop |
| 212 | lqr cur, __ovly_current # 1,6 2 -- cur = overlay index recorded before this call |
| 213 | shli off2, ovl, 4 # 0,4 3 -- off2 = ovl * 16 |
| 214 | stqr ovl, __ovly_current # 1,6 3 -- record ovl as the current overlay |
| 215 | ceq rv2, $lr, rv1 # 0,2 4 -- does $lr already point at __ovly_return? |
| 216 | lqr rv3, __rv_pattern # 1,6 4 -- shuffle control for building the new $lr |
| 217 | #nop; lnop |
| 218 | #nop; lnop |
| 219 | #nop |
| 220 | lqx vma, tab2, off2 # 1,6 7 -- vma = _ovly_table entry for ovl |
| 221 | ######## |
| 222 | #else /* OVL_STUB_SIZE == 16 */ |
| 223 | ######## |
| 224 | ila tab2, _ovly_table - 16 # 0,2 0 -- table base biased by one 16-byte entry |
| 225 | stqd save3, -48($sp) # 1,6 0 -- save $72 |
| 226 | ila rv1, __ovly_return # 0,2 1 -- rv1 = address of __ovly_return |
| 227 | stqd save2, -32($sp) # 1,6 1 -- save $73 |
| 228 | shli off2, ovl, 4 # 0,4 2 -- off2 = ovl * 16 (ovl is an input here) |
| 229 | lqr cur, __ovly_current # 1,6 2 -- cur = overlay index recorded before this call |
| 230 | nop |
| 231 | stqr ovl, __ovly_current # 1,6 3 -- record ovl as the current overlay |
| 232 | ceq rv2, $lr, rv1 # 0,2 4 -- does $lr already point at __ovly_return? |
| 233 | lqr rv3, __rv_pattern # 1,6 4 -- shuffle control for building the new $lr |
| 234 | #nop |
| 235 | hbr ovly_load9, target # 1,15 5 -- branch hint for the bi at ovly_load9 |
| 236 | #nop |
| 237 | lqx vma, tab2, off2 # 1,6 6 -- vma = _ovly_table entry for ovl |
| 238 | #nop |
| 239 | stqd save1, -16($sp) # 1,6 7 -- save $74 |
| 240 | ######## |
| 241 | #endif |
| 242 | |
| 243 | #nop; lnop |
| 244 | #nop; lnop |
| 245 | #nop |
| 246 | shufb rv4, rv1, cur, rv3 # 1,4 10 -- rv4 slots = __ovly_return, cur, 0, 0 |
| 247 | #nop |
| 248 | fsmb rv5, rv2 # 1,4 11 -- rv5 = all ones if the ceq matched (tail call), else 0 |
| 249 | #nop |
| 250 | rotqmbyi rv6, $lr, -8 # 1,4 12 -- rv6 = $lr shifted right 8 bytes: old slot 0 lands in slot 2 |
| 251 | #nop |
| 252 | rotqbyi size2, vma, 4 # 1,4 13 -- bring the .size word into slot 0 |
| 253 | #nop |
| 254 | lqd save3, -48($sp) # 1,6 14 -- restore $72 |
| 255 | #nop; lnop |
| 256 | or rv7, rv4, rv6 # 0,2 16 -- rv7 slots = __ovly_return, cur, caller return addr, ... |
| 257 | lqd save2, -32($sp) # 1,6 16 -- restore $73 |
| 258 | andi present2, size2, 1 # 0,2 17 -- present2 = low bit of .size |
| 259 | #ifdef OVLY_IRQ_SAVE |
| 260 | stqd save4, -64($sp) # 1,6 17 -- save $71, used as irq_stat in do_load |
| 261 | #else |
| 262 | lnop # 1,0 17 |
| 263 | #endif |
| 264 | selb $lr, rv7, $lr, rv5 # 0,2 18 -- keep $lr on a tail call, otherwise install rv7 |
| 265 | lqd save1, -16($sp) # 1,6 18 -- restore $74 |
| 266 | #nop |
| 267 | brz present2, do_load # 1,4 19 -- bit clear: overlay must be loaded first |
| 268 | ovly_load9: |
| 269 | #nop |
| 270 | bi target # 1,4 20 -- bit set: jump straight to the target |
| 271 | |
| 272 | /* If we get here, we are about to load a new overlay. Reached by brz from __ovly_return and __ovly_load. |
| 273 | * "vma" contains the relevant entry from _ovly_table[]. |
| 274 | * extern struct { |
| 275 | * u32 vma; |
| 276 | * u32 size; |
| 277 | * u32 file_offset; |
| 278 | * u32 buf; |
| 279 | * } _ovly_table[]; |
| 280 | * Steps: issue MFC get commands in chunks of at most MFC_MAX_DMA_SIZE, mark the new overlay present and record it in _ovly_buf_table, wait for the DMA, clear the present bit of the overlay it replaced, reload the saved registers, then branch to target. */ |
| 281 | .align 3 |
| 282 | .global __ovly_load_event |
| 283 | .type __ovly_load_event, @function |
| 284 | __ovly_load_event: |
| 285 | do_load: |
| 286 | #ifdef OVLY_IRQ_SAVE |
| 287 | ila irqtmp1, do_load10 # 0,2 -5 -- address of the next instruction, for bid |
| 288 | rotqbyi sz, vma, 8 # 1,4 -5 -- sz slot 0 = .file_offset (first increment added to the EA) |
| 289 | #nop |
| 290 | rdch irq_stat, $SPU_RdMachStat # 1,6 -4 -- remember the machine status |
| 291 | #nop |
| 292 | bid irqtmp1 # 1,4 -3 -- continue at do_load10 with interrupts disabled |
| 293 | do_load10: |
| 294 | nop |
| 295 | #else |
| 296 | #nop |
| 297 | rotqbyi sz, vma, 8 # 1,4 0 -- sz slot 0 = .file_offset (first increment added to the EA) |
| 298 | #endif |
| 299 | rotqbyi osize, vma, 4 # 1,4 1 -- osize slot 0 = .size, the bytes still to transfer |
| 300 | #nop |
| 301 | lqa ea64, _EAR_ # 1,6 2 -- ea64 = quadword at _EAR_; slots 0 and 1 are used as the 64-bit EA |
| 302 | #nop |
| 303 | lqr cgshuf, __cg_pattern # 1,6 3 -- shuffle control for carry propagation |
| 304 | |
| 305 | /* We could predict the branch at the end of this loop by adding a few |
| 306 | instructions, and there are plenty of free cycles to do so without |
| 307 | impacting loop execution time. However, it doesn't make a great |
| 308 | deal of sense since we need to wait for the dma to complete anyway. */ |
| 309 | __ovly_xfer_loop: |
| 310 | #nop |
| 311 | rotqmbyi off64, sz, -4 # 1,4 4 -- off64 = sz shifted right 4 bytes: slot 0 = 0, slot 1 = increment |
| 312 | #nop; lnop |
| 313 | #nop; lnop |
| 314 | #nop; lnop |
| 315 | cg cgbits, ea64, off64 # 0,2 8 -- per-word carry out of ea64 + off64 |
| 316 | #lnop |
| 317 | #nop; lnop |
| 318 | #nop |
| 319 | shufb add64, cgbits, cgbits, cgshuf # 1,4 10 -- move the slot 1 carry into slot 0 |
| 320 | #nop; lnop |
| 321 | #nop; lnop |
| 322 | #nop; lnop |
| 323 | addx add64, ea64, off64 # 0,2 14 -- add64 = ea64 + off64 + carry |
| 324 | #lnop |
| 325 | ila maxsize, MFC_MAX_DMA_SIZE # 0,2 15 -- largest chunk per command |
| 326 | lnop |
| 327 | ori ea64, add64, 0 # 0,2 16 -- ea64 = add64, the EA of this chunk |
| 328 | rotqbyi ealo, add64, 4 # 1,4 16 -- ealo slot 0 = low word of the EA |
| 329 | cgt cmp, osize, maxsize # 0,2 17 -- more than one chunk left? |
| 330 | wrch $MFC_LSA, vma # 1,6 17 -- local store address for this chunk |
| 331 | #nop; lnop |
| 332 | selb sz, osize, maxsize, cmp # 0,2 19 -- sz = maxsize if osize is greater, else osize |
| 333 | wrch $MFC_EAH, ea64 # 1,6 19 -- high word of the EA |
| 334 | ila tagid, MFC_TAG_ID # 0,2 20 |
| 335 | wrch $MFC_EAL, ealo # 1,6 20 -- low word of the EA |
| 336 | ila cmd, MFC_GET_CMD # 0,2 21 |
| 337 | wrch $MFC_Size, sz # 1,6 21 -- size of this chunk |
| 338 | sf osize, sz, osize # 0,2 22 -- osize = osize - sz |
| 339 | wrch $MFC_TagId, tagid # 1,6 22 |
| 340 | a vma, vma, sz # 0,2 23 -- advance the local store address |
| 341 | wrch $MFC_Cmd, cmd # 1,6 23 -- writing the command is the last channel write for this chunk |
| 342 | #nop |
| 343 | brnz osize, __ovly_xfer_loop # 1,4 24 -- repeat while bytes remain; sz now holds the next EA increment |
| 344 | |
| 345 | /* Now update our data structures while waiting for DMA to complete. |
| 346 | Low bit of .size needs to be cleared on the _ovly_table entry |
| 347 | corresponding to the evicted overlay, and set on the entry for the |
| 348 | newly loaded overlay. Note that no overlay may in fact be evicted |
| 349 | as _ovly_buf_table[] starts with all zeros. Don't zap .size entry |
| 350 | for zero index! Also of course update the _ovly_buf_table entry. */ |
| 351 | #nop |
| 352 | lqr newovl, __ovly_current # 1,6 25 -- newovl = index stored by __ovly_return or __ovly_load |
| 353 | #nop; lnop |
| 354 | #nop; lnop |
| 355 | #nop; lnop |
| 356 | #nop; lnop |
| 357 | #nop; lnop |
| 358 | shli off3, newovl, 4 # 0,4 31 -- off3 = newovl * 16 |
| 359 | #lnop |
| 360 | ila tab3, _ovly_table - 16 # 0,2 32 -- table base biased by one 16-byte entry |
| 361 | #lnop |
| 362 | #nop |
| 363 | fsmbi pbyte, 0x100 # 1,4 33 -- 0xff in byte 7, the low byte of slot 1 |
| 364 | #nop; lnop |
| 365 | #nop |
| 366 | lqx vma, tab3, off3 # 1,6 35 -- reload the table entry (vma was advanced by the loop) |
| 367 | #nop; lnop |
| 368 | andi pbit, pbyte, 1 # 0,2 37 -- pbit = 1 in the .size slot, 0 elsewhere |
| 369 | lnop |
| 370 | #nop; lnop |
| 371 | #nop; lnop |
| 372 | #nop; lnop |
| 373 | or newvma, vma, pbit # 0,2 41 -- entry with the low bit of .size set |
| 374 | rotqbyi buf3, vma, 12 # 1,4 41 -- buf3 slot 0 = .buf |
| 375 | #nop; lnop |
| 376 | #nop |
| 377 | stqx newvma, tab3, off3 # 1,6 43 -- write the entry back |
| 378 | #nop; lnop |
| 379 | shli off4, buf3, 2 # 1,4 45 -- off4 = buf * 4 |
| 380 | #lnop |
| 381 | ila tab4, _ovly_buf_table - 4 # 0,2 46 -- buffer table base biased by one 4-byte entry |
| 382 | #lnop |
| 383 | #nop; lnop |
| 384 | #nop; lnop |
| 385 | #nop |
| 386 | lqx map, tab4, off4 # 1,6 49 -- quadword containing the _ovly_buf_table word for this buffer |
| 387 | #nop |
| 388 | cwx genwi, tab4, off4 # 1,4 50 -- shuffle control that inserts a word at that address |
| 389 | a addr4, tab4, off4 # 0,2 51 -- addr4 = address of the buffer table word |
| 390 | #lnop |
| 391 | #nop; lnop |
| 392 | #nop; lnop |
| 393 | #nop; lnop |
| 394 | #nop |
| 395 | rotqby oldovl, map, addr4 # 1,4 55 -- oldovl slot 0 = index previously recorded for this buffer |
| 396 | #nop |
| 397 | shufb newmap, newovl, map, genwi # 0,4 56 -- map with newovl inserted in that word |
| 398 | #if MFC_TAG_ID < 16 |
| 399 | ila newmask, 1 << MFC_TAG_ID # 0,2 57 -- tag mask with only the bit for MFC_TAG_ID |
| 400 | #else |
| 401 | ilhu newmask, 1 << (MFC_TAG_ID - 16) # 0,2 57 |
| 402 | #endif |
| 403 | #lnop |
| 404 | #nop; lnop |
| 405 | #nop; lnop |
| 406 | stqd newmap, 0(addr4) # 1,6 60 -- write the buffer table quadword back |
| 407 | |
| 408 | /* Save app's tagmask, wait for DMA complete, restore mask. */ |
| 409 | ila tagstat, MFC_TAG_UPDATE_ALL # 0,2 61 |
| 410 | rdch oldmask, $MFC_RdTagMask # 1,6 61 -- save the current tag mask |
| 411 | #nop |
| 412 | wrch $MFC_WrTagMask, newmask # 1,6 62 -- select only our tag |
| 413 | #nop |
| 414 | wrch $MFC_WrTagUpdate, tagstat # 1,6 63 -- request a status update |
| 415 | #nop |
| 416 | rdch tagstat, $MFC_RdTagStat # 1,6 64 -- the wait for DMA completion happens here |
| 417 | #nop |
| 418 | sync # 1,4 65 |
| 419 | /* Any hint prior to the sync is lost. A hint here allows the branch |
| 420 | to complete 15 cycles after the hint. With no hint the branch will |
| 421 | take 18 or 19 cycles. */ |
| 422 | ila tab5, _ovly_table - 16 # 0,2 66 -- table base biased by one 16-byte entry |
| 423 | hbr do_load99, target # 1,15 66 -- branch hint for the final bi |
| 424 | shli off5, oldovl, 4 # 0,4 67 -- off5 = oldovl * 16 |
| 425 | wrch $MFC_WrTagMask, oldmask # 1,6 67 -- restore the saved tag mask |
| 426 | ceqi zovl, oldovl, 0 # 0,2 68 -- was the buffer empty (index 0)? |
| 427 | #lnop |
| 428 | #nop; lnop |
| 429 | #nop |
| 430 | fsm zovl, zovl # 1,4 70 -- spread that result across all four slots |
| 431 | #nop |
| 432 | lqx oldvma, tab5, off5 # 1,6 71 -- table entry for oldovl |
| 433 | #nop |
| 434 | lqd save3, -48($sp) # 1,6 72 -- restore $72 |
| 435 | #nop; lnop |
| 436 | andc pbit, pbit, zovl # 0,2 74 -- clear pbit when oldovl is 0 so nothing is changed for index 0 |
| 437 | lqd save2, -32($sp) # 1,6 74 -- restore $73 |
| 438 | #ifdef OVLY_IRQ_SAVE |
| 439 | ila irqtmp2, do_load90 # 0,2 75 -- address of do_load90, for binze |
| 440 | #lnop |
| 441 | andi irq_stat, irq_stat, 1 # 0,2 76 -- keep only the low bit of the saved machine status |
| 442 | #lnop |
| 443 | #else |
| 444 | #nop; lnop |
| 445 | #nop; lnop |
| 446 | #endif |
| 447 | andc oldvma, oldvma, pbit # 0,2 77 -- clear the low bit of .size for the replaced overlay |
| 448 | lqd save1, -16($sp) # 1,6 77 -- restore $74 |
| 449 | nop # 0,0 78 |
| 450 | #lnop |
| 451 | #nop |
| 452 | stqx oldvma, tab5, off5 # 1,6 79 -- write the entry back |
| 453 | #nop |
| 454 | #ifdef OVLY_IRQ_SAVE |
| 455 | binze irq_stat, irqtmp2 # 1,4 80 -- if that bit was set, branch to do_load90 with interrupts enabled |
| 456 | do_load90: |
| 457 | #nop |
| 458 | lqd save4, -64($sp) # 1,6 84 -- restore $71 |
| 459 | #else |
| 460 | #nop; lnop |
| 461 | #endif |
| 462 | |
| 463 | .global _ovly_debug_event |
| 464 | .type _ovly_debug_event, @function |
| 465 | _ovly_debug_event: |
| 466 | nop |
| 467 | /* Branch to target address. */ |
| 468 | do_load99: |
| 469 | bi target # 1,4 81/85 -- enter the overlay code that was just loaded |
| 470 | |
| 471 | .size __ovly_load, . - __ovly_load |