1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32#include <linux/linkage.h>
/*
 * memcpy / fast_memcpy_ascending: copy bytes from low to high addresses.
 *
 * Register use, as established by the code below:
 *   r5  = d, destination pointer (all stores go through it)
 *   r6  = s, source pointer (all loads go through it or through r8)
 *   r7  = c, number of bytes still to copy
 *   r3  = copy of the original d, set once at entry and never changed
 *         (presumably the return value per the MicroBlaze ABI - confirm)
 *   r4  = n, scratch counter for the current phase
 *   r8  = as, source pointer rounded down to a word boundary
 *   r9  = t1, r10 = offset / temp, r11 = h, r12 = v (scratch)
 * Written registers: r3-r12.
 *
 * Phases: (1) byte copies until d is word aligned, (2) 32-byte blocks,
 * (3) single words, (4) trailing bytes. When s is not word aligned the
 * word phases load aligned words and merge neighbours with shifts.
 * NOTE(review): the shift directions appear to assume big-endian byte
 * order - confirm against the target configuration.
 *
 * The label fast_memcpy_ascending is also a branch target of memmove.
 */
33 .text
34 .globl memcpy
35 .type memcpy, @function
36 .ent memcpy
37
38memcpy:
39fast_memcpy_ascending:
40
41 addi r3, r5, 0 /* r3 = d (kept untouched for the rest of the routine) */
42
43 addi r4, r0, 4 /* n = 4 */
44 cmpu r4, r4, r7 /* n = c - 4, sign bit reflects the unsigned compare */
45 blti r4, a_xfer_end /* c < 4: not even one word, copy bytes only */
46
47
48 andi r4, r5, 3 /* n = d & 3 */
49
50 beqi r4, a_dalign_done /* destination already word aligned */
51
52 rsubi r4, r4, 4 /* n = 4 - n: bytes needed to align d */
53 rsub r7, r4, r7 /* c = c - n */
54
55a_xfer_first_loop: /* copy n leading bytes so that d becomes aligned */
56
57 beqi r4, a_dalign_done /* n == 0: alignment prefix finished */
58 lbui r11, r6, 0 /* h = *s */
59 sbi r11, r5, 0 /* *d = h */
60 addi r6, r6, 1 /* s++ */
61 addi r5, r5, 1 /* d++ */
62 brid a_xfer_first_loop /* loop (delayed branch) */
63 addi r4, r4, -1 /* n-- (executes in the delay slot) */
64
65a_dalign_done:
66 addi r4, r0, 32 /* n = 32 */
67 cmpu r4, r4, r7 /* n = c - 32, sign bit reflects the unsigned compare */
68
69 blti r4, a_block_done /* c < 32: no full block to copy */
70
71a_block_xfer:
72 andi r4, r7, 0xffffffe0 /* n = c & ~31: bytes covered by whole blocks */
73 rsub r7, r4, r7 /* c = c - n */
74
75 andi r9, r6, 3 /* t1 = s & 3 */
76
77 bnei r9, a_block_unaligned /* source not word aligned: merge path */
78
79a_block_aligned: /* s and d both aligned: 8 plain word copies per pass */
80 lwi r9, r6, 0 /* load words 0..3 of the block */
81 lwi r10, r6, 4
82 lwi r11, r6, 8
83 lwi r12, r6, 12
84 swi r9, r5, 0 /* store words 0..3 */
85 swi r10, r5, 4
86 swi r11, r5, 8
87 swi r12, r5, 12
88 lwi r9, r6, 16 /* load words 4..7 of the block */
89 lwi r10, r6, 20
90 lwi r11, r6, 24
91 lwi r12, r6, 28
92 swi r9, r5, 16 /* store words 4..7 */
93 swi r10, r5, 20
94 swi r11, r5, 24
95 swi r12, r5, 28
96 addi r6, r6, 32 /* s += 32 */
97 addi r4, r4, -32 /* n -= 32 */
98 bneid r4, a_block_aligned /* loop while n != 0 (delayed branch) */
99 addi r5, r5, 32 /* d += 32 (executes in the delay slot) */
100 bri a_block_done
101
102a_block_unaligned:
103 andi r8, r6, 0xfffffffc /* as = s & ~3 */
104 add r6, r6, r4 /* s = s + n now; the loops below walk as instead */
105 lwi r11, r8, 0 /* h = *(as + 0) */
106
107 addi r9, r9, -1
108 beqi r9, a_block_u1 /* t1 was 1: source offset of 1 byte */
109 addi r9, r9, -1
110 beqi r9, a_block_u2 /* t1 was 2: source offset of 2 bytes */
111
112a_block_u3: /* otherwise t1 was 3: source offset of 3 bytes */
113 bslli r11, r11, 24 /* h = h << 24 */
114a_bu3_loop: /* each step: *(d + k) = h | (v >> 8); h = v << 24 */
115 lwi r12, r8, 4 /* v = *(as + 4) */
116 bsrli r9, r12, 8 /* t1 = v >> 8 */
117 or r9, r11, r9 /* t1 = h | t1 */
118 swi r9, r5, 0 /* *(d + 0) = t1 */
119 bslli r11, r12, 24 /* h = v << 24, carried into the next word */
120 lwi r12, r8, 8
121 bsrli r9, r12, 8
122 or r9, r11, r9
123 swi r9, r5, 4 /* *(d + 4) */
124 bslli r11, r12, 24
125 lwi r12, r8, 12
126 bsrli r9, r12, 8
127 or r9, r11, r9
128 swi r9, r5, 8 /* *(d + 8) */
129 bslli r11, r12, 24
130 lwi r12, r8, 16
131 bsrli r9, r12, 8
132 or r9, r11, r9
133 swi r9, r5, 12 /* *(d + 12) */
134 bslli r11, r12, 24
135 lwi r12, r8, 20
136 bsrli r9, r12, 8
137 or r9, r11, r9
138 swi r9, r5, 16 /* *(d + 16) */
139 bslli r11, r12, 24
140 lwi r12, r8, 24
141 bsrli r9, r12, 8
142 or r9, r11, r9
143 swi r9, r5, 20 /* *(d + 20) */
144 bslli r11, r12, 24
145 lwi r12, r8, 28
146 bsrli r9, r12, 8
147 or r9, r11, r9
148 swi r9, r5, 24 /* *(d + 24) */
149 bslli r11, r12, 24
150 lwi r12, r8, 32
151 bsrli r9, r12, 8
152 or r9, r11, r9
153 swi r9, r5, 28 /* *(d + 28) */
154 bslli r11, r12, 24
155 addi r8, r8, 32 /* as += 32 */
156 addi r4, r4, -32 /* n -= 32 */
157 bneid r4, a_bu3_loop /* loop while n != 0 (delayed branch) */
158 addi r5, r5, 32 /* d += 32 (executes in the delay slot) */
159 bri a_block_done
160
161a_block_u1: /* source offset of 1 byte */
162 bslli r11, r11, 8 /* h = h << 8 */
163a_bu1_loop: /* each step: *(d + k) = h | (v >> 24); h = v << 8 */
164 lwi r12, r8, 4 /* v = *(as + 4) */
165 bsrli r9, r12, 24 /* t1 = v >> 24 */
166 or r9, r11, r9 /* t1 = h | t1 */
167 swi r9, r5, 0 /* *(d + 0) = t1 */
168 bslli r11, r12, 8 /* h = v << 8, carried into the next word */
169 lwi r12, r8, 8
170 bsrli r9, r12, 24
171 or r9, r11, r9
172 swi r9, r5, 4 /* *(d + 4) */
173 bslli r11, r12, 8
174 lwi r12, r8, 12
175 bsrli r9, r12, 24
176 or r9, r11, r9
177 swi r9, r5, 8 /* *(d + 8) */
178 bslli r11, r12, 8
179 lwi r12, r8, 16
180 bsrli r9, r12, 24
181 or r9, r11, r9
182 swi r9, r5, 12 /* *(d + 12) */
183 bslli r11, r12, 8
184 lwi r12, r8, 20
185 bsrli r9, r12, 24
186 or r9, r11, r9
187 swi r9, r5, 16 /* *(d + 16) */
188 bslli r11, r12, 8
189 lwi r12, r8, 24
190 bsrli r9, r12, 24
191 or r9, r11, r9
192 swi r9, r5, 20 /* *(d + 20) */
193 bslli r11, r12, 8
194 lwi r12, r8, 28
195 bsrli r9, r12, 24
196 or r9, r11, r9
197 swi r9, r5, 24 /* *(d + 24) */
198 bslli r11, r12, 8
199 lwi r12, r8, 32
200 bsrli r9, r12, 24
201 or r9, r11, r9
202 swi r9, r5, 28 /* *(d + 28) */
203 bslli r11, r12, 8
204 addi r8, r8, 32 /* as += 32 */
205 addi r4, r4, -32 /* n -= 32 */
206 bneid r4, a_bu1_loop /* loop while n != 0 (delayed branch) */
207 addi r5, r5, 32 /* d += 32 (executes in the delay slot) */
208 bri a_block_done
209
210a_block_u2: /* source offset of 2 bytes */
211 bslli r11, r11, 16 /* h = h << 16 */
212a_bu2_loop: /* each step: *(d + k) = h | (v >> 16); h = v << 16 */
213 lwi r12, r8, 4 /* v = *(as + 4) */
214 bsrli r9, r12, 16 /* t1 = v >> 16 */
215 or r9, r11, r9 /* t1 = h | t1 */
216 swi r9, r5, 0 /* *(d + 0) = t1 */
217 bslli r11, r12, 16 /* h = v << 16, carried into the next word */
218 lwi r12, r8, 8
219 bsrli r9, r12, 16
220 or r9, r11, r9
221 swi r9, r5, 4 /* *(d + 4) */
222 bslli r11, r12, 16
223 lwi r12, r8, 12
224 bsrli r9, r12, 16
225 or r9, r11, r9
226 swi r9, r5, 8 /* *(d + 8) */
227 bslli r11, r12, 16
228 lwi r12, r8, 16
229 bsrli r9, r12, 16
230 or r9, r11, r9
231 swi r9, r5, 12 /* *(d + 12) */
232 bslli r11, r12, 16
233 lwi r12, r8, 20
234 bsrli r9, r12, 16
235 or r9, r11, r9
236 swi r9, r5, 16 /* *(d + 16) */
237 bslli r11, r12, 16
238 lwi r12, r8, 24
239 bsrli r9, r12, 16
240 or r9, r11, r9
241 swi r9, r5, 20 /* *(d + 20) */
242 bslli r11, r12, 16
243 lwi r12, r8, 28
244 bsrli r9, r12, 16
245 or r9, r11, r9
246 swi r9, r5, 24 /* *(d + 24) */
247 bslli r11, r12, 16
248 lwi r12, r8, 32
249 bsrli r9, r12, 16
250 or r9, r11, r9
251 swi r9, r5, 28 /* *(d + 28) */
252 bslli r11, r12, 16
253 addi r8, r8, 32 /* as += 32 */
254 addi r4, r4, -32 /* n -= 32 */
255 bneid r4, a_bu2_loop /* loop while n != 0 (delayed branch) */
256 addi r5, r5, 32 /* d += 32 (delay slot); then falls through to a_block_done */
257
258a_block_done:
259 addi r4, r0, 4 /* n = 4 */
260 cmpu r4, r4, r7 /* n = c - 4, sign bit reflects the unsigned compare */
261 blti r4, a_xfer_end /* c < 4: no whole word left, copy bytes only */
262
263a_word_xfer:
264 andi r4, r7, 0xfffffffc /* n = c & ~3: bytes covered by whole words */
265 addi r10, r0, 0 /* offset = 0 */
266
267 andi r9, r6, 3 /* t1 = s & 3 */
268
269 bnei r9, a_word_unaligned /* source not word aligned: merge path */
270
271a_word_aligned: /* one aligned word per pass, indexed by offset */
272 lw r9, r6, r10 /* t1 = *(s + offset) */
273 sw r9, r5, r10 /* *(d + offset) = t1 */
274 addi r4, r4,-4 /* n -= 4 */
275 bneid r4, a_word_aligned /* loop while n != 0 (delayed branch) */
276 addi r10, r10, 4 /* offset += 4 (executes in the delay slot) */
277
278 bri a_word_done
279
280a_word_unaligned:
281 andi r8, r6, 0xfffffffc /* as = s & ~3 */
282 lwi r11, r8, 0 /* h = *(as + 0) */
283 addi r8, r8, 4 /* as += 4, so as + offset addresses the next word */
284
285 addi r9, r9, -1
286 beqi r9, a_word_u1 /* t1 was 1: source offset of 1 byte */
287 addi r9, r9, -1
288 beqi r9, a_word_u2 /* t1 was 2: source offset of 2 bytes */
289
290a_word_u3: /* otherwise t1 was 3: source offset of 3 bytes */
291 bslli r11, r11, 24 /* h = h << 24 */
292a_wu3_loop:
293 lw r12, r8, r10 /* v = *(as + offset) */
294 bsrli r9, r12, 8 /* t1 = v >> 8 */
295 or r9, r11, r9 /* t1 = h | t1 */
296 sw r9, r5, r10 /* *(d + offset) = t1 */
297 bslli r11, r12, 24 /* h = v << 24 */
298 addi r4, r4,-4 /* n -= 4 */
299 bneid r4, a_wu3_loop /* loop while n != 0 (delayed branch) */
300 addi r10, r10, 4 /* offset += 4 (executes in the delay slot) */
301
302 bri a_word_done
303
304a_word_u1: /* source offset of 1 byte */
305 bslli r11, r11, 8 /* h = h << 8 */
306a_wu1_loop:
307 lw r12, r8, r10 /* v = *(as + offset) */
308 bsrli r9, r12, 24 /* t1 = v >> 24 */
309 or r9, r11, r9 /* t1 = h | t1 */
310 sw r9, r5, r10 /* *(d + offset) = t1 */
311 bslli r11, r12, 8 /* h = v << 8 */
312 addi r4, r4,-4 /* n -= 4 */
313 bneid r4, a_wu1_loop /* loop while n != 0 (delayed branch) */
314 addi r10, r10, 4 /* offset += 4 (executes in the delay slot) */
315
316 bri a_word_done
317
318a_word_u2: /* source offset of 2 bytes */
319 bslli r11, r11, 16 /* h = h << 16 */
320a_wu2_loop:
321 lw r12, r8, r10 /* v = *(as + offset) */
322 bsrli r9, r12, 16 /* t1 = v >> 16 */
323 or r9, r11, r9 /* t1 = h | t1 */
324 sw r9, r5, r10 /* *(d + offset) = t1 */
325 bslli r11, r12, 16 /* h = v << 16 */
326 addi r4, r4,-4 /* n -= 4 */
327 bneid r4, a_wu2_loop /* loop while n != 0 (delayed branch) */
328 addi r10, r10, 4 /* offset += 4 (delay slot); then falls through */
329
330a_word_done: /* fold the word-phase offset back into the pointers */
331 add r5, r5, r10 /* d = d + offset */
332 add r6, r6, r10 /* s = s + offset */
333 rsub r7, r10, r7 /* c = c - offset */
334
335a_xfer_end:
336a_xfer_end_loop: /* copy the remaining c bytes one at a time */
337 beqi r7, a_done /* c == 0: finished */
338 lbui r9, r6, 0 /* t1 = *s */
339 addi r6, r6, 1 /* s++ */
340 sbi r9, r5, 0 /* *d = t1 */
341 addi r7, r7, -1 /* c-- */
342 brid a_xfer_end_loop /* loop (delayed branch) */
343 addi r5, r5, 1 /* d++ (executes in the delay slot) */
344
345a_done: /* common exit; memmove's byte-tail loop also branches here */
346 rtsd r15, 8 /* return to r15 + 8 (delayed) */
347 nop /* delay slot of rtsd */
348
349.size memcpy, . - memcpy
350.end memcpy
351
/*
 * memmove / fast_memcpy_descending: copy r7 bytes from r6 to r5, picking
 * the copy direction from the relative position of the two pointers.
 *
 * If d <= s (unsigned) control branches to fast_memcpy_ascending, a label
 * defined in memcpy. Otherwise both pointers are moved to the end of their
 * buffers and the copy runs from high to low addresses.
 *
 * Register use, as established by the code below:
 *   r5  = d, destination pointer (end-relative in the descending path)
 *   r6  = s, source pointer (end-relative in the descending path)
 *   r7  = c, number of bytes still to copy
 *   r3  = copy of the original d, set before d is advanced
 *         (presumably the return value per the MicroBlaze ABI - confirm)
 *   r4  = n, scratch counter for the current phase
 *   r8  = as, source pointer rounded down to a word boundary
 *   r9  = t1, r10 = temp, r11 = h, r12 = v (scratch)
 * Written registers: r3-r12.
 *
 * Phases mirror memcpy in reverse: trailing bytes until the end of d is
 * word aligned, 32-byte blocks, single words, then remaining bytes.
 * NOTE(review): the shift directions appear to assume big-endian byte
 * order - confirm against the target configuration.
 */
352 .globl memmove
353 .type memmove, @function
354 .ent memmove
355
356memmove:
357 cmpu r4, r5, r6 /* n = s - d, sign bit reflects the unsigned compare */
358 bgei r4,fast_memcpy_ascending /* d <= s: ascending copy (label in memcpy) */
359
360fast_memcpy_descending:
361
362 addi r3, r5, 0 /* r3 = original d, saved before d is advanced */
363
364 add r5, r5, r7 /* d = d + c: one past the last destination byte */
365 add r6, r6, r7 /* s = s + c: one past the last source byte */
366
367 addi r4, r0, 4 /* n = 4 */
368 cmpu r4, r4, r7 /* n = c - 4, sign bit reflects the unsigned compare */
369 blti r4,d_xfer_end /* c < 4: not even one word, copy bytes only */
370
371
372 andi r4, r5, 3 /* n = d & 3: bytes above the last word boundary */
373
374 beqi r4,d_dalign_done /* end of destination already word aligned */
375 rsub r7, r4, r7 /* c = c - n */
376
377d_xfer_first_loop: /* copy n bytes backwards so that d becomes aligned */
378
379 beqi r4,d_dalign_done /* n == 0: alignment bytes finished */
380 addi r6, r6, -1 /* s-- */
381 addi r5, r5, -1 /* d-- */
382 lbui r11, r6, 0 /* h = *s */
383 sbi r11, r5, 0 /* *d = h */
384 brid d_xfer_first_loop /* loop (delayed branch) */
385 addi r4, r4, -1 /* n-- (executes in the delay slot) */
386
387d_dalign_done:
388 addi r4, r0, 32 /* n = 32 */
389 cmpu r4, r4, r7 /* n = c - 32, sign bit reflects the unsigned compare */
390
391 blti r4, d_block_done /* c < 32: no full block to copy */
392
393d_block_xfer:
394 andi r4, r7, 0xffffffe0 /* n = c & ~31: bytes covered by whole blocks */
395 rsub r7, r4, r7 /* c = c - n */
396
397 andi r9, r6, 3 /* t1 = s & 3 */
398
399 bnei r9, d_block_unaligned /* source not word aligned: merge path */
400
401d_block_aligned: /* s and d both aligned: 8 plain word copies per pass */
402 addi r6, r6, -32 /* s -= 32 */
403 addi r5, r5, -32 /* d -= 32 */
404 lwi r9, r6, 28 /* load words 7..4 of the block, highest first */
405 lwi r10, r6, 24
406 lwi r11, r6, 20
407 lwi r12, r6, 16
408 swi r9, r5, 28 /* store words 7..4 */
409 swi r10, r5, 24
410 swi r11, r5, 20
411 swi r12, r5, 16
412 lwi r9, r6, 12 /* load words 3..0 of the block */
413 lwi r10, r6, 8
414 lwi r11, r6, 4
415 lwi r12, r6, 0
416 swi r9, r5, 12 /* store words 3..1 */
417 swi r10, r5, 8
418 swi r11, r5, 4
419 addi r4, r4, -32 /* n -= 32 */
420 bneid r4, d_block_aligned /* loop while n != 0 (delayed branch) */
421 swi r12, r5, 0 /* store word 0 (executes in the delay slot) */
422 bri d_block_done
423
424d_block_unaligned:
425 andi r8, r6, 0xfffffffc /* as = s & ~3 */
426 rsub r6, r4, r6 /* s = s - n now; the loops below walk as instead */
427 lwi r11, r8, 0 /* h = *(as + 0) */
428
429 addi r9, r9, -1
430 beqi r9,d_block_u1 /* t1 was 1: source offset of 1 byte */
431 addi r9, r9, -1
432 beqi r9,d_block_u2 /* t1 was 2: source offset of 2 bytes */
433
434d_block_u3: /* otherwise t1 was 3: source offset of 3 bytes */
435 bsrli r11, r11, 8 /* h = h >> 8 */
436d_bu3_loop: /* each step: *(d + k) = h | (v << 24); h = v >> 8 */
437 addi r8, r8, -32 /* as -= 32 */
438 addi r5, r5, -32 /* d -= 32 */
439 lwi r12, r8, 28 /* v = *(as + 28) */
440 bslli r9, r12, 24 /* t1 = v << 24 */
441 or r9, r11, r9 /* t1 = h | t1 */
442 swi r9, r5, 28 /* *(d + 28) = t1 */
443 bsrli r11, r12, 8 /* h = v >> 8, carried into the next lower word */
444 lwi r12, r8, 24
445 bslli r9, r12, 24
446 or r9, r11, r9
447 swi r9, r5, 24 /* *(d + 24) */
448 bsrli r11, r12, 8
449 lwi r12, r8, 20
450 bslli r9, r12, 24
451 or r9, r11, r9
452 swi r9, r5, 20 /* *(d + 20) */
453 bsrli r11, r12, 8
454 lwi r12, r8, 16
455 bslli r9, r12, 24
456 or r9, r11, r9
457 swi r9, r5, 16 /* *(d + 16) */
458 bsrli r11, r12, 8
459 lwi r12, r8, 12
460 bslli r9, r12, 24
461 or r9, r11, r9
462 swi r9, r5, 12 /* *(d + 12) */
463 bsrli r11, r12, 8
464 lwi r12, r8, 8
465 bslli r9, r12, 24
466 or r9, r11, r9
467 swi r9, r5, 8 /* *(d + 8) */
468 bsrli r11, r12, 8
469 lwi r12, r8, 4
470 bslli r9, r12, 24
471 or r9, r11, r9
472 swi r9, r5, 4 /* *(d + 4) */
473 bsrli r11, r12, 8
474 lwi r12, r8, 0
475 bslli r9, r12, 24
476 or r9, r11, r9
477 swi r9, r5, 0 /* *(d + 0) */
478 addi r4, r4, -32 /* n -= 32 */
479 bneid r4, d_bu3_loop /* loop while n != 0 (delayed branch) */
480 bsrli r11, r12, 8 /* h = v >> 8 (executes in the delay slot) */
481 bri d_block_done
482
483d_block_u1: /* source offset of 1 byte */
484 bsrli r11, r11, 24 /* h = h >> 24 */
485d_bu1_loop: /* each step: *(d + k) = h | (v << 8); h = v >> 24 */
486 addi r8, r8, -32 /* as -= 32 */
487 addi r5, r5, -32 /* d -= 32 */
488 lwi r12, r8, 28 /* v = *(as + 28) */
489 bslli r9, r12, 8 /* t1 = v << 8 */
490 or r9, r11, r9 /* t1 = h | t1 */
491 swi r9, r5, 28 /* *(d + 28) = t1 */
492 bsrli r11, r12, 24 /* h = v >> 24, carried into the next lower word */
493 lwi r12, r8, 24
494 bslli r9, r12, 8
495 or r9, r11, r9
496 swi r9, r5, 24 /* *(d + 24) */
497 bsrli r11, r12, 24
498 lwi r12, r8, 20
499 bslli r9, r12, 8
500 or r9, r11, r9
501 swi r9, r5, 20 /* *(d + 20) */
502 bsrli r11, r12, 24
503 lwi r12, r8, 16
504 bslli r9, r12, 8
505 or r9, r11, r9
506 swi r9, r5, 16 /* *(d + 16) */
507 bsrli r11, r12, 24
508 lwi r12, r8, 12
509 bslli r9, r12, 8
510 or r9, r11, r9
511 swi r9, r5, 12 /* *(d + 12) */
512 bsrli r11, r12, 24
513 lwi r12, r8, 8
514 bslli r9, r12, 8
515 or r9, r11, r9
516 swi r9, r5, 8 /* *(d + 8) */
517 bsrli r11, r12, 24
518 lwi r12, r8, 4
519 bslli r9, r12, 8
520 or r9, r11, r9
521 swi r9, r5, 4 /* *(d + 4) */
522 bsrli r11, r12, 24
523 lwi r12, r8, 0
524 bslli r9, r12, 8
525 or r9, r11, r9
526 swi r9, r5, 0 /* *(d + 0) */
527 addi r4, r4, -32 /* n -= 32 */
528 bneid r4, d_bu1_loop /* loop while n != 0 (delayed branch) */
529 bsrli r11, r12, 24 /* h = v >> 24 (executes in the delay slot) */
530 bri d_block_done
531
532d_block_u2: /* source offset of 2 bytes */
533 bsrli r11, r11, 16 /* h = h >> 16 */
534d_bu2_loop: /* each step: *(d + k) = h | (v << 16); h = v >> 16 */
535 addi r8, r8, -32 /* as -= 32 */
536 addi r5, r5, -32 /* d -= 32 */
537 lwi r12, r8, 28 /* v = *(as + 28) */
538 bslli r9, r12, 16 /* t1 = v << 16 */
539 or r9, r11, r9 /* t1 = h | t1 */
540 swi r9, r5, 28 /* *(d + 28) = t1 */
541 bsrli r11, r12, 16 /* h = v >> 16, carried into the next lower word */
542 lwi r12, r8, 24
543 bslli r9, r12, 16
544 or r9, r11, r9
545 swi r9, r5, 24 /* *(d + 24) */
546 bsrli r11, r12, 16
547 lwi r12, r8, 20
548 bslli r9, r12, 16
549 or r9, r11, r9
550 swi r9, r5, 20 /* *(d + 20) */
551 bsrli r11, r12, 16
552 lwi r12, r8, 16
553 bslli r9, r12, 16
554 or r9, r11, r9
555 swi r9, r5, 16 /* *(d + 16) */
556 bsrli r11, r12, 16
557 lwi r12, r8, 12
558 bslli r9, r12, 16
559 or r9, r11, r9
560 swi r9, r5, 12 /* *(d + 12) */
561 bsrli r11, r12, 16
562 lwi r12, r8, 8
563 bslli r9, r12, 16
564 or r9, r11, r9
565 swi r9, r5, 8 /* *(d + 8) */
566 bsrli r11, r12, 16
567 lwi r12, r8, 4
568 bslli r9, r12, 16
569 or r9, r11, r9
570 swi r9, r5, 4 /* *(d + 4) */
571 bsrli r11, r12, 16
572 lwi r12, r8, 0
573 bslli r9, r12, 16
574 or r9, r11, r9
575 swi r9, r5, 0 /* *(d + 0) */
576 addi r4, r4, -32 /* n -= 32 */
577 bneid r4, d_bu2_loop /* loop while n != 0 (delayed branch) */
578 bsrli r11, r12, 16 /* h = v >> 16 (delay slot); then falls through */
579
580d_block_done:
581 addi r4, r0, 4 /* n = 4 */
582 cmpu r4, r4, r7 /* n = c - 4, sign bit reflects the unsigned compare */
583 blti r4,d_xfer_end /* c < 4: no whole word left, copy bytes only */
584
585d_word_xfer:
586 andi r4, r7, 0xfffffffc /* n = c & ~3: bytes covered by whole words */
587 rsub r5, r4, r5 /* d = d - n: start of the word region */
588 rsub r6, r4, r6 /* s = s - n */
589 rsub r7, r4, r7 /* c = c - n */
590
591 andi r9, r6, 3 /* t1 = s & 3 */
592
593 bnei r9, d_word_unaligned /* source not word aligned: merge path */
594
595d_word_aligned: /* n doubles as the index, counting down to 0 */
596 addi r4, r4,-4 /* n -= 4 */
597 lw r9, r6, r4 /* t1 = *(s + n) */
598 bneid r4, d_word_aligned /* loop while n != 0 (delayed branch) */
599 sw r9, r5, r4 /* *(d + n) = t1 (executes in the delay slot) */
600
601 bri d_word_done
602
603d_word_unaligned:
604 andi r8, r6, 0xfffffffc /* as = s & ~3 */
605 lw r11, r8, r4 /* h = *(as + n): word above the region */
606
607 addi r9, r9, -1
608 beqi r9,d_word_u1 /* t1 was 1: source offset of 1 byte */
609 addi r9, r9, -1
610 beqi r9,d_word_u2 /* t1 was 2: source offset of 2 bytes */
611
612d_word_u3: /* otherwise t1 was 3: source offset of 3 bytes */
613 bsrli r11, r11, 8 /* h = h >> 8 */
614d_wu3_loop:
615 addi r4, r4,-4 /* n -= 4 */
616 lw r12, r8, r4 /* v = *(as + n) */
617 bslli r9, r12, 24 /* t1 = v << 24 */
618 or r9, r11, r9 /* t1 = h | t1 */
619 sw r9, r5, r4 /* *(d + n) = t1 */
620 bneid r4, d_wu3_loop /* loop while n != 0 (delayed branch) */
621 bsrli r11, r12, 8 /* h = v >> 8 (executes in the delay slot) */
622
623 bri d_word_done
624
625d_word_u1: /* source offset of 1 byte */
626 bsrli r11, r11, 24 /* h = h >> 24 */
627d_wu1_loop:
628 addi r4, r4,-4 /* n -= 4 */
629 lw r12, r8, r4 /* v = *(as + n) */
630 bslli r9, r12, 8 /* t1 = v << 8 */
631 or r9, r11, r9 /* t1 = h | t1 */
632 sw r9, r5, r4 /* *(d + n) = t1 */
633 bneid r4, d_wu1_loop /* loop while n != 0 (delayed branch) */
634 bsrli r11, r12, 24 /* h = v >> 24 (executes in the delay slot) */
635
636 bri d_word_done
637
638d_word_u2: /* source offset of 2 bytes */
639 bsrli r11, r11, 16 /* h = h >> 16 */
640d_wu2_loop:
641 addi r4, r4,-4 /* n -= 4 */
642 lw r12, r8, r4 /* v = *(as + n) */
643 bslli r9, r12, 16 /* t1 = v << 16 */
644 or r9, r11, r9 /* t1 = h | t1 */
645 sw r9, r5, r4 /* *(d + n) = t1 */
646 bneid r4, d_wu2_loop /* loop while n != 0 (delayed branch) */
647 bsrli r11, r12, 16 /* h = v >> 16 (delay slot); then falls through */
648
649d_word_done: /* d and s were already lowered by n before the word loops */
650
651d_xfer_end:
652d_xfer_end_loop: /* copy the remaining c bytes one at a time, backwards */
653 beqi r7, a_done /* c == 0: return through a_done, defined in memcpy */
654 addi r6, r6, -1 /* s-- */
655 lbui r9, r6, 0 /* t1 = *s */
656 addi r5, r5, -1 /* d-- */
657 sbi r9, r5, 0 /* *d = t1 */
658 brid d_xfer_end_loop /* loop (delayed branch) */
659 addi r7, r7, -1 /* c-- (executes in the delay slot) */
660
661d_done: /* no branch in this listing targets d_done; the loop above exits via a_done */
662 rtsd r15, 8 /* return to r15 + 8 (delayed) */
663 nop /* delay slot of rtsd */
664
665.size memmove, . - memmove
666.end memmove
667