/*
* Copyright 2013 Harm Hanemaaijer <fgenfb@yahoo.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*/
#include "kernel_defines.h"
.text
.syntax unified
#ifdef NEON_MEMORY_FUNCTIONS
.fpu neon
#endif
ARM( .p2align 5 )
THUMB( .p2align 2 )
/*
* The following memcpy implementation is optimized with a fast path
* for common, word-aligned cases and optionally uses unaligned access for
* small sizes.
*
* - line_size is the cache line size used for prefetches. Must be 64 or 32.
* - prefetch_distance is the number of cache lines to look ahead and must be
* >= 2.
* - write_align is the write alignment enforced before the main loop for larger
* sizes (word-aligned case) and must be 0, 8, 16, 32, or 64.
* - aligned_access must be 0 or 1. When enabled, no unaligned memory accesses
* will occur. Neither of the small-size thresholds for unaligned access
* is used in this case.
* - use_neon must be 0 or 1. When enabled, NEON is used for block copy in the
* main loop for larger sizes.
*/
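/*
 * Illustrative instantiation (a sketch only; the actual entry points and
 * per-CPU parameter choices are defined outside this file, and the
 * ENTRY/ENDPROC linkage macros are assumed from the kernel headers):
 *
 *     ENTRY(memcpy)
 *         memcpy_variant 32, 3, 16, 16, 0, 0
 *     ENDPROC(memcpy)
 *
 * This would select a 32-byte cache line, a prefetch distance of 3 lines,
 * 16-byte write alignment for both the aligned and the unaligned path,
 * no aligned-access restriction, and no NEON.
 */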
/* The threshold size for using the fast path for the word-aligned case. */
#define FAST_PATH_THRESHOLD 256
/* The threshold size for using the small size path for the word-aligned case. */
#define SMALL_SIZE_THRESHOLD 15
/*
* The threshold size for using the small size path for the unaligned case.
* Unaligned memory accesses will be generated for requests smaller than or equal to
* this size.
*/
#define UNALIGNED_SMALL_SIZE_THRESHOLD 64
/*
* The threshold size for using the small size path when both the source and
* the destination are unaligned. Unaligned memory accesses will be generated
* for requests smaller than or equal to this size.
*/
#define BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD 32
/*
* For a code-reduced version, define all four of the above constants as 0,
* eliminating the fast path and the small-size special cases. With Thumb2
* enabled, this resulted in a reduction in code size from 1150 to 824 bytes,
* at the cost of lower performance for smaller sizes.
*/
// #define FAST_PATH_THRESHOLD 0
// #define SMALL_SIZE_THRESHOLD 0
// #define UNALIGNED_SMALL_SIZE_THRESHOLD 0
// #define BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD 0
/*
* EARLY_PREFETCHES is used in the fast path implementation.
* The optimal value for EARLY_PREFETCHES was determined empirically.
* It is equal to 4 for line_size 32, and equal to 2 for line_size 64.
*/
#define EARLY_PREFETCHES (4 * (2 - \line_size / 32) \
+ 2 * (\line_size / 32 - 1))
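/*
 * Worked out: for line_size 32 the expression gives
 * 4 * (2 - 1) + 2 * (1 - 1) = 4, and for line_size 64 it gives
 * 4 * (2 - 2) + 2 * (2 - 1) = 2, matching the values stated above.
 */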
#if FAST_PATH_THRESHOLD > 0
#define FAST_PATH(instr...) instr
#define NO_FAST_PATH(instr...)
#else
#define FAST_PATH(instr...)
#define NO_FAST_PATH(instr...) instr
#endif
/* Helper macro for the fast-path implementation. */
.macro copy_16_bytes bytes_to_go, line_size, prefetch_distance
#ifdef CONFIG_THUMB2_KERNEL
/*
* When Thumb2 mode is enabled, the ldmia/stmia instructions
* will be 16-bit, and the preload instruction will be
* 32-bit, so we only need one 32-bit wide nop instruction
* when there's no preload, for a total size of two words.
*/
.if \bytes_to_go >= (EARLY_PREFETCHES * \line_size) && \
(\bytes_to_go % \line_size) == 0
pld [r1, ip]
ldmia r1!, {r3, r4, r5, r6}
stmia r0!, {r3, r4, r5, r6}
.else
ldmia r1!, {r3, r4, r5, r6}
W( nop )
stmia r0!, {r3, r4, r5, r6}
.endif
#else
/*
* When ARM mode is enabled, every instruction is one word,
* so make sure the entire block is four instructions.
*/
.if \bytes_to_go >= (EARLY_PREFETCHES * \line_size) && \
(\bytes_to_go % \line_size) == 0
pld [r1, ip]
.else
nop
.endif
ldmia r1!, {r3, r4, r5, r6}
nop
stmia r0!, {r3, r4, r5, r6}
#endif
.endm
/* Helper macro implementing unaligned copy. */
.macro unaligned_copy pullshift, pushshift, line_size, prefetch_distance, \
write_align, aligned_access
/*
* ip is the aligned source base address.
* r3 is a word of data from the source.
*/
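/*
 * The pullbits/pushbits operations (assumed to come from
 * kernel_defines.h, following the kernel's pull/push convention) expand
 * to lsr/lsl on little-endian and lsl/lsr on big-endian. For example,
 * with the source one byte past a word boundary (pullshift 8,
 * pushshift 24, little-endian), each destination word is assembled as
 *
 *     dest = (prev_word >> 8) | (next_word << 24)
 *
 * i.e. the top three bytes of the previous aligned source word joined
 * with the bottom byte of the next one.
 */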
.if \write_align > 0
cmp r2, #(32 + \write_align - 4)
.else
cmp r2, #32
.endif
push {r5}
blt 55f
subs r2, r2, #32
/* Handle write alignment. */
.if \write_align > 0
.if \write_align == 8
tst r0, #4
mov r4, r3, pullbits #\pullshift
ldrne r3, [r1], #4
subne r2, r2, #4
orrne r4, r4, r3, pushbits #\pushshift
strne r4, [r0], #4
.else
ands r5, r0, #(\write_align - 1)
rsb r5, r5, #\write_align
beq 59f
sub r2, r2, r5
58: movs r4, r3, pullbits #\pullshift
ldr r3, [r1], #4
subs r5, r5, #4
orr r4, r4, r3, pushbits #\pushshift
str r4, [r0], #4
bgt 58b
59:
.endif
.endif
/*
* Assume a preload at aligned base + line_size will
* be useful.
*/
pld [ip, #\line_size]
push {r6-r11}
mov r11, r3
mov r4, ip
add r5, r1, #(\prefetch_distance * \line_size)
subs r2, r2, #(\prefetch_distance * \line_size)
bic r3, r5, #31
add r4, r4, #(2 * \line_size)
blt 54f
cmp r4, r3
sub ip, r3, r1
/*
* "Catch-up" the early preloads (which have been performed up
* to aligned source address + line_size) to the preload offset
* used in the main loop.
*/
bge 52f
51: adds r4, r4, #\line_size /* Thumb16 */
cmp r4, r3
pld [r4, #(- \line_size)]
blt 51b
52:
/*
* Note that when line_size is 64, we are
* prefetching every 32 bytes. Although not optimal,
* there doesn't seem to be a big penalty for the extra
* preload instructions, and it avoids greater
* code size and complexity.
*/
53: pld [r1, ip]
54:
ldmia r1!, {r4-r7}
mov r3, r11, pullbits #\pullshift
ldmia r1!, {r8-r11}
orr r3, r3, r4, pushbits #\pushshift
movs r4, r4, pullbits #\pullshift /* Thumb16 */
orr r4, r4, r5, pushbits #\pushshift
movs r5, r5, pullbits #\pullshift /* Thumb16 */
orr r5, r5, r6, pushbits #\pushshift
movs r6, r6, pullbits #\pullshift /* Thumb16 */
orr r6, r6, r7, pushbits #\pushshift
movs r7, r7, pullbits #\pullshift /* Thumb16 */
orr r7, r7, r8, pushbits #\pushshift
mov r8, r8, pullbits #\pullshift
orr r8, r8, r9, pushbits #\pushshift
mov r9, r9, pullbits #\pullshift
orr r9, r9, r10, pushbits #\pushshift
mov r10, r10, pullbits #\pullshift
orr r10, r10, r11, pushbits #\pushshift
subs r2, r2, #32
stmia r0!, {r3-r10}
bge 53b
cmn r2, #(\prefetch_distance * \line_size)
bge 54b
/* Correct the count. */
adds r2, r2, #(\prefetch_distance * \line_size + 32)
mov r3, r11
pop {r6-r11}
55: bics r5, r2, #3
beq 57f
56: movs r4, r3, pullbits #\pullshift
ldr r3, [r1], #4
subs r5, r5, #4
orr r4, r4, r3, pushbits #\pushshift
str r4, [r0], #4
bgt 56b
57: pop {r5}
pop {r4}
subs r1, r1, #(\pushshift / 8)
.if \aligned_access == 1
b 7b
.else
b 3b
.endif
.endm
/* The main memcpy function macro. */
.macro memcpy_variant line_size, prefetch_distance, write_align, \
unaligned_write_align, aligned_access, use_neon
.if \aligned_access == 1
cmp r2, #3
.else
NO_FAST_PATH( cmp r2, #3 )
.endif
orr r3, r0, r1
.if \aligned_access == 1
push {r0}
ble 7f
.else
NO_FAST_PATH( push {r0} )
NO_FAST_PATH( ble 3f )
.endif
bic ip, r1, #(\line_size - 1)
tst r3, #3
pld [ip]
.if \aligned_access == 1
FAST_PATH( bne 30f )
.else
FAST_PATH( push {r0} )
FAST_PATH( bne 7f ) /* Unaligned source or destination. */
.endif
FAST_PATH( cmp r2, #FAST_PATH_THRESHOLD )
FAST_PATH( bgt 10f )
NO_FAST_PATH( bne 30f )
#if FAST_PATH_THRESHOLD == 0
/*
* When the fast path is disabled, check whether there are
* enough bytes for alignment, and jump to the main handling
* code for larger sizes.
*/
.if \write_align > 0
cmp r2, #(\write_align - 4)
bge 10f
.endif
push {r4}
b 18f
#endif
/*
* Fast path for aligned copies of size <= FAST_PATH_THRESHOLD.
*/
#if FAST_PATH_THRESHOLD > 0
#if SMALL_SIZE_THRESHOLD == 15
bics r3, r2, #15
pld [ip, #\line_size]
/* Jump for small sizes <= 15 bytes. */
beq 5f
#else
cmp r2, #SMALL_SIZE_THRESHOLD
pld [ip, #\line_size]
/* Jump for small sizes <= SMALL_SIZE_THRESHOLD bytes. */
ble 5f
bic r3, r2, #15
#endif
9: /*
* This is the entry-point into the fast path from
* an unaligned request that has been aligned.
*/
push {r4, r5, r6}
/*
* Use a heuristic to determine whether the preload
* at aligned_base + 2 * line_size will be useful.
*/
.if EARLY_PREFETCHES >= 3
cmp r2, #(2 * \line_size - \line_size / 2)
.endif
add r5, ip, #(EARLY_PREFETCHES * \line_size)
.if EARLY_PREFETCHES >= 3
blt 1f
.endif
.if EARLY_PREFETCHES == 3
pld [ip, #(2 * \line_size)]
.endif
.if EARLY_PREFETCHES == 4
cmp r2, #(3 * \line_size - \line_size / 2)
pld [ip, #(2 * \line_size)]
blt 1f
pld [ip, #(3 * \line_size)]
.endif
.if EARLY_PREFETCHES == 5
cmp r2, #(3 * \line_size - \line_size / 2)
pld [ip, #(2 * \line_size)]
blt 1f
cmp r2, #(4 * \line_size - \line_size / 2)
pld [ip, #(3 * \line_size)]
blt 1f
pld [ip, #(4 * \line_size)]
.endif
1: /*
* Set r5 so that the next preload will occur
* exactly at aligned_base + EARLY_PREFETCHES *
* line_size. For example, if line_size is 64
* and the number of bytes is 240, the next preload
* will occur after processing 48 bytes, which is derived
* from the formula r3 & (line_size - 1),
* where r3 is equal to number_of_bytes & (~15).
*/
rsb r4, r3, #256
subs r5, r5, r1
and ip, r3, #(\line_size - 1)
subs r2, r2, r3 /* Thumb16 */
THUMB( lsrs r4, r4, #1 ) /* Thumb16 */
sub ip, r5, ip
add pc, pc, r4
nop
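/*
 * The add pc, pc, r4 above is a computed jump into the block of
 * copy_16_bytes expansions that follows. Each expansion is padded to a
 * fixed size (16 bytes in ARM mode, 8 bytes in Thumb2, hence the lsrs
 * halving r4 above), so r4 = 256 - (size & ~15) skips exactly the
 * expansions that are not needed. For example, for a 240-byte request
 * r4 is 16, one expansion is skipped, and execution enters at the
 * ">= 240 bytes to go" block. The nop accounts for the pipeline offset
 * implied by reading pc.
 */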
/* >= 256 bytes to go. */
copy_16_bytes 256, \line_size, \prefetch_distance
/* >= 240 bytes to go. */
copy_16_bytes 240, \line_size, \prefetch_distance
/* >= 224 bytes to go. */
copy_16_bytes 224, \line_size, \prefetch_distance
/* >= 208 bytes to go. */
copy_16_bytes 208, \line_size, \prefetch_distance
/* >= 192 bytes to go. */
copy_16_bytes 192, \line_size, \prefetch_distance
/* >= 176 bytes to go. */
copy_16_bytes 176, \line_size, \prefetch_distance
/* >= 160 bytes to go. */
copy_16_bytes 160, \line_size, \prefetch_distance
/* >= 144 bytes to go. */
copy_16_bytes 144, \line_size, \prefetch_distance
/* >= 128 bytes to go. */
copy_16_bytes 128, \line_size, \prefetch_distance
/* >= 112 bytes to go. */
copy_16_bytes 112, \line_size, \prefetch_distance
/* >= 96 bytes to go. */
copy_16_bytes 96, \line_size, \prefetch_distance
/* >= 80 bytes to go. */
copy_16_bytes 80, \line_size, \prefetch_distance
/* >= 64 bytes to go. */
copy_16_bytes 64, \line_size, \prefetch_distance
/* >= 48 bytes to go. */
copy_16_bytes 48, \line_size, \prefetch_distance
/* >= 32 bytes to go. */
copy_16_bytes 32, \line_size, \prefetch_distance
/* At this point there are 16 to 31 bytes to go. */
tst r2, #15
ldmia r1!, {r3, r4, r5, r6}
cmpne r2, #8
/*
* If r2 == 8, we need to clear the eq flag while
* making sure carry remains set.
*/
tsteq r2, #15
stmia r0!, {r3, r4, r5, r6}
/*
* The equal flag is set if there are no bytes left.
* The carry flag is set if there are >= 8 bytes left.
*/
pop {r4, r5, r6}
beq 4f
2:
/*
* ARM mode imposes restrictions on the registers used
* in double-word loads and stores, so we have to use
* single-word operations.
*/
.if \aligned_access == 0
ARM( ldrcs r3, [r1], #4 )
ARM( ldrcs ip, [r1], #4 )
ARM( strcs r3, [r0], #4 )
ARM( strcs ip, [r0], #4 )
THUMB( ldrdcs r3, ip, [r1], #8 )
THUMB( strdcs r3, ip, [r0], #8 )
.else
ldrcs r3, [r1], #4
ldrcs ip, [r1], #4
strcs r3, [r0], #4
strcs ip, [r0], #4
.endif
tst r2, #4
ldrne ip, [r1], #4
strne ip, [r0], #4
tst r2, #3
popeq {r0}
BXEQLR
/*
* Handle the last up to three bytes. Unaligned access
* may take place if the source or destination is not
* half-word aligned.
*/
3: movs r2, r2, lsl #31
ldrhcs r3, [r1], #2
strhcs r3, [r0], #2
ldrbne r3, [r1], #1
strbne r3, [r0], #1
4: pop {r0}
BXLR
5: /*
* Sizes <= SMALL_SIZE_THRESHOLD bytes, both source and
* destination aligned.
*/
#if SMALL_SIZE_THRESHOLD <= 15
cmp r2, #8 /* cs if r2 >= 8. */
b 2b
#else
101: tst r2, #4
ldrne r3, [r1], #4
subne r2, r2, #4
strne r3, [r0], #4
cmp r2, #8
blt 3b
6: cmp r2, #16
#ifdef NEON_MEMORY_FUNCTIONS
vld1.8 {d0}, [r1]!
sub r2, r2, #8
vst1.8 {d0}, [r0]!
#else
ldr r3, [r1], #4
ldr ip, [r1], #4
str r3, [r0], #4
sub r2, r2, #8
str ip, [r0], #4
#endif
bge 6b
cmp r2, #0
popeq {r0}
BXEQLR
b 3b
#endif
#endif /* FAST_PATH_THRESHOLD > 0 */
.if \aligned_access == 1
/*
* Handle the last up to three bytes avoiding
* unaligned memory access.
*/
7: movs r2, r2, lsl #31
ldrbcs r3, [r1], #1
ldrbcs ip, [r1], #1
strbcs r3, [r0], #1
strbcs ip, [r0], #1
ldrbne r3, [r1], #1
strbne r3, [r0], #1
pop {r0}
BXLR
.endif
#if FAST_PATH_THRESHOLD > 0
.if \aligned_access == 0
7: /*
* Unaligned source or destination. There are separate
* small-size thresholds for the case where both the source
* and the destination are unaligned and the case where only
* one of them is.
*/
tst r0, #3
mov r3, #UNALIGNED_SMALL_SIZE_THRESHOLD
tstne r1, #3
movne r3, #BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD
cmp r2, r3
bgt 30f
/* Small sizes, unaligned case. Use single-word loads/stores. */
#if SMALL_SIZE_THRESHOLD >= 16
/* Use the identical code path already defined above. */
b 101b
#else
tst r2, #4
ldrne r3, [r1], #4
subne r2, r2, #4
strne r3, [r0], #4
cmp r2, #8
blt 3b
8: cmp r2, #16
#ifdef NEON_MEMORY_FUNCTIONS
vld1.8 {d0}, [r1]!
sub r2, r2, #8
vst1.8 {d0}, [r0]!
#else
ldr r3, [r1], #4
ldr ip, [r1], #4
str r3, [r0], #4
sub r2, r2, #8
str ip, [r0], #4
#endif
bge 8b
b 3b
#endif /* SMALL_SIZE_THRESHOLD < 16 */
.endif
#endif /* FAST_PATH_THRESHOLD > 0 */
10: /*
* This is the start of the handling of larger sizes for
* aligned copies.
*
* Size > FAST_PATH_THRESHOLD (256).
* ip is the line_sized aligned source address for preloads.
*/
.if \write_align >= 16
ands r3, r0, #(\write_align - 1)
push {r4}
rsb r3, r3, #\write_align
beq 17f
push {lr}
bl 20f
pop {lr}
17:
.elseif \write_align == 8
/*
* For write alignment of 8, it is quickest to do a simple
* conditional load/store.
*/
tst r0, #4
push {r4}
ldrne r3, [r1], #4
subne r2, r2, #4
strne r3, [r0], #4
.else
push {r4}
.endif
18:
.if (FAST_PATH_THRESHOLD - (\write_align - 4)) < \line_size
cmp r2, #\line_size
blt 15f
.endif
subs r2, r2, #\line_size
16: /*
* This is the entry-point when source and destination were
* initially unaligned but are now aligned because they had
* the same alignment within a word. Write alignment and
* the size check have already been handled.
*/
.if \use_neon == 1
push {r5}
.else
push {r5-r11}
.endif
/*
* Assume a preload at aligned base + line_size will
* be useful.
*/
mov r4, ip
pld [ip, #\line_size]
add r5, r1, #(\prefetch_distance * \line_size)
subs r2, r2, #(\prefetch_distance * \line_size)
bic r3, r5, #(\line_size - 1)
add r4, r4, #(2 * \line_size)
blt 14f
cmp r4, r3
sub ip, r3, r1
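/*
 * ip now holds the offset from r1 of the line-aligned address
 * prefetch_distance lines ahead, so pld [r1, ip] in the loop below
 * always targets an aligned line. Illustrative values: with line_size
 * 32, prefetch_distance 2 and r1 = 0x1004, we get r5 = 0x1044,
 * r3 = 0x1040 and ip = 0x3c; as r1 advances in 32-byte steps,
 * r1 + ip stays 32-byte aligned.
 */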
/*
* "Catch-up" the early preloads (which have been performed up
* to aligned source address + line_size) to the preload offset
* used in the main loop.
*/
bge 12f
11: adds r4, r4, #\line_size /* Thumb16 */
cmp r4, r3
pld [r4, #(- \line_size)]
blt 11b
12:
/*
* The main loop for large sizes. Copy 32 bytes at a time
* using ldmia/stmia while prefetching a 32-byte aligned
* address for line size 32, or 64 bytes at a time while
* prefetching a 64-byte aligned address for line size 64.
*/
13: pld [r1, ip]
14:
.if \line_size == 32
ldmia r1!, {r4-r7}
subs r2, r2, #32
ldmia r1!, {r8-r11}
stmia r0!, {r4-r7}
stmia r0!, {r8-r11}
.else
.if \use_neon == 1
/*
* Since the destination is 32-byte aligned,
* specify 256-bit alignment for the NEON stores.
* It should be possible to specify 32-bit
* alignment for the NEON loads if the A bit is 0,
* but gas doesn't accept it.
*/
vld1.32 {d0-d3}, [r1]!
vld1.32 {d4-d7}, [r1]!
vst1.32 {d0-d3}, [r0 :256]!
subs r2, r2, #64
vst1.32 {d4-d7}, [r0 :256]!
.else
ldmia r1!, {r4-r11}
subs r2, r2, #64
stmia r0!, {r4-r11}
ldmia r1!, {r4-r11}
stmia r0!, {r4-r11}
.endif
.endif
bge 13b
cmn r2, #(\prefetch_distance * \line_size)
bge 14b
/* Correct the count. */
adds r2, r2, #((\prefetch_distance + 1) * \line_size)
.if \use_neon == 1
pop {r5}
.else
pop {r5-r11}
.endif
15: ands r3, r2, #60
.if \write_align <= 8
/*
* When the subroutine is not used for write alignment, the
* subroutine will only be called once, so branch without
* linking.
*/
bne 20f
19:
.else
mov ip, lr
blne 20f
mov lr, ip
.endif
pop {r4}
#if FAST_PATH_THRESHOLD > 0
cmp r2, #0
bne 3b
#else
ARM( cmp r2, #0 )
ARM( beq 4f )
THUMB( cbz r2, 4f )
/* Handle the last up to three bytes. */
3: movs r2, r2, lsl #31
ldrhcs r3, [r1], #2
strhcs r3, [r0], #2
ldrbne r3, [r1], #1
strbne r3, [r0]
4:
#endif
pop {r0}
BXLR
/*
* Subroutine that copies r3 bytes, where r3 is a multiple of
* 4 ranging from 0 to 32 or 64 bytes. r2 is decremented by
* the number of bytes copied.
*/
20: tst r3, #4
sub r2, r2, r3
ldrne r4, [r1], #4
subne r3, r3, #4
strne r4, [r0], #4
.if \write_align <= 32 && \line_size == 32
rsb r3, r3, #32
.else
rsb r3, r3, #64
.endif
/*
* These ldmia/stmia instructions are 16-bit on Thumb2,
* 32-bit on ARM.
*/
THUMB( lsrs r3, r3, #1 )
add pc, pc, r3
nop
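/*
 * Computed jump: each ldmia/stmia pair below copies 8 bytes and
 * occupies 8 bytes of ARM code (4 bytes of Thumb2 code, hence the lsrs
 * above), so after the rsb above, adding r3 to pc skips exactly the
 * pairs that are not needed for the remaining byte count.
 */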
ldmia r1!, {r3, r4}
stmia r0!, {r3, r4}
ldmia r1!, {r3, r4}
stmia r0!, {r3, r4}
ldmia r1!, {r3, r4}
stmia r0!, {r3, r4}
ldmia r1!, {r3, r4}
stmia r0!, {r3, r4}
.if \write_align > 32 || \line_size > 32
ldmia r1!, {r3, r4}
stmia r0!, {r3, r4}
ldmia r1!, {r3, r4}
stmia r0!, {r3, r4}
ldmia r1!, {r3, r4}
stmia r0!, {r3, r4}
ldmia r1!, {r3, r4}
stmia r0!, {r3, r4}
.endif
.if \write_align <= 8
b 19b
.else
mov pc, lr
.endif
30: /*
* Unaligned case. Align the destination.
* Number of bytes is > UNALIGNED_SMALL_SIZE_THRESHOLD.
* Note: This may use unaligned access.
* ip is the line_size aligned source address for preloads.
*/
ands r3, r0, #3
push {r4}
andeq r3, r1, #3
beq 40f /* Destination is aligned but source is not. */
/* Align the destination. */
cmp r3, #2
.if \aligned_access == 1
ldrble r4, [r1], #1
ldrble r3, [r1], #1
suble r2, r2, #2
strble r4, [r0], #1
strble r3, [r0], #1
.else
ldrhle r4, [r1], #2
suble r2, r2, #2
strhle r4, [r0], #2
.endif
ldrbne r4, [r1], #1
subne r2, r2, #1
strbne r4, [r0], #1
ands r3, r1, #3
bne 40f /* Destination is aligned but source is not. */
#if 0 && FAST_PATH_THRESHOLD > 0
/*
* Source and destination are now aligned.
* Now recreate the situation of a word-aligned memcpy
* with the current source and destination,
* which may require an extra preload instruction.
*
* This path is currently disabled disabled in favour
* of the one below this which does write alignment and
* jumps into the main loop for larger sizes.
*/
bic r3, r1, #(\line_size - 1)
pop {r4}
cmp r3, ip
THUMB( pldne [r3] )
THUMB( cmp r2, #FAST_PATH_THRESHOLD )
THUMB( mov ip, r3 )
ARM( beq 31f )
ARM( pld [r3] )
ARM( mov ip, r3 )
31: ARM( cmp r2, #FAST_PATH_THRESHOLD )
bgt 10b
/*
* Recreate the fast-path small-size check here,
* but only if it is necessary.
*/
.if (BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD + 1 - 3) <= SMALL_SIZE_THRESHOLD || \
\aligned_access == 1
cmp r2, #SMALL_SIZE_THRESHOLD
pld [ip, #\line_size]
/* Jump for small sizes <= SMALL_SIZE_THRESHOLD bytes. */
ble 5b
.else
pld [ip, #\line_size]
.endif
bic r3, r2, #15
b 9b
#else
/*
* Source and destination are now aligned. Check carefully
* whether there are enough bytes to do alignment.
*/
.if \write_align > 0
.if (BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD + 1 - 3) < (\write_align - 4) \
|| \aligned_access == 1
cmp r2, #(\write_align - 4)
blt 31f
.endif
.if \write_align == 8
/*
* For write alignment of 8, it is quickest to do a simple
* conditional load/store.
*/
tst r0, #4
ldrne r3, [r1], #4
subne r2, r2, #4
strne r3, [r0], #4
.else
ands r3, r0, #(\write_align - 1)
rsb r3, r3, #\write_align
beq 31f
push {lr}
bl 20b
pop {lr}
.endif
31: /*
* Check whether there are enough bytes to do one iteration
* of the main loop.
*/
.if (BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD + 1 - 3 - (\write_align - 4)) < \
\line_size || \aligned_access == 1
cmp r2, #\line_size
blt 15b
.endif
subs r2, r2, #\line_size
.else
/*
* No write alignment. Only have to check for enough bytes to
* do one iteration of the main loop.
*/
.if (BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD + 1 - 3) < \line_size \
|| \aligned_access == 1
cmp r2, #\line_size
blt 15b
.endif
subs r2, r2, #\line_size
.endif
b 16b
#endif
40: /*
* Unaligned case. Size is > SMALL_SIZE_THRESHOLD - 3.
*/
.if \use_neon == 0
bic r1, r1, #3
cmp r3, #2
ldr r3, [r1], #4
beq 41f
bgt 42f
unaligned_copy 8, 24, \line_size, \prefetch_distance, \
\unaligned_write_align, \aligned_access
41: unaligned_copy 16, 16, \line_size, \prefetch_distance, \
\unaligned_write_align, \aligned_access
42: unaligned_copy 24, 8, \line_size, \prefetch_distance, \
\unaligned_write_align, \aligned_access
.else /* use_neon == 1 */
/*
* NEON unaligned copy. The destination has been aligned
* to a word boundary, size > SMALL_SIZE_THRESHOLD - 3.
*/
cmp r2, #64
blt 56f
subs r2, r2, #64
/* Align to a 32-byte boundary. */
ands r4, r0, #31
rsb r4, r4, #32
beq 50f
tst r4, #4
ldrne r3, [r1 :8], #4 /* Unaligned access. */
sub r2, r2, r4
strne r3, [r0 :32], #4
tst r4, #8
vld1ne.8 {d0}, [r1]!
vst1ne.8 {d0}, [r0 :64]!
cmp r4, #16
vld1ge.8 {d0, d1}, [r1]!
vst1ge.8 {d0, d1}, [r0 :128]!
cmp r2, #0
addlt r2, r2, #64
blt 55f
50:
/*
* Assume a preload at aligned base + line_size will
* be useful.
*/
pld [ip, #\line_size]
push {r5}
mov r4, ip
add r5, r1, #(\prefetch_distance * \line_size)
subs r2, r2, #(\prefetch_distance * \line_size)
bic r3, r5, #31
add r4, r4, #(2 * \line_size)
blt 54f
cmp r4, r3
sub ip, r3, r1
/*
* "Catch-up" the early preloads (which have been performed up
* to aligned source address + line_size) to the preload offset
* used in the main loop.
*/
bge 52f
51: adds r4, r4, #\line_size /* Thumb16 */
cmp r4, r3
pld [r4, #(- \line_size)]
blt 51b
52:
/*
* Process 64 unaligned bytes from source at a time and copy
* them to the 32-byte aligned destination.
*/
53: pld [r1, ip]
54: vld1.8 {d0-d3}, [r1]!
vld1.8 {d4-d7}, [r1]!
vst1.64 {d0-d3}, [r0 :256]!
subs r2, r2, #64
vst1.64 {d4-d7}, [r0 :256]!
bge 53b
cmn r2, #(\prefetch_distance * \line_size)
bge 54b
/* Correct the count. */
adds r2, r2, #(\prefetch_distance * \line_size + 64)
pop {r5}
/* Process the last 0-63 bytes. */
55: cmp r2, #32
vld1ge.8 {d0-d3}, [r1]!
vst1ge.8 {d0-d3}, [r0 :128]!
tst r2, #16
vld1ne.8 {d0, d1}, [r1]!
vst1ne.8 {d0, d1}, [r0 :128]!
tst r2, #8
vld1ne.8 {d0}, [r1]!
vst1ne.8 {d0}, [r0 :64]!
tst r2, #4
ldrne r3, [r1 :8], #4 /* Unaligned access. */
strne r3, [r0 :32], #4
pop {r4}
.if \aligned_access == 1
b 7b
.else
b 3b
.endif
/* Handle small sizes of 0-63 bytes, unaligned. */
56: cmp r2, #32
vld1ge.8 {d0-d3}, [r1]!
/*
* Assembler doesn't accept :32 alignment for
* the following instruction.
*/
vst1ge.8 {d0-d3}, [r0]!
tst r2, #16