/* SPDX-License-Identifier: GPL-2.0-only */

/* This code originates from Linux 5.19 */

/*
 * Implement memmove(). This can handle overlap between src and dst.
 *
 * Input:
 * rdi: dest
 * rsi: src
 * rdx: count
 *
 * Output:
 * rax: dest
 */
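/*
 * Equivalent C prototype (System V AMD64 calling convention):
 *   void *memmove(void *dest, const void *src, size_t n);
 */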
.global memmove
memmove:

	mov %rdi, %rax

	/* Decide forward/backward copy mode */
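	/*
	 * If src >= dest, a forward copy is always safe. If src < dest but
	 * the source ends at or before dest (src + count <= dest), the
	 * regions are disjoint and a forward copy is still safe. Only when
	 * src < dest and src + count > dest must we copy backward (at 2),
	 * so that no source byte is overwritten before it has been read.
	 */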
	cmp %rdi, %rsi
	jge .Lmemmove_begin_forward
	mov %rsi, %r8
	add %rdx, %r8
	cmp %rdi, %r8
	jg 2f

	/* Unlike Linux, do not optimize for the FSRM and ERMS features */
.Lmemmove_begin_forward:
	cmp $0x20, %rdx
	jb 1f
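	/* Fewer than 32 bytes: handled by the common tail code at 1 below. */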

	/*
	 * rep movsq has a significant startup latency, so sizes below 680
	 * bytes are copied with general-purpose register moves instead.
	 */
	cmp $680, %rdx
	jb 3f
	/*
	 * rep movsq only pays off when src and dest have the same alignment
	 * (their low address bytes compare equal).
	 */

	cmpb %dil, %sil
	je 4f
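	/*
	 * 3: unrolled forward copy, 32 bytes per iteration, using general-
	 * purpose registers. Taken for copies below 680 bytes and for large
	 * copies where src and dest are not equally aligned.
	 */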
3:
	sub $0x20, %rdx
	/*
	 * We gobble 32 bytes forward in each loop.
	 */
5:
	sub $0x20, %rdx
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r10
	movq 2*8(%rsi), %r9
	movq 3*8(%rsi), %r8
	leaq 4*8(%rsi), %rsi

	movq %r11, 0*8(%rdi)
	movq %r10, 1*8(%rdi)
	movq %r9, 2*8(%rdi)
	movq %r8, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae 5b
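	/*
	 * The sub at the top of the loop borrowed: fewer than 32 bytes
	 * remain. Add the 0x20 back so %rdx holds the residual count for
	 * the tail code at 1.
	 */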
	addq $0x20, %rdx
	jmp 1f
	/*
	 * Handle data forward by movsq.
	 */
	.p2align 4
4:
	movq %rdx, %rcx
	movq -8(%rsi, %rdx), %r11
	lea -8(%rdi, %rdx), %r10
	shrq $3, %rcx
	rep movsq
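	/*
	 * %rdx need not be a multiple of 8: the last qword of the source,
	 * saved in %r11 before the copy started, is stored over the final
	 * 8 bytes of dest to cover the remaining bytes.
	 */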
	movq %r11, (%r10)
	jmp 13f
.Lmemmove_end_forward:

	/*
	 * Handle data backward by movsq.
	 */
	.p2align 4
7:
	movq %rdx, %rcx
	movq (%rsi), %r11
	movq %rdi, %r10
	leaq -8(%rsi, %rdx), %rsi
	leaq -8(%rdi, %rdx), %rdi
	shrq $3, %rcx
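	/*
	 * Copy backward: with DF set, rep movsq decrements %rsi/%rdi after
	 * each qword. Clear DF again immediately, as callers expect it to
	 * be clear.
	 */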
	std
	rep movsq
	cld
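	/*
	 * %rdx need not be a multiple of 8: the first qword of the source,
	 * saved in %r11 before anything was written, is stored at the start
	 * of dest to cover the remaining head bytes.
	 */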
	movq %r11, (%r10)
	jmp 13f

	/*
	 * Backward copy: dest lies inside the source region, so the copy
	 * must run from the end toward the start.
	 */
	.p2align 4
2:
	cmp $0x20, %rdx
	jb 1f
	cmp $680, %rdx
	jb 6f
	cmp %dil, %sil
	je 7b
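	/*
	 * 6: unrolled backward copy, 32 bytes per iteration, for copies
	 * below 680 bytes and for large copies where src and dest are not
	 * equally aligned.
	 */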
6:
	/*
	 * Point %rsi and %rdi at the end of their regions; the loop below
	 * copies from the tail toward the head.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * We gobble 32 bytes backward in each loop.
	 */
8:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r11
	movq -2*8(%rsi), %r10
	movq -3*8(%rsi), %r9
	movq -4*8(%rsi), %r8
	leaq -4*8(%rsi), %rsi

	movq %r11, -1*8(%rdi)
	movq %r10, -2*8(%rdi)
	movq %r9, -3*8(%rdi)
	movq %r8, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae 8b
	/*
	 * Fewer than 32 bytes remain: restore the residual count and move
	 * %rsi/%rdi back to the head of the uncopied bytes.
	 */
	addq $0x20, %rdx
	subq %rdx, %rsi
	subq %rdx, %rdi
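	/*
	 * 1: common tail for both directions. %rsi/%rdi point at the first
	 * uncopied byte and %rdx (< 32) holds the remaining count. Each case
	 * below loads everything into registers before storing, so
	 * overlapping src/dest ranges are handled correctly.
	 */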
1:
	cmpq $16, %rdx
	jb 9f
	/*
	 * Move 16 to 31 bytes: copy the first and the last 16 bytes; the
	 * two ranges may overlap.
	 */
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r10
	movq -2*8(%rsi, %rdx), %r9
	movq -1*8(%rsi, %rdx), %r8
	movq %r11, 0*8(%rdi)
	movq %r10, 1*8(%rdi)
	movq %r9, -2*8(%rdi, %rdx)
	movq %r8, -1*8(%rdi, %rdx)
	jmp 13f
	.p2align 4
9:
	cmpq $8, %rdx
	jb 10f
	/*
	 * Move 8 to 15 bytes: copy the first and the last 8 bytes.
	 */
	movq 0*8(%rsi), %r11
	movq -1*8(%rsi, %rdx), %r10
	movq %r11, 0*8(%rdi)
	movq %r10, -1*8(%rdi, %rdx)
	jmp 13f
10:
	cmpq $4, %rdx
	jb 11f
	/*
	 * Move 4 to 7 bytes: copy the first and the last 4 bytes.
	 */
	movl (%rsi), %r11d
	movl -4(%rsi, %rdx), %r10d
	movl %r11d, (%rdi)
	movl %r10d, -4(%rdi, %rdx)
	jmp 13f
11:
	cmp $2, %rdx
	jb 12f
	/*
	 * Move 2 to 3 bytes: copy the first and the last 2 bytes.
	 */
	movw (%rsi), %r11w
	movw -2(%rsi, %rdx), %r10w
	movw %r11w, (%rdi)
	movw %r10w, -2(%rdi, %rdx)
	jmp 13f
12:
	cmp $1, %rdx
	jb 13f
	/*
	 * Move the single remaining byte.
	 */
	movb (%rsi), %r11b
	movb %r11b, (%rdi)
13:
	RET