libxr  1.0
Want to be the best embedded framework
Loading...
Searching...
No Matches
libxr_mem_o3.cpp
1#include "libxr_def.hpp"
2#include "libxr_mem.hpp"
3
4void LibXR::Memory::FastCopy(void* dst, const void* src, size_t size)
5{
6 uint8_t* d = static_cast<uint8_t*>(dst);
7 const uint8_t* s = static_cast<const uint8_t*>(src);
8
9 uintptr_t d_offset = reinterpret_cast<uintptr_t>(d) & (LibXR::ALIGN_SIZE - 1);
10 uintptr_t s_offset = reinterpret_cast<uintptr_t>(s) & (LibXR::ALIGN_SIZE - 1);
11
16 if (d_offset == s_offset)
17 {
19 if (d_offset)
20 {
21 size_t head = LibXR::ALIGN_SIZE - d_offset;
22 if (head > size)
23 {
24 head = size;
25 }
26 while (head--)
27 {
28 *d++ = *s++;
29 --size;
30 }
31 }
32
33 if constexpr (LibXR::ALIGN_SIZE == 8)
34 {
36 auto* dw = reinterpret_cast<uint64_t*>(d);
37 auto* sw = reinterpret_cast<const uint64_t*>(s);
38
39 while (size >= 64)
40 {
41 dw[0] = sw[0];
42 dw[1] = sw[1];
43 dw[2] = sw[2];
44 dw[3] = sw[3];
45 dw[4] = sw[4];
46 dw[5] = sw[5];
47 dw[6] = sw[6];
48 dw[7] = sw[7];
49 dw += 8;
50 sw += 8;
51 size -= 64;
52 }
53 while (size >= 8)
54 {
55 *dw++ = *sw++;
56 size -= 8;
57 }
58
59 d = reinterpret_cast<uint8_t*>(dw);
60 s = reinterpret_cast<const uint8_t*>(sw);
61 }
62 else
63 {
65 auto* dw = reinterpret_cast<uint32_t*>(d);
66 auto* sw = reinterpret_cast<const uint32_t*>(s);
67
68 while (size >= 32)
69 {
70 dw[0] = sw[0];
71 dw[1] = sw[1];
72 dw[2] = sw[2];
73 dw[3] = sw[3];
74 dw[4] = sw[4];
75 dw[5] = sw[5];
76 dw[6] = sw[6];
77 dw[7] = sw[7];
78 dw += 8;
79 sw += 8;
80 size -= 32;
81 }
82 while (size >= 4)
83 {
84 *dw++ = *sw++;
85 size -= 4;
86 }
87
88 d = reinterpret_cast<uint8_t*>(dw);
89 s = reinterpret_cast<const uint8_t*>(sw);
90 }
91 }
96 else
97 {
98 uintptr_t addr_diff = reinterpret_cast<uintptr_t>(s) - reinterpret_cast<uintptr_t>(d);
99
100 if constexpr (LibXR::ALIGN_SIZE == 8)
101 {
103 if ((addr_diff & 3) == 0 && size > 0)
104 {
105 while ((reinterpret_cast<uintptr_t>(d) & 3) && size)
106 {
107 *d++ = *s++;
108 --size;
109 }
110 auto* d32 = reinterpret_cast<uint32_t*>(d);
111 auto* s32 = reinterpret_cast<const uint32_t*>(s);
112
113 while (size >= 32)
114 {
115 d32[0] = s32[0];
116 d32[1] = s32[1];
117 d32[2] = s32[2];
118 d32[3] = s32[3];
119 d32[4] = s32[4];
120 d32[5] = s32[5];
121 d32[6] = s32[6];
122 d32[7] = s32[7];
123 d32 += 8;
124 s32 += 8;
125 size -= 32;
126 }
127 while (size >= 4)
128 {
129 *d32++ = *s32++;
130 size -= 4;
131 }
132
133 d = reinterpret_cast<uint8_t*>(d32);
134 s = reinterpret_cast<const uint8_t*>(s32);
135 }
137 else if ((addr_diff & 1) == 0 && size > 0)
138 {
139 if (reinterpret_cast<uintptr_t>(d) & 1)
140 {
141 *d++ = *s++;
142 --size;
143 }
144 auto* d16 = reinterpret_cast<uint16_t*>(d);
145 auto* s16 = reinterpret_cast<const uint16_t*>(s);
146
147 while (size >= 16)
148 {
149 d16[0] = s16[0];
150 d16[1] = s16[1];
151 d16[2] = s16[2];
152 d16[3] = s16[3];
153 d16[4] = s16[4];
154 d16[5] = s16[5];
155 d16[6] = s16[6];
156 d16[7] = s16[7];
157 d16 += 8;
158 s16 += 8;
159 size -= 16;
160 }
161 while (size >= 2)
162 {
163 *d16++ = *s16++;
164 size -= 2;
165 }
166
167 d = reinterpret_cast<uint8_t*>(d16);
168 s = reinterpret_cast<const uint8_t*>(s16);
169 }
170 }
171 else if ((addr_diff & 1) == 0 && size > 0)
172 {
173 if (reinterpret_cast<uintptr_t>(d) & 1)
174 {
175 *d++ = *s++;
176 --size;
177 }
178 auto* d16 = reinterpret_cast<uint16_t*>(d);
179 auto* s16 = reinterpret_cast<const uint16_t*>(s);
180
181 while (size >= 16)
182 {
183 d16[0] = s16[0];
184 d16[1] = s16[1];
185 d16[2] = s16[2];
186 d16[3] = s16[3];
187 d16[4] = s16[4];
188 d16[5] = s16[5];
189 d16[6] = s16[6];
190 d16[7] = s16[7];
191 d16 += 8;
192 s16 += 8;
193 size -= 16;
194 }
195 while (size >= 2)
196 {
197 *d16++ = *s16++;
198 size -= 2;
199 }
200
201 d = reinterpret_cast<uint8_t*>(d16);
202 s = reinterpret_cast<const uint8_t*>(s16);
203 }
204 // Otherwise, fallback to byte-wise copying below.
205 }
206
208 while (size--)
209 {
210 *d++ = *s++;
211 }
212}
213
214void LibXR::Memory::FastMove(void* dst, const void* src, size_t size)
215{
216 if (size == 0 || dst == src)
217 {
218 return;
219 }
220
221 auto* d = static_cast<uint8_t*>(dst);
222 const auto* s = static_cast<const uint8_t*>(src);
223
224 if (!(d < s + size && s < d + size))
225 {
226 FastCopy(dst, src, size);
227 return;
228 }
229
230 if (d > s)
231 {
232 // Backward-overlap move: consume from the tail first so we never overwrite
233 // bytes that still need to be read from the source window.
234 uintptr_t d_end_offset =
235 reinterpret_cast<uintptr_t>(d + size) & (LibXR::ALIGN_SIZE - 1);
236 uintptr_t s_end_offset =
237 reinterpret_cast<uintptr_t>(s + size) & (LibXR::ALIGN_SIZE - 1);
238
239 d += size;
240 s += size;
241
242 if (d_end_offset == s_end_offset)
243 {
244 // Once both ends share the same alignment, we can peel the unaligned tail
245 // bytes and then switch to wide backward copies safely.
246 if (d_end_offset)
247 {
248 size_t tail = d_end_offset;
249 if (tail > size)
250 {
251 tail = size;
252 }
253 while (tail--)
254 {
255 *--d = *--s;
256 --size;
257 }
258 }
259
260 if constexpr (LibXR::ALIGN_SIZE == 8)
261 {
262 auto* dw = reinterpret_cast<uint64_t*>(d);
263 auto* sw = reinterpret_cast<const uint64_t*>(s);
264
265 while (size >= 64)
266 {
267 uint64_t a0 = sw[-1];
268 uint64_t a1 = sw[-2];
269 uint64_t a2 = sw[-3];
270 uint64_t a3 = sw[-4];
271 uint64_t a4 = sw[-5];
272 uint64_t a5 = sw[-6];
273 uint64_t a6 = sw[-7];
274 uint64_t a7 = sw[-8];
275 dw[-1] = a0;
276 dw[-2] = a1;
277 dw[-3] = a2;
278 dw[-4] = a3;
279 dw[-5] = a4;
280 dw[-6] = a5;
281 dw[-7] = a6;
282 dw[-8] = a7;
283 dw -= 8;
284 sw -= 8;
285 size -= 64;
286 }
287 while (size >= 8)
288 {
289 uint64_t a = *--sw;
290 *--dw = a;
291 size -= 8;
292 }
293
294 d = reinterpret_cast<uint8_t*>(dw);
295 s = reinterpret_cast<const uint8_t*>(sw);
296 }
297 else
298 {
299 auto* dw = reinterpret_cast<uint32_t*>(d);
300 auto* sw = reinterpret_cast<const uint32_t*>(s);
301
302 while (size >= 32)
303 {
304 uint32_t a0 = sw[-1];
305 uint32_t a1 = sw[-2];
306 uint32_t a2 = sw[-3];
307 uint32_t a3 = sw[-4];
308 uint32_t a4 = sw[-5];
309 uint32_t a5 = sw[-6];
310 uint32_t a6 = sw[-7];
311 uint32_t a7 = sw[-8];
312 dw[-1] = a0;
313 dw[-2] = a1;
314 dw[-3] = a2;
315 dw[-4] = a3;
316 dw[-5] = a4;
317 dw[-6] = a5;
318 dw[-7] = a6;
319 dw[-8] = a7;
320 dw -= 8;
321 sw -= 8;
322 size -= 32;
323 }
324 while (size >= 4)
325 {
326 uint32_t a = *--sw;
327 *--dw = a;
328 size -= 4;
329 }
330
331 d = reinterpret_cast<uint8_t*>(dw);
332 s = reinterpret_cast<const uint8_t*>(sw);
333 }
334 }
335
336 while (size--)
337 {
338 *--d = *--s;
339 }
340 return;
341 }
342
343 // Forward-overlap move: valid only when destination starts before source, so
344 // consuming from the head cannot destroy unread source bytes.
345 uintptr_t d_offset = reinterpret_cast<uintptr_t>(d) & (LibXR::ALIGN_SIZE - 1);
346 uintptr_t s_offset = reinterpret_cast<uintptr_t>(s) & (LibXR::ALIGN_SIZE - 1);
347
348 if (d_offset == s_offset)
349 {
350 // Same-alignment forward overlap can reuse the same "align head, then burst"
351 // strategy as FastCopy because source bytes are always read before they are
352 // overwritten.
353 if (d_offset)
354 {
355 size_t head = LibXR::ALIGN_SIZE - d_offset;
356 if (head > size)
357 {
358 head = size;
359 }
360 while (head--)
361 {
362 *d++ = *s++;
363 --size;
364 }
365 }
366
367 if constexpr (LibXR::ALIGN_SIZE == 8)
368 {
369 auto* dw = reinterpret_cast<uint64_t*>(d);
370 auto* sw = reinterpret_cast<const uint64_t*>(s);
371
372 while (size >= 64)
373 {
374 uint64_t a0 = sw[0];
375 uint64_t a1 = sw[1];
376 uint64_t a2 = sw[2];
377 uint64_t a3 = sw[3];
378 uint64_t a4 = sw[4];
379 uint64_t a5 = sw[5];
380 uint64_t a6 = sw[6];
381 uint64_t a7 = sw[7];
382 dw[0] = a0;
383 dw[1] = a1;
384 dw[2] = a2;
385 dw[3] = a3;
386 dw[4] = a4;
387 dw[5] = a5;
388 dw[6] = a6;
389 dw[7] = a7;
390 dw += 8;
391 sw += 8;
392 size -= 64;
393 }
394 while (size >= 8)
395 {
396 uint64_t a = *sw++;
397 *dw++ = a;
398 size -= 8;
399 }
400
401 d = reinterpret_cast<uint8_t*>(dw);
402 s = reinterpret_cast<const uint8_t*>(sw);
403 }
404 else
405 {
406 auto* dw = reinterpret_cast<uint32_t*>(d);
407 auto* sw = reinterpret_cast<const uint32_t*>(s);
408
409 while (size >= 32)
410 {
411 uint32_t a0 = sw[0];
412 uint32_t a1 = sw[1];
413 uint32_t a2 = sw[2];
414 uint32_t a3 = sw[3];
415 uint32_t a4 = sw[4];
416 uint32_t a5 = sw[5];
417 uint32_t a6 = sw[6];
418 uint32_t a7 = sw[7];
419 dw[0] = a0;
420 dw[1] = a1;
421 dw[2] = a2;
422 dw[3] = a3;
423 dw[4] = a4;
424 dw[5] = a5;
425 dw[6] = a6;
426 dw[7] = a7;
427 dw += 8;
428 sw += 8;
429 size -= 32;
430 }
431 while (size >= 4)
432 {
433 uint32_t a = *sw++;
434 *dw++ = a;
435 size -= 4;
436 }
437
438 d = reinterpret_cast<uint8_t*>(dw);
439 s = reinterpret_cast<const uint8_t*>(sw);
440 }
441 }
442
443 while (size--)
444 {
445 *d++ = *s++;
446 }
447}
448
449void LibXR::Memory::FastSet(void* dst, uint8_t value, size_t size)
450{
451 if (size == 0)
452 {
453 return;
454 }
455
456 uint8_t* d = static_cast<uint8_t*>(dst);
457
458 uintptr_t d_offset = reinterpret_cast<uintptr_t>(d) & (LibXR::ALIGN_SIZE - 1);
459
460 // 先处理头部到对齐
461 if (d_offset)
462 {
463 size_t head = LibXR::ALIGN_SIZE - d_offset;
464 if (head > size)
465 {
466 head = size;
467 }
468 while (head--)
469 {
470 *d++ = value;
471 --size;
472 }
473 }
474
475 if constexpr (LibXR::ALIGN_SIZE == 8)
476 {
477 // 8-byte pattern
478 uint64_t pat = value;
479 pat |= pat << 8;
480 pat |= pat << 16;
481 pat |= pat << 32;
482
483 auto* dw = reinterpret_cast<uint64_t*>(d);
484
485 while (size >= 64)
486 {
487 dw[0] = pat;
488 dw[1] = pat;
489 dw[2] = pat;
490 dw[3] = pat;
491 dw[4] = pat;
492 dw[5] = pat;
493 dw[6] = pat;
494 dw[7] = pat;
495 dw += 8;
496 size -= 64;
497 }
498 while (size >= 8)
499 {
500 *dw++ = pat;
501 size -= 8;
502 }
503
504 d = reinterpret_cast<uint8_t*>(dw);
505 }
506 else
507 {
508 // 4-byte pattern
509 uint32_t pat = value;
510 pat |= pat << 8;
511 pat |= pat << 16;
512
513 auto* dw = reinterpret_cast<uint32_t*>(d);
514
515 while (size >= 32)
516 {
517 dw[0] = pat;
518 dw[1] = pat;
519 dw[2] = pat;
520 dw[3] = pat;
521 dw[4] = pat;
522 dw[5] = pat;
523 dw[6] = pat;
524 dw[7] = pat;
525 dw += 8;
526 size -= 32;
527 }
528 while (size >= 4)
529 {
530 *dw++ = pat;
531 size -= 4;
532 }
533
534 d = reinterpret_cast<uint8_t*>(dw);
535 }
536
537 // 尾巴
538 while (size--)
539 {
540 *d++ = value;
541 }
542}
static void FastSet(void *dst, uint8_t value, size_t size)
快速内存填充 / Fast memory fill
static void FastCopy(void *dst, const void *src, size_t size)
快速内存拷贝 / Fast memory copy
static void FastMove(void *dst, const void *src, size_t size)
内存搬移 / Memory move
constexpr size_t ALIGN_SIZE
平台自然对齐大小 / Native platform alignment size
Definition libxr_def.hpp:67