/**
 * Fast memory copy routine optimized for alignment and burst copying.
 *
 * If source and destination share the same alignment offset, the routine
 * copies the unaligned leading bytes and then performs fast aligned word
 * copying, bursting LIBXR_ALIGN_SIZE-byte words with an 8x unrolled loop.
 * If the alignments differ, it falls back to the widest word size allowed
 * by the source/destination address difference (4-byte or 2-byte words).
 * Any remaining tail bytes are copied one at a time.
 */
18{
19 uint8_t* d = static_cast<uint8_t*>(dst);
20 const uint8_t* s = static_cast<const uint8_t*>(src);
21
22 uintptr_t d_offset = reinterpret_cast<uintptr_t>(d) & (LIBXR_ALIGN_SIZE - 1);
23 uintptr_t s_offset = reinterpret_cast<uintptr_t>(s) & (LIBXR_ALIGN_SIZE - 1);
24
29 if (d_offset == s_offset)
30 {
32 if (d_offset)
33 {
34 size_t head = LIBXR_ALIGN_SIZE - d_offset;
35 if (head > size)
36 {
37 head = size;
38 }
39 while (head--)
40 {
41 *d++ = *s++;
42 --size;
43 }
44 }
45
46#if LIBXR_ALIGN_SIZE == 8
48 auto* dw = reinterpret_cast<uint64_t*>(d);
49 auto* sw = reinterpret_cast<const uint64_t*>(s);
50
51 while (size >= 64)
52 {
53 dw[0] = sw[0];
54 dw[1] = sw[1];
55 dw[2] = sw[2];
56 dw[3] = sw[3];
57 dw[4] = sw[4];
58 dw[5] = sw[5];
59 dw[6] = sw[6];
60 dw[7] = sw[7];
61 dw += 8;
62 sw += 8;
63 size -= 64;
64 }
65 while (size >= 8)
66 {
67 *dw++ = *sw++;
68 size -= 8;
69 }
70
71 d = reinterpret_cast<uint8_t*>(dw);
72 s = reinterpret_cast<const uint8_t*>(sw);
73#else
75 auto* dw = reinterpret_cast<uint32_t*>(d);
76 auto* sw = reinterpret_cast<const uint32_t*>(s);
77
78 while (size >= 32)
79 {
80 dw[0] = sw[0];
81 dw[1] = sw[1];
82 dw[2] = sw[2];
83 dw[3] = sw[3];
84 dw[4] = sw[4];
85 dw[5] = sw[5];
86 dw[6] = sw[6];
87 dw[7] = sw[7];
88 dw += 8;
89 sw += 8;
90 size -= 32;
91 }
92 while (size >= 4)
93 {
94 *dw++ = *sw++;
95 size -= 4;
96 }
97
98 d = reinterpret_cast<uint8_t*>(dw);
99 s = reinterpret_cast<const uint8_t*>(sw);
100#endif
101 }
106 else
107 {
108 uintptr_t addr_diff = reinterpret_cast<uintptr_t>(s) - reinterpret_cast<uintptr_t>(d);
109
110#if LIBXR_ALIGN_SIZE == 8
112 if ((addr_diff & 3) == 0)
113 {
114 while ((reinterpret_cast<uintptr_t>(d) & 3) && size)
115 {
116 *d++ = *s++;
117 --size;
118 }
119 auto* d32 = reinterpret_cast<uint32_t*>(d);
120 auto* s32 = reinterpret_cast<const uint32_t*>(s);
121
122 while (size >= 32)
123 {
124 d32[0] = s32[0];
125 d32[1] = s32[1];
126 d32[2] = s32[2];
127 d32[3] = s32[3];
128 d32[4] = s32[4];
129 d32[5] = s32[5];
130 d32[6] = s32[6];
131 d32[7] = s32[7];
132 d32 += 8;
133 s32 += 8;
134 size -= 32;
135 }
136 while (size >= 4)
137 {
138 *d32++ = *s32++;
139 size -= 4;
140 }
141
142 d = reinterpret_cast<uint8_t*>(d32);
143 s = reinterpret_cast<const uint8_t*>(s32);
144 }
146 else
147#endif
148 if ((addr_diff & 1) == 0)
149 {
150 if (reinterpret_cast<uintptr_t>(d) & 1)
151 {
152 *d++ = *s++;
153 --size;
154 }
155 auto* d16 = reinterpret_cast<uint16_t*>(d);
156 auto* s16 = reinterpret_cast<const uint16_t*>(s);
157
158 while (size >= 16)
159 {
160 d16[0] = s16[0];
161 d16[1] = s16[1];
162 d16[2] = s16[2];
163 d16[3] = s16[3];
164 d16[4] = s16[4];
165 d16[5] = s16[5];
166 d16[6] = s16[6];
167 d16[7] = s16[7];
168 d16 += 8;
169 s16 += 8;
170 size -= 16;
171 }
172 while (size >= 2)
173 {
174 *d16++ = *s16++;
175 size -= 2;
176 }
177
178 d = reinterpret_cast<uint8_t*>(d16);
179 s = reinterpret_cast<const uint8_t*>(s16);
180 }
181
182 }
183
185 while (size--)
186 {
187 *d++ = *s++;
188 }
189}