xref: /haiku/src/system/libroot/posix/string/arch/x86_64/arch_string.cpp (revision 02354704729d38c3b078c696adc1bbbd33cbcf72)
1 /*
2  * Copyright 2014, Paweł Dziepak, pdziepak@quarnos.org.
3  * Distributed under the terms of the MIT License.
4  */
5 
6 
7 #include <array>
8 
9 #include <cstddef>
10 #include <cstdint>
11 
12 #include <emmintrin.h>
13 
14 
15 namespace {
16 
17 
// __m128i resolves to a type with an attribute, which can't get into the
// template signature, resulting in a warning. Nonetheless the code is what we
// expect, so we silence the warning.
21 #pragma GCC diagnostic push
22 #if defined __GNUC__ && __GNUC__ >= 6
23 #pragma GCC diagnostic ignored "-Wignored-attributes"
24 #endif
25 
26 
// Builds, at compile time, a std::array whose element i is
// Generator<i>::sValue for i in [0, N).
//
// The primary template peels one index per inheritance step: it derives from
// the instantiation for N - 1 and prepends N - 1 to the Index pack, so the
// pack ends up as 0, 1, ..., N - 1 once N reaches zero.
template<template<size_t N> class Generator, unsigned N, unsigned ...Index>
struct GenerateTable : GenerateTable<Generator, N - 1, N - 1, Index...> {
};

// End of the recursion: Index now holds every index. This specialization is
// the actual array, with the element type taken from Generator<0>::sValue.
template<template<size_t N> class Generator, unsigned ...Index>
struct GenerateTable<Generator, 0, Index...>
	: std::array<decltype(Generator<0>::sValue), sizeof...(Index)> {
	using Entry = decltype(Generator<0>::sValue);
	using Table = std::array<Entry, sizeof...(Index)>;

	// Fills the array by expanding Generator<Index>::sValue over the pack.
	constexpr GenerateTable()
		: Table{{ Generator<Index>::sValue... }}
	{
	}
};
42 
43 
44 #pragma GCC diagnostic pop
45 
46 
// Copies `length` bytes from `source` to `destination` using a single
// "rep movsb" instruction.
//
// All three operands are read-write ("+") because the instruction advances
// both pointers and counts the length register down while it runs. The
// "memory" clobber tells the compiler that memory is read and written behind
// its back.
static inline void memcpy_repmovs(uint8_t* destination, const uint8_t* source,
	size_t length)
{
	__asm__ __volatile__(
		"rep movsb"
		: [to] "+D" (destination),
		  [from] "+S" (source),
		  [count] "+c" (length)
		: /* no input-only operands */
		: "memory");
}
55 
56 
// Copies exactly N bytes from `source` to `destination`.
//
// The copy is expressed as the assignment of an N-byte aggregate, so the size
// is a compile-time constant and no loop is written out here.
template<size_t N>
inline void copy_small(uint8_t* destination, const uint8_t* source)
{
	struct Chunk {
		uint8_t bytes[N];
	};

	const Chunk& input = *reinterpret_cast<const Chunk*>(source);
	Chunk& output = *reinterpret_cast<Chunk*>(destination);
	output = input;
}
66 
67 
68 template<size_t N>
69 struct SmallGenerator {
70 	constexpr static void (*sValue)(uint8_t*, const uint8_t*) = copy_small<N>;
71 };
72 constexpr static GenerateTable<SmallGenerator, 8> table_small;
73 
74 
75 static inline void memcpy_small(uint8_t* destination, const uint8_t* source,
76 	size_t length)
77 {
78 	if (length < 8) {
79 		table_small[length](destination, source);
80 	} else {
81 		auto to = reinterpret_cast<uint64_t*>(destination);
82 		auto from = reinterpret_cast<const uint64_t*>(source);
83 		*to = *from;
84 		to = reinterpret_cast<uint64_t*>(destination + length - 8);
85 		from = reinterpret_cast<const uint64_t*>(source + length - 8);
86 		*to = *from;
87 	}
88 }
89 
90 
// Copies N consecutive 16-byte vectors from `source` to `destination` using
// unaligned loads and stores. The recursion is resolved at compile time:
// the first N - 1 vectors are handled by copy_sse<N - 1>, then the last one
// is copied here, so vectors are still processed in ascending order.
template<size_t N>
inline void copy_sse(__m128i* destination, const __m128i* source)
{
	copy_sse<N - 1>(destination, source);

	const __m128i vector = _mm_loadu_si128(source + (N - 1));
	_mm_storeu_si128(destination + (N - 1), vector);
}


// Recursion terminator: copying zero vectors does nothing.
template<>
inline void copy_sse<0>(__m128i* destination, const __m128i* source)
{
}
104 
105 
106 template<size_t N>
107 struct SSEGenerator {
108 	constexpr static void (*sValue)(__m128i*, const __m128i*) = copy_sse<N>;
109 };
110 constexpr static GenerateTable<SSEGenerator, 4> table_sse;
111 
112 
113 static inline void memcpy_sse(uint8_t* destination, const uint8_t* source, size_t length)
114 {
115 	auto to = reinterpret_cast<__m128i*>(destination);
116 	auto from = reinterpret_cast<const __m128i*>(source);
117 	auto toEnd = reinterpret_cast<__m128i*>(destination + length - 16);
118 	auto fromEnd = reinterpret_cast<const __m128i*>(source + length - 16);
119 	while (length >= 64) {
120 		copy_sse<4>(to, from);
121 		to += 4;
122 		from += 4;
123 		length -= 64;
124 	}
125 	if (length >= 16) {
126 		table_sse[length / 16](to, from);
127 		length %= 16;
128 	}
129 	if (length) {
130 		copy_sse<1>(toEnd, fromEnd);
131 	}
132 }
133 
134 
135 }
136 
137 
138 extern "C" void* memcpy(void* destination, const void* source, size_t length)
139 {
140 	auto to = static_cast<uint8_t*>(destination);
141 	auto from = static_cast<const uint8_t*>(source);
142 	if (length <= 16) {
143 		memcpy_small(to, from, length);
144 		return destination;
145 	}
146 	if (length < 2048) {
147 		memcpy_sse(to, from, length);
148 		return destination;
149 	}
150 	memcpy_repmovs(to, from, length);
151 	return destination;
152 }
153 
154 
// Fills `length` bytes starting at `destination` with `value` using a single
// "rep stosb" instruction.
//
// The pointer and the count are read-write ("+") operands because the
// instruction modifies both while it runs; the fill byte is a plain input.
// The "memory" clobber tells the compiler that memory is written behind its
// back.
static inline void
memset_repstos(uint8_t* destination, uint8_t value, size_t length)
{
	__asm__ __volatile__(
		"rep stosb"
		: [to] "+D" (destination),
		  [count] "+c" (length)
		: [fill] "a" (value)
		: "memory");
}
163 
164 
// Fills `length` bytes starting at `destination` with `value` using 16-byte
// vector stores.
//
// If the start is not 16-byte aligned, one unaligned store covers the head and
// the aligned stores begin at the next 16-byte boundary. A final unaligned
// store that ends exactly at destination + length covers whatever the aligned
// stores left over. Because of that final store, length is expected to be at
// least 16 (the memset() entry point in this file only calls this function
// with 32 <= length < 2048).
static inline void
memset_sse(uint8_t* destination, uint8_t value, size_t length)
{
	const __m128i pattern = _mm_set1_epi8(value);

	// Must be computed from the original length, before it is reduced below.
	__m128i* const tail
		= reinterpret_cast<__m128i*>(destination + length - 16);

	// Number of bytes in front of the first 16-byte boundary (0 if aligned).
	size_t headBytes = 0;
	const size_t misalignment = reinterpret_cast<uintptr_t>(destination) % 16;
	if (misalignment != 0) {
		headBytes = 16 - misalignment;
		_mm_storeu_si128(reinterpret_cast<__m128i*>(destination), pattern);
		length -= headBytes;
	}

	// From here on the cursor is 16-byte aligned, so aligned stores are used.
	__m128i* cursor = reinterpret_cast<__m128i*>(destination + headBytes);
	for (; length >= 64; length -= 64, cursor += 4) {
		_mm_store_si128(cursor, pattern);
		_mm_store_si128(cursor + 1, pattern);
		_mm_store_si128(cursor + 2, pattern);
		_mm_store_si128(cursor + 3, pattern);
	}
	for (; length >= 16; length -= 16, ++cursor)
		_mm_store_si128(cursor, pattern);

	_mm_storeu_si128(tail, pattern);
}
190 
191 
// Fills `length` bytes starting at `destination` with `value`; meant for short
// lengths (the memset() entry point in this file calls it with length < 32).
//
// For eight bytes and more, the fill byte is replicated into a 64-bit word
// that is stored repeatedly; a final store ending exactly at
// destination + length covers the remainder and may overlap bytes that were
// already written. Shorter fills are done byte by byte.
static inline void
memset_small(uint8_t* destination, uint8_t value, size_t length)
{
	// The buffer is an arbitrary byte range: it need not be 8-byte aligned
	// and does not hold uint64_t objects. Going through a may_alias type with
	// an alignment of 1 keeps the 8-byte stores well-defined for the compiler
	// instead of relying on a plain uint64_t* cast.
	typedef uint64_t __attribute__((__may_alias__, __aligned__(1)))
		unaligned_uint64;

	if (length >= 8) {
		// Multiplying by 0x0101...01 copies the byte into all eight lanes.
		const uint64_t packed
			= static_cast<uint64_t>(value) * 0x0101010101010101ull;

		auto ptr = reinterpret_cast<unaligned_uint64*>(destination);
		// Computed from the original length, before it is consumed below.
		auto end = reinterpret_cast<unaligned_uint64*>(
			destination + length - 8);

		while (length >= 8) {
			*ptr++ = packed;
			length -= 8;
		}
		*end = packed;
	} else {
		while (length--)
			*destination++ = value;
	}
}
210 
211 
212 extern "C" void*
213 memset(void* ptr, int chr, size_t length)
214 {
215 	auto value = static_cast<unsigned char>(chr);
216 	auto destination = static_cast<uint8_t*>(ptr);
217 	if (length < 32) {
218 		memset_small(destination, value, length);
219 		return ptr;
220 	}
221 	if (length < 2048) {
222 		memset_sse(destination, value, length);
223 		return ptr;
224 	}
225 	memset_repstos(destination, value, length);
226 	return ptr;
227 }
228 
229