xref: /haiku/src/system/libroot/posix/string/arch/x86_64/arch_string.cpp (revision 644fa5a93845dc4a1bc155f1fd0f94ebdf0b47bc)
1 /*
2  * Copyright 2014, Paweł Dziepak, pdziepak@quarnos.org.
3  * Distributed under the terms of the MIT License.
4  */
5 
6 
7 #include <array>
8 
9 #include <cstddef>
10 #include <cstdint>
11 
12 #include <x86intrin.h>
13 
14 
15 namespace {
16 
17 
/*!	Compile-time table builder.

	GenerateTable<Generator, N> is a std::array whose elements are
	Generator<0>::sValue, Generator<1>::sValue, ..., Generator<N - 1>::sValue,
	in that order.

	The primary template only recurses: each level derives from the table for
	N - 1 and pushes the index N - 1 onto the front of the index pack, so the
	pack reads 0, 1, ..., N - 1 once N reaches zero.
*/
template<template<size_t> class Generator, unsigned N, unsigned ...Indices>
struct GenerateTable
	: GenerateTable<Generator, N - 1, N - 1, Indices...> {
};


/*!	End of the recursion: the index pack is complete, so this level is the
	actual std::array and fills it from the generator.
	The element type is taken from Generator<0>::sValue.
*/
template<template<size_t> class Generator, unsigned ...Indices>
struct GenerateTable<Generator, 0, Indices...>
	: std::array<decltype(Generator<0>::sValue), sizeof...(Indices)> {
private:
	using Entry = decltype(Generator<0>::sValue);
	using Storage = std::array<Entry, sizeof...(Indices)>;

public:
	constexpr GenerateTable()
		:
		// one entry per index, expanded in pack order
		Storage{{ Generator<Indices>::sValue... }}
	{
	}
};
33 
34 
/*!	Copies \a length bytes from \a source to \a destination using a single
	`rep movsb` instruction.

	The three arguments are bound to the DI, SI and CX registers ("D", "S"
	and "c" constraints). They are declared read-write because the
	instruction advances both pointers and counts CX down as it runs.
*/
static inline void
memcpy_repmovs(uint8_t* destination, const uint8_t* source, size_t length)
{
	__asm__ __volatile__(
		"rep movsb"
		: [to] "+D" (destination),
		  [from] "+S" (source),
		  [count] "+c" (length)
		: /* no pure inputs */
		: "memory");
			// the asm reads and writes memory the compiler cannot see
}
43 
44 
/*!	Copies exactly \a N bytes from \a source to \a destination.

	The copy is written as one assignment of an aggregate that wraps an
	N-byte array, so the size is a compile-time constant.
*/
template<size_t N>
inline void
copy_small(uint8_t* destination, const uint8_t* source)
{
	struct Chunk {
		uint8_t bytes[N];
	};

	const Chunk* in = reinterpret_cast<const Chunk*>(source);
	Chunk* out = reinterpret_cast<Chunk*>(destination);
	*out = *in;
}
54 
55 
56 template<size_t N>
57 struct SmallGenerator {
58 	constexpr static void (*sValue)(uint8_t*, const uint8_t*) = copy_small<N>;
59 };
60 constexpr static GenerateTable<SmallGenerator, 8> table_small;
61 
62 
63 static inline void memcpy_small(uint8_t* destination, const uint8_t* source,
64 	size_t length)
65 {
66 	if (length < 8) {
67 		table_small[length](destination, source);
68 	} else {
69 		auto to = reinterpret_cast<uint64_t*>(destination);
70 		auto from = reinterpret_cast<const uint64_t*>(source);
71 		*to = *from;
72 		to = reinterpret_cast<uint64_t*>(destination + length - 8);
73 		from = reinterpret_cast<const uint64_t*>(source + length - 8);
74 		*to = *from;
75 	}
76 }
77 
78 
/*!	Copies \a N consecutive 16-byte blocks from \a source to \a destination
	using unaligned SSE loads and stores, lowest address first.

	Implemented by compile-time recursion: one block is copied here and the
	remaining N - 1 blocks are handled by copy_sse<N - 1>.
*/
template<size_t N>
inline void
copy_sse(__m128i* destination, const __m128i* source)
{
	_mm_storeu_si128(destination, _mm_loadu_si128(source));
	copy_sse<N - 1>(destination + 1, source + 1);
}


/*!	Recursion anchor: copying zero blocks does nothing. */
template<>
inline void
copy_sse<0>(__m128i*, const __m128i*)
{
}
92 
93 
94 template<size_t N>
95 struct SSEGenerator {
96 	constexpr static void (*sValue)(__m128i*, const __m128i*) = copy_sse<N>;
97 };
98 constexpr static GenerateTable<SSEGenerator, 4> table_sse;
99 
100 
101 static inline void memcpy_sse(uint8_t* destination, const uint8_t* source, size_t length)
102 {
103 	auto to = reinterpret_cast<__m128i*>(destination);
104 	auto from = reinterpret_cast<const __m128i*>(source);
105 	auto toEnd = reinterpret_cast<__m128i*>(destination + length - 16);
106 	auto fromEnd = reinterpret_cast<const __m128i*>(source + length - 16);
107 	while (length >= 64) {
108 		copy_sse<4>(to, from);
109 		to += 4;
110 		from += 4;
111 		length -= 64;
112 	}
113 	if (length >= 16) {
114 		table_sse[length / 16](to, from);
115 		length %= 16;
116 	}
117 	if (length) {
118 		copy_sse<1>(toEnd, fromEnd);
119 	}
120 }
121 
122 
123 }
124 
125 
126 extern "C" void* memcpy(void* destination, const void* source, size_t length)
127 {
128 	auto to = static_cast<uint8_t*>(destination);
129 	auto from = static_cast<const uint8_t*>(source);
130 	if (length <= 16) {
131 		memcpy_small(to, from, length);
132 		return destination;
133 	}
134 	if (length < 2048) {
135 		memcpy_sse(to, from, length);
136 		return destination;
137 	}
138 	memcpy_repmovs(to, from, length);
139 	return destination;
140 }
141 
142 
/*!	Fills \a length bytes at \a destination with \a value using a single
	`rep stosb` instruction.

	\a destination and \a length are bound to DI and CX as read-write
	operands, since the instruction advances the pointer and counts CX
	down; \a value is passed in the A register as a plain input.
*/
static inline void
memset_repstos(uint8_t* destination, uint8_t value, size_t length)
{
	__asm__ __volatile__(
		"rep stosb"
		: [to] "+D" (destination),
		  [count] "+c" (length)
		: [fill] "a" (value)
		: "memory");
			// the asm writes memory the compiler cannot see
}
151 
152 
/*!	Fills \a length bytes at \a destination with \a value using 16-byte SSE
	stores.

	An unaligned store covers the start of the buffer when \a destination is
	not 16-byte aligned; the main loops then use aligned stores from the
	next 16-byte boundary. A final unaligned store over the last 16 bytes
	of the range covers whatever the loops left over. Because of that last
	store \a length must be at least 16.
*/
static inline void
memset_sse(uint8_t* destination, uint8_t value, size_t length)
{
	const __m128i pattern = _mm_set1_epi8(value);

	// Position of the last 16 bytes, taken before length gets consumed.
	__m128i* const tail
		= reinterpret_cast<__m128i*>(destination + length - 16);

	// Number of bytes in front of the first 16-byte boundary (0 if aligned).
	uintptr_t head = reinterpret_cast<uintptr_t>(destination) % 16;
	if (head != 0) {
		_mm_storeu_si128(reinterpret_cast<__m128i*>(destination), pattern);
		head = 16 - head;
		length -= head;
	}

	__m128i* cursor = reinterpret_cast<__m128i*>(destination + head);

	for (; length >= 64; length -= 64) {
		_mm_store_si128(cursor, pattern);
		_mm_store_si128(cursor + 1, pattern);
		_mm_store_si128(cursor + 2, pattern);
		_mm_store_si128(cursor + 3, pattern);
		cursor += 4;
	}

	for (; length >= 16; length -= 16)
		_mm_store_si128(cursor++, pattern);

	_mm_storeu_si128(tail, pattern);
}
178 
179 
/*!	Fills a short buffer of \a length bytes at \a destination with \a value.

	Fewer than 8 bytes are written one at a time. Otherwise the value is
	replicated into all 8 bytes of a 64-bit word which is stored repeatedly
	from the start, followed by one store over the last 8 bytes of the
	range to cover a remainder that is not a multiple of 8.
*/
static inline void
memset_small(uint8_t* destination, uint8_t value, size_t length)
{
	if (length < 8) {
		for (; length != 0; length--)
			*destination++ = value;
		return;
	}

	// multiplying by 0x0101...01 copies the byte into every byte lane
	const auto pattern = value * 0x101010101010101ul;

	// Position of the last 8 bytes, taken before length gets consumed.
	uint64_t* const tail
		= reinterpret_cast<uint64_t*>(destination + length - 8);

	uint64_t* cursor = reinterpret_cast<uint64_t*>(destination);
	for (; length >= 8; length -= 8)
		*cursor++ = pattern;

	*tail = pattern;
}
198 
199 
200 extern "C" void*
201 memset(void* ptr, int chr, size_t length)
202 {
203 	auto value = static_cast<unsigned char>(chr);
204 	auto destination = static_cast<uint8_t*>(ptr);
205 	if (length < 32) {
206 		memset_small(destination, value, length);
207 		return ptr;
208 	}
209 	if (length < 2048) {
210 		memset_sse(destination, value, length);
211 		return ptr;
212 	}
213 	memset_repstos(destination, value, length);
214 	return ptr;
215 }
216 
217