/*
 * Copyright 2014, Paweł Dziepak, pdziepak@quarnos.org.
 * Distributed under the terms of the MIT License.
 */


#include <array>

#include <cstddef>
#include <cstdint>

#include <x86intrin.h>


namespace {


// __m128i resolves to a type with an attribute, which can't get into the
// template signature, resulting in a warning. Nonetheless the code is what we
// expect, so we silence the warning.
#pragma GCC diagnostic push
#if defined __GNUC__ && __GNUC__ >= 6
#pragma GCC diagnostic ignored "-Wignored-attributes"
#endif


// Compile-time table builder. Each level of the recursion prepends N - 1 to
// the Index pack and decrements N, so that once N reaches 0 the pack holds
// 0, 1, ..., N - 1 in ascending order.
template<template<size_t N> class Generator, unsigned N, unsigned ...Index>
struct GenerateTable : GenerateTable<Generator, N - 1, N - 1, Index...> {
};

// Recursion end: a std::array with one element per collected index, where
// element i is initialized with Generator<i>::sValue.
template<template<size_t N> class Generator, unsigned ...Index>
struct GenerateTable<Generator, 0, Index...>
	: std::array<decltype(Generator<0>::sValue), sizeof...(Index)> {
	constexpr GenerateTable()
		:
		std::array<decltype(Generator<0>::sValue), sizeof...(Index)> {
			{ Generator<Index>::sValue... }
		}
	{
	}
};


#pragma GCC diagnostic pop


// Copies length bytes using a single "rep movsb". The "D", "S" and "c"
// constraints pin destination, source and length to the registers the
// instruction implicitly uses; all three are modified by it, hence "+".
static inline void memcpy_repmovs(uint8_t* destination, const uint8_t* source,
	size_t length)
{
	__asm__ __volatile__("rep movsb"
		: "+D" (destination), "+S" (source), "+c" (length)
		:
		: "memory");
}


// Copies exactly N bytes by assigning an N byte aggregate, leaving it to the
// compiler to pick the instructions for that fixed size.
template<size_t N>
inline void copy_small(uint8_t* destination, const uint8_t* source)
{
	struct data {
		uint8_t x[N];
	};
	*reinterpret_cast<data*>(destination)
		= *reinterpret_cast<const data*>(source);
}


// Nothing to copy. Specialized explicitly so that the generic version is
// never instantiated with a zero-length array member, which ISO C++ forbids
// (it is only accepted as a compiler extension).
template<>
inline void copy_small<0>(uint8_t*, const uint8_t*)
{
}


// Generator for table_small: entry N is the function copying exactly N bytes.
template<size_t N>
struct SmallGenerator {
	constexpr static void (*sValue)(uint8_t*, const uint8_t*) = copy_small<N>;
};
// table_small[n], 0 <= n < 8, copies exactly n bytes.
constexpr static GenerateTable<SmallGenerator, 8> table_small;


// Copies length bytes. The caller has to guarantee length <= 16 (memcpy()
// below only calls this for such lengths).
// NOTE(review): the 8 byte accesses go through uint64_t pointers that are not
// necessarily aligned; presumably relied upon to be fine on the target CPU.
static inline void memcpy_small(uint8_t* destination, const uint8_t* source,
	size_t length)
{
	if (length < 8) {
		table_small[length](destination, source);
	} else {
		// 8 <= length <= 16: copy the first and the last 8 bytes. The two
		// accesses may overlap, together they always cover the whole range.
		auto to = reinterpret_cast<uint64_t*>(destination);
		auto from = reinterpret_cast<const uint64_t*>(source);
		*to = *from;
		to = reinterpret_cast<uint64_t*>(destination + length - 8);
		from = reinterpret_cast<const uint64_t*>(source + length - 8);
		*to = *from;
	}
}


// Copies N consecutive 16 byte blocks using unaligned SSE loads and stores.
// The recursion is unrolled at compile time.
template<size_t N>
inline void copy_sse(__m128i* destination, const __m128i* source)
{
	auto temp = _mm_loadu_si128(source);
	_mm_storeu_si128(destination, temp);
	copy_sse<N - 1>(destination + 1, source + 1);
}


// Recursion end: no blocks left. The parameters are intentionally unnamed as
// they are not used.
template<>
inline void copy_sse<0>(__m128i*, const __m128i*)
{
}


// Generator for table_sse: entry N is the function copying N 16 byte blocks.
template<size_t N>
struct SSEGenerator {
	constexpr static void (*sValue)(__m128i*, const __m128i*) = copy_sse<N>;
};
// table_sse[n], 0 <= n < 4, copies n blocks of 16 bytes.
constexpr static GenerateTable<SSEGenerator, 4> table_sse;


// Copies length bytes in 16 byte blocks. The caller has to guarantee
// length >= 16, since the pointers to the last 16 bytes are computed up
// front (memcpy() below calls this for 16 < length < 2048).
static inline void memcpy_sse(uint8_t* destination, const uint8_t* source, size_t length)
{
	auto to = reinterpret_cast<__m128i*>(destination);
	auto from = reinterpret_cast<const __m128i*>(source);
	auto toEnd = reinterpret_cast<__m128i*>(destination + length - 16);
	auto fromEnd = reinterpret_cast<const __m128i*>(source + length - 16);
	// Main loop: 64 bytes per iteration.
	while (length >= 64) {
		copy_sse<4>(to, from);
		to += 4;
		from += 4;
		length -= 64;
	}
	// One to three remaining whole blocks.
	if (length >= 16) {
		table_sse[length / 16](to, from);
		length %= 16;
	}
	// Fewer than 16 bytes are left: copy the final 16 bytes of the buffer,
	// possibly overlapping data that has already been copied.
	if (length) {
		copy_sse<1>(toEnd, fromEnd);
	}
}


}


// Copies length bytes from source to destination and returns destination.
// Depending on length one of three strategies is used:
//   length <= 16         - memcpy_small()
//   16 < length < 2048   - memcpy_sse()
//   length >= 2048       - memcpy_repmovs()
extern "C" void* memcpy(void* destination, const void* source, size_t length)
{
	auto to = static_cast<uint8_t*>(destination);
	auto from = static_cast<const uint8_t*>(source);
	if (length <= 16) {
		memcpy_small(to, from, length);
		return destination;
	}
	if (length < 2048) {
		memcpy_sse(to, from, length);
		return destination;
	}
	memcpy_repmovs(to, from, length);
	return destination;
}


// Fills length bytes with value using a single "rep stosb". The "D", "c" and
// "a" constraints pin destination, length and value to the registers the
// instruction implicitly uses.
static inline void
memset_repstos(uint8_t* destination, uint8_t value, size_t length)
{
	__asm__ __volatile__("rep stosb"
		: "+D" (destination), "+c" (length)
		: "a" (value)
		: "memory");
}


// Fills length bytes with value using 16 byte SSE stores. The caller has to
// guarantee length >= 16, as both the first and the last 16 bytes of the
// buffer may be written unconditionally (memset() below calls this for
// 32 <= length < 2048).
static inline void
memset_sse(uint8_t* destination, uint8_t value, size_t length)
{
	__m128i packed = _mm_set1_epi8(value);
	auto end = reinterpret_cast<__m128i*>(destination + length - 16);
	auto diff = reinterpret_cast<uintptr_t>(destination) % 16;
	if (diff) {
		// Unaligned head: one unaligned store covers everything up to the
		// next 16 byte boundary; the aligned stores continue from there.
		diff = 16 - diff;
		length -= diff;
		_mm_storeu_si128(reinterpret_cast<__m128i*>(destination), packed);
	}
	auto ptr = reinterpret_cast<__m128i*>(destination + diff);
	// Main loop: 64 bytes per iteration, aligned stores.
	while (length >= 64) {
		_mm_store_si128(ptr++, packed);
		_mm_store_si128(ptr++, packed);
		_mm_store_si128(ptr++, packed);
		_mm_store_si128(ptr++, packed);
		length -= 64;
	}
	while (length >= 16) {
		_mm_store_si128(ptr++, packed);
		length -= 16;
	}
	// Whatever is left (fewer than 16 bytes) is covered by an unaligned
	// store to the final 16 bytes of the buffer.
	_mm_storeu_si128(end, packed);
}


// Fills length bytes with value using 8 byte stores if possible, or byte by
// byte if length < 8. memset() below calls this for length < 32.
static inline void
memset_small(uint8_t* destination, uint8_t value, size_t length)
{
	if (length >= 8) {
		// Replicate the byte into every byte of a 64 bit word.
		auto packed = value * 0x101010101010101ul;
		auto ptr = reinterpret_cast<uint64_t*>(destination);
		auto end = reinterpret_cast<uint64_t*>(destination + length - 8);
		while (length >= 8) {
			*ptr++ = packed;
			length -= 8;
		}
		// Covers the remaining (fewer than 8) bytes; may overlap the bytes
		// already written.
		*end = packed;
	} else {
		while (length--) {
			*destination++ = value;
		}
	}
}


// Fills the first length bytes at ptr with chr converted to unsigned char and
// returns ptr. Depending on length one of three strategies is used:
//   length < 32           - memset_small()
//   32 <= length < 2048   - memset_sse()
//   length >= 2048        - memset_repstos()
extern "C" void*
memset(void* ptr, int chr, size_t length)
{
	auto value = static_cast<unsigned char>(chr);
	auto destination = static_cast<uint8_t*>(ptr);
	if (length < 32) {
		memset_small(destination, value, length);
		return ptr;
	}
	if (length < 2048) {
		memset_sse(destination, value, length);
		return ptr;
	}
	memset_repstos(destination, value, length);
	return ptr;
}