/*
 * Copyright 2014, Paweł Dziepak, pdziepak@quarnos.org.
 * Distributed under the terms of the MIT License.
 */


#include <array>

#include <cstddef>
#include <cstdint>

#include <x86intrin.h>


namespace {


// 64-bit word used for copying/filling raw bytes. It is allowed to alias
// objects of any type and carries no alignment requirement, so it can be
// dereferenced at arbitrary byte addresses without violating the strict
// aliasing or alignment rules.
typedef uint64_t __attribute__((__may_alias__, __aligned__(1)))
	unaligned_uint64;


// Builds, at compile time, a std::array holding
// Generator<0>::sValue ... Generator<N - 1>::sValue.
// The primary template prepends N - 1 to the index pack and recurses until
// N reaches 0; the specialization below then expands the pack.
template<template<size_t> class Generator, size_t N, size_t ...Index>
struct GenerateTable : GenerateTable<Generator, N - 1, N - 1, Index...> {
};

// Recursion end: Index... now is 0, 1, ..., (original N) - 1.
template<template<size_t> class Generator, size_t ...Index>
struct GenerateTable<Generator, 0, Index...>
	: std::array<decltype(Generator<0>::sValue), sizeof...(Index)> {
	constexpr GenerateTable()
		:
		std::array<decltype(Generator<0>::sValue), sizeof...(Index)> {
			{ Generator<Index>::sValue... }
		}
	{
	}
};


// Copies length bytes from source to destination with a single "rep movsb".
// NOTE(review): the instruction copies forward only if the direction flag is
// clear; this code does not clear it itself and presumably relies on the ABI.
static inline void memcpy_repmovs(uint8_t* destination, const uint8_t* source,
	size_t length)
{
	__asm__ __volatile__("rep movsb"
		: "+D" (destination), "+S" (source), "+c" (length)
		:
		: "memory");
}


// Copies exactly N bytes by assigning an N byte structure, leaving it to the
// compiler to pick the moves for that fixed size.
template<size_t N>
inline void copy_small(uint8_t* destination, const uint8_t* source)
{
	// may_alias: the bytes being copied can belong to objects of any type.
	struct __attribute__((__may_alias__)) data {
		uint8_t x[N];
	};
	*reinterpret_cast<data*>(destination)
		= *reinterpret_cast<const data*>(source);
}


// Copying zero bytes is a no-op. Specialized explicitly so that no
// zero-length array has to be instantiated.
template<>
inline void copy_small<0>(uint8_t*, const uint8_t*)
{
}


// Generator for table_small: entry N is the function copying exactly N bytes.
template<size_t N>
struct SmallGenerator {
	constexpr static void (*sValue)(uint8_t*, const uint8_t*) = copy_small<N>;
};
// table_small[n], 0 <= n < 8, copies exactly n bytes.
constexpr static GenerateTable<SmallGenerator, 8> table_small;


// Copies length bytes for 0 <= length <= 16.
// Lengths below 8 are dispatched through table_small. Lengths 8..16 are
// handled with two 8 byte copies, one covering the first and one covering the
// last 8 bytes of the range; they overlap whenever length < 16.
static inline void memcpy_small(uint8_t* destination, const uint8_t* source,
	size_t length)
{
	if (length < 8) {
		table_small[length](destination, source);
		return;
	}

	auto to = reinterpret_cast<unaligned_uint64*>(destination);
	auto from = reinterpret_cast<const unaligned_uint64*>(source);
	*to = *from;

	to = reinterpret_cast<unaligned_uint64*>(destination + length - 8);
	from = reinterpret_cast<const unaligned_uint64*>(source + length - 8);
	*to = *from;
}


// Copies N consecutive 16 byte blocks using unaligned SSE loads and stores.
template<size_t N>
inline void copy_sse(__m128i* destination, const __m128i* source)
{
	auto temp = _mm_loadu_si128(source);
	_mm_storeu_si128(destination, temp);
	copy_sse<N - 1>(destination + 1, source + 1);
}


// Recursion end: nothing left to copy.
template<>
inline void copy_sse<0>(__m128i*, const __m128i*)
{
}


// Generator for table_sse: entry N is the function copying N 16 byte blocks.
template<size_t N>
struct SSEGenerator {
	constexpr static void (*sValue)(__m128i*, const __m128i*) = copy_sse<N>;
};
// table_sse[n], 0 <= n < 4, copies n * 16 bytes.
constexpr static GenerateTable<SSEGenerator, 4> table_sse;


// Copies length bytes in 16 byte blocks. length must be at least 16, since
// the final block is addressed as (start + length - 16).
static inline void memcpy_sse(uint8_t* destination, const uint8_t* source,
	size_t length)
{
	auto to = reinterpret_cast<__m128i*>(destination);
	auto from = reinterpret_cast<const __m128i*>(source);

	// Position of the last 16 bytes of the range; computed now because
	// length is consumed by the code below.
	auto toEnd = reinterpret_cast<__m128i*>(destination + length - 16);
	auto fromEnd = reinterpret_cast<const __m128i*>(source + length - 16);

	// Main part: 64 bytes per iteration.
	while (length >= 64) {
		copy_sse<4>(to, from);
		to += 4;
		from += 4;
		length -= 64;
	}

	// One to three whole 16 byte blocks may remain.
	if (length >= 16) {
		table_sse[length / 16](to, from);
		length %= 16;
	}

	// Fewer than 16 bytes are left: copy the last 16 bytes of the range,
	// overlapping with what has already been copied.
	if (length) {
		copy_sse<1>(toEnd, fromEnd);
	}
}


}


// memcpy() entry point. Picks the implementation by length:
//	length <= 16		memcpy_small()
//	length < 2048		memcpy_sse()
//	otherwise		memcpy_repmovs()
// Returns destination.
extern "C" void* memcpy(void* destination, const void* source, size_t length)
{
	auto to = static_cast<uint8_t*>(destination);
	auto from = static_cast<const uint8_t*>(source);

	if (length <= 16) {
		memcpy_small(to, from, length);
		return destination;
	}

	if (length < 2048) {
		memcpy_sse(to, from, length);
		return destination;
	}

	memcpy_repmovs(to, from, length);
	return destination;
}


// Fills length bytes at destination with value using a single "rep stosb".
// NOTE(review): as with memcpy_repmovs(), a clear direction flag is presumed.
static inline void
memset_repstos(uint8_t* destination, uint8_t value, size_t length)
{
	__asm__ __volatile__("rep stosb"
		: "+D" (destination), "+c" (length)
		: "a" (value)
		: "memory");
}


// Fills length bytes at destination with value using 16 byte SSE stores.
// length must be at least 16, since the final store is addressed as
// (destination + length - 16).
static inline void
memset_sse(uint8_t* destination, uint8_t value, size_t length)
{
	__m128i packed = _mm_set1_epi8(value);

	// Position of the last 16 bytes of the range; computed now because
	// length is consumed by the code below.
	auto end = reinterpret_cast<__m128i*>(destination + length - 16);

	// If the start is not 16 byte aligned, one unaligned store covers the
	// bytes up to the next 16 byte boundary (and some beyond it, which the
	// aligned stores below write again).
	auto diff = reinterpret_cast<uintptr_t>(destination) % 16;
	if (diff) {
		diff = 16 - diff;
		length -= diff;
		_mm_storeu_si128(reinterpret_cast<__m128i*>(destination), packed);
	}

	// ptr is 16 byte aligned from here on.
	auto ptr = reinterpret_cast<__m128i*>(destination + diff);
	while (length >= 64) {
		_mm_store_si128(ptr++, packed);
		_mm_store_si128(ptr++, packed);
		_mm_store_si128(ptr++, packed);
		_mm_store_si128(ptr++, packed);
		length -= 64;
	}
	while (length >= 16) {
		_mm_store_si128(ptr++, packed);
		length -= 16;
	}

	// Unconditional unaligned store of the last 16 bytes; takes care of
	// whatever the aligned stores did not reach.
	_mm_storeu_si128(end, packed);
}


// Fills length bytes at destination with value.
// For length >= 8 the range is written 8 bytes at a time, followed by one
// 8 byte store covering the last 8 bytes of the range. Shorter ranges are
// written byte by byte.
static inline void
memset_small(uint8_t* destination, uint8_t value, size_t length)
{
	if (length >= 8) {
		// Replicate value into each of the eight bytes of a 64-bit word.
		const uint64_t packed
			= static_cast<uint64_t>(value) * 0x0101010101010101ull;

		auto ptr = reinterpret_cast<unaligned_uint64*>(destination);
		auto end
			= reinterpret_cast<unaligned_uint64*>(destination + length - 8);
		while (length >= 8) {
			*ptr++ = packed;
			length -= 8;
		}
		*end = packed;
	} else {
		while (length--) {
			*destination++ = value;
		}
	}
}


// memset() entry point. chr is converted to unsigned char, then the
// implementation is picked by length:
//	length < 32		memset_small()
//	length < 2048		memset_sse()
//	otherwise		memset_repstos()
// Returns ptr.
extern "C" void*
memset(void* ptr, int chr, size_t length)
{
	auto value = static_cast<unsigned char>(chr);
	auto destination = static_cast<uint8_t*>(ptr);

	if (length < 32) {
		memset_small(destination, value, length);
		return ptr;
	}

	if (length < 2048) {
		memset_sse(destination, value, length);
		return ptr;
	}

	memset_repstos(destination, value, length);
	return ptr;
}