#include #include #include #include #include #include #include #include #include #if defined(BOOST_WINDOWS) #define WIN32_LEAN_AND_MEAN #include #endif #if defined(_MSC_VER) // MSVC does not always have immintrin.h (at least, not up to MSVC 10), and we cannot detect the target higher than SSE2 anyway, so just include this header #include #else #include #endif using boost::uuids::uuid; bool stock_equal(uuid const& left, uuid const& right) { return std::equal(left.begin(), left.end(), right.begin()); } bool mem_equal(uuid const& left, uuid const& right) { return std::memcmp(left.data, right.data, 16) == 0; } bool simd_equal(uuid const& left, uuid const& right) { #if defined(__SSE3__) __m128i mm_left = _mm_lddqu_si128(reinterpret_cast< const __m128i* >(left.data)); __m128i mm_right = _mm_lddqu_si128(reinterpret_cast< const __m128i* >(right.data)); #else __m128i mm_left = _mm_loadu_si128(reinterpret_cast< const __m128i* >(left.data)); __m128i mm_right = _mm_loadu_si128(reinterpret_cast< const __m128i* >(right.data)); #endif __m128i mm_cmp = _mm_cmpeq_epi32(mm_left, mm_right); #if defined(__SSE4_1__) return _mm_test_all_ones(mm_cmp); #else return _mm_movemask_epi8(mm_cmp) == 0xFFFF; #endif } bool stock_less(uuid const& left, uuid const& right) { return std::lexicographical_compare(left.begin(), left.end(), right.begin(), right.end()); } bool mem_less(uuid const& left, uuid const& right) { return std::memcmp(left.data, right.data, 16) < 0; } bool simd_less(uuid const& left, uuid const& right) { #if defined(__SSE3__) __m128i mm_left = _mm_lddqu_si128(reinterpret_cast< const __m128i* >(left.data)); __m128i mm_right = _mm_lddqu_si128(reinterpret_cast< const __m128i* >(right.data)); #else __m128i mm_left = _mm_loadu_si128(reinterpret_cast< const __m128i* >(left.data)); __m128i mm_right = _mm_loadu_si128(reinterpret_cast< const __m128i* >(right.data)); #endif // To emulate lexicographical_compare behavior we have to perform two comparisons - the forward and reverse one. // Then we know which bytes are equivalent and which ones are different, and for those different the comparison results // will be opposite. Then we'll be able to find the first differing comparison result (for both forward and reverse ways), // and depending on which way it is for, this will be the result of the operation. There are a few notes to consider: // // 1. Due to little endian byte order the first bytes go into the lower part of the xmm registers, // so the comparison results in the least significant bits will actually be the most signigicant for the final operation result. // This means we have to determine which of the comparison results have the least significant bit on, and this is achieved with // the "(x - 1) ^ x" trick. // 2. Because there is only signed comparison in SSE/AVX, we have to operate on 16 bit integers. We zero-extend bytes to 16 bit words // before comparison and then pack the comparison results back to 8 bits. For AVX2 we can still do the comparison in one instruction though. // 3. pcmpgtw compares for "greater" relation, so we swap the arguments to get what we need. const __m128i mm_0 = _mm_setzero_si128(); __m128i mm_left_lo = _mm_unpacklo_epi8(mm_left, mm_0), mm_right_lo = _mm_unpacklo_epi8(mm_right, mm_0); __m128i mm_left_hi = _mm_unpackhi_epi8(mm_left, mm_0), mm_right_hi = _mm_unpackhi_epi8(mm_right, mm_0); __m128i mm_cmp_lo = _mm_cmpgt_epi16(mm_right_lo, mm_left_lo), mm_cmp_hi = _mm_cmpgt_epi16(mm_right_hi, mm_left_hi); __m128i mm_rcmp_lo = _mm_cmpgt_epi16(mm_left_lo, mm_right_lo), mm_rcmp_hi = _mm_cmpgt_epi16(mm_left_hi, mm_right_hi); __m128i mm_cmp = _mm_packs_epi16(mm_cmp_lo, mm_cmp_hi); __m128i mm_rcmp = _mm_packs_epi16(mm_rcmp_lo, mm_rcmp_hi); boost::uint32_t cmp = _mm_movemask_epi8(mm_cmp), rcmp = _mm_movemask_epi8(mm_rcmp); cmp = (cmp - 1u) ^ cmp; rcmp = (rcmp - 1u) ^ rcmp; return static_cast< boost::uint16_t >(cmp) < static_cast< boost::uint16_t >(rcmp); } typedef bool equal_t(uuid const& left, uuid const& right); const unsigned int iterations = 100000000u; void test_performance(uuid const& left, uuid const& right, const char* name, equal_t* eq) { #if !defined(BOOST_WINDOWS) struct timespec start = {}, end = {}; clock_gettime(CLOCK_REALTIME, &start); #else LARGE_INTEGER start = {}, end = {}; QueryPerformanceCounter(&start); #endif for (volatile unsigned int i = 0; i < iterations; ++i) { eq(left, right); } #if !defined(BOOST_WINDOWS) clock_gettime(CLOCK_REALTIME, &end); boost::uint64_t duration = (end.tv_sec - start.tv_sec) * 1000000000ull + (end.tv_nsec - start.tv_nsec); #else QueryPerformanceCounter(&end); LARGE_INTEGER freq = {}; QueryPerformanceFrequency(&freq); boost::uint64_t duration = (end.QuadPart - start.QuadPart) * 1000000000ull / freq.QuadPart; #endif std::cout << name << " duration: " << duration << " ns" << std::endl; } void run_tests(uuid const& left, uuid const& right) { test_performance(left, right, "stock_equal", &stock_equal); test_performance(left, right, "mem_equal", &mem_equal); test_performance(left, right, "simd_equal", &simd_equal); test_performance(left, right, "stock_less", &stock_less); test_performance(left, right, "mem_less", &mem_less); test_performance(left, right, "simd_less", &simd_less); } int main(int, char*[]) { boost::uuids::random_generator gen; { std::cout << "Values placed on stack:" << std::endl; uuid left = gen(); uuid right = gen(); run_tests(left, right); } { std::cout << "\nValues placed on heap:" << std::endl; std::auto_ptr< uuid > pleft(new uuid(gen())); std::auto_ptr< uuid > pright(new uuid(gen())); run_tests(*pleft, *pright); } return 0; }