DU Results -

Source patches

du6bitcount.hpp

8c8
< #include <immintrin.h>
---
> #include <nmmintrin.h>
206c206,209
<         return _mm_set1_epi64x( static_cast<std::uint64_t>(-1LL));
---
>         __m128i n;
>         for (std::size_t i = 0; i < packs64; ++i)
>             n.m128i_u64[i] = static_cast<std::uint64_t>(-1LL);
>         return n;
211c214
<         return reinterpret_cast<const std::uint8_t*>(&vec)[index];
---
>         return vec.m128i_u8[index];
216c219
<         return reinterpret_cast<const std::uint64_t*>(&vec)[index];
---
>         return vec.m128i_u64[index];
221c224
<         return reinterpret_cast<std::uint64_t*>(&vec)[index];
---
>         return vec.m128i_u64[index];
226,227c229,232
<         auto r = _mm_cmpeq_epi64(v1,v2);
< 	return _mm_test_all_ones(r);
---
>         bool res = true;
>         for (std::size_t i = 0; i < packs64; ++i)
>             res = res && v1.m128i_u64[i] == v2.m128i_u64[i];
>         return res;
230c235
<     static __m128i or_(__m128i v1, __m128i v2)
---
>     static __m128i or(__m128i v1, __m128i v2)
235c240
<     static __m128i and_(__m128i v1, __m128i v2)
---
>     static __m128i and(__m128i v1, __m128i v2)
240c245
<     static __m128i xor_(__m128i v1, __m128i v2)
---
>     static __m128i xor(__m128i v1, __m128i v2)
260c265,268
<         return _mm256_set1_epi64x( static_cast<std::uint64_t>(-1LL));
---
>         __m256i n;
>         for (std::size_t i = 0; i < packs64; ++i)
>             n.m256i_u64[i] = static_cast<std::uint64_t>(-1LL);
>         return n;
265c273
<         return reinterpret_cast<const std::uint8_t*>(&vec)[index];
---
>         return vec.m256i_u8[index];
270c278
<         return reinterpret_cast<std::uint64_t*>(&vec)[index];
---
>         return vec.m256i_u64[index];
275c283
<         return reinterpret_cast<const std::uint64_t*>(&vec)[index];
---
>         return vec.m256i_u64[index];
280,281c288,291
<         auto r = _mm256_cmpeq_epi64(v1,v2);
< 	return _mm256_movemask_epi8(r) == 0xFFFFFFFF;
---
>         bool res = true;
>         for (std::size_t i = 0; i < packs64; ++i)
>             res = res && v1.m256i_u64[i] == v2.m256i_u64[i];
>         return res;
284c294
<     static __m256i or_(__m256i v1, __m256i v2)
---
>     static __m256i or(__m256i v1, __m256i v2)
289c299
<     static __m256i and_(__m256i v1, __m256i v2)
---
>     static __m256i and(__m256i v1, __m256i v2)
294c304
<     static __m256i xor_(__m256i v1, __m256i v2)
---
>     static __m256i xor(__m256i v1, __m256i v2)
344c354
<     static __m512i or_(__m512i v1, __m512i v2)
---
>     static __m512i or(__m512i v1, __m512i v2)
349c359
<     static __m512i and_(__m512i v1, __m512i v2)
---
>     static __m512i and(__m512i v1, __m512i v2)
354c364
<     static __m512i xor_(__m512i v1, __m512i v2)
---
>     static __m512i xor(__m512i v1, __m512i v2)
398c408
<             total_simd = simd_traits_t::or_(total_simd, m_storage[i]);
---
>             total_simd = simd_traits_t::or(total_simd, m_storage[i]);
400,401c410
<         std::size_t i = 0; 
<         for (; i < remaining_simd_in_last(); ++i)
---
>         for (std::size_t i = 0; i < remaining_simd_in_last(); ++i)
426c435
<             m_storage[i] = simd_traits_t::xor_(a.m_storage[i], neg_one);
---
>             m_storage[i] = simd_traits_t::xor(a.m_storage[i], neg_one);
442c451
<             m_storage[i] = simd_traits_t::and_(a.m_storage[i], b.m_storage[i]);
---
>             m_storage[i] = simd_traits_t::and(a.m_storage[i], b.m_storage[i]);
459c468
<             m_storage[i] = simd_traits_t::or_(a.m_storage[i], b.m_storage[i]);
---
>             m_storage[i] = simd_traits_t::or(a.m_storage[i], b.m_storage[i]);
533c542
< #endif
---
> #endif
\ No newline at end of file

Flex error reports

C++ error reports

Linker error reports

Run output diffs

empty.in

size	and/random/AVX	or/random/AVX	not/random/AVX	zero/random/AVX	bitcount/zero/AVX	bitcount/random/AVX	bitcount/one/AVX
16361	0.00650001	0.00610001	0.00540001	0.0045	0.1146	0.1145	0.1143
65521	0.00470003	0.00470003	0.00410002	0.00330002	0.113201	0.112901	0.113301
262161	0.0119002	0.0119002	0.00990013	0.00310004	0.116402	0.116602	0.116502
1048721	0.0198009	0.0193009	0.0135006	0.00770034	0.123205	0.123706	0.123205
4194961	0.0223076	0.0224076	0.0166057	0.0104035	0.127744	0.127443	0.127644
16779921	0.0224358	0.0223356	0.0165264	0.0104166	0.128605	0.128405	0.128505