Bit count : preprocessor magic vs modern C++

Why not use the standard library?

#include <bitset>

int bits_in(std::uint64_t u)
{
    auto bs = std::bitset<64>(u);
    return bs.count();
}

resulting assembler (Compiled with -O2 -march=native):

bits_in(unsigned long):
        xor     eax, eax
        popcnt  rax, rdi
        ret

It is worth mentioning at this point that not all x86 processors have this instruction so (at least with gcc) you will need to let it know what architecture to compile for.

@tambre mentioned that in reality, when it can, the optimiser will go further:

volatile int results[3];

int main()
{
    results[0] = bits_in(255);
    results[1] = bits_in(1023);
    results[2] = bits_in(0x8000800080008000);   
}

resulting assembler:

main:
        mov     DWORD PTR results[rip], 8
        xor     eax, eax
        mov     DWORD PTR results[rip+4], 10
        mov     DWORD PTR results[rip+8], 4
        ret

Old-school bit-twiddlers like me need to find new problems to solve 🙂

Update

Not everyone was happy that the solution relies on cpu help to compute the bitcount. So what if we used an autogenerated table but allowed the developer to configure the size of it? (warning – long compile time for the 16-bit table version)

#include <utility>
#include <cstdint>
#include <array>
#include <numeric>
#include <bitset>


template<std::size_t word_size, std::size_t...Is>
constexpr auto generate(std::integral_constant<std::size_t, word_size>, std::index_sequence<Is...>) {
    struct popcount_type {
        constexpr auto operator()(int i) const {
            int bits = 0;
            while (i) {
                i &= i - 1;
                ++bits;
            }
            return bits;
        }
    };
    constexpr auto popcnt = popcount_type();

    return std::array<int, sizeof...(Is)>
            {
                    {popcnt(Is)...}
            };
}

template<class T>
constexpr auto power2(T x) {
    T result = 1;
    for (T i = 0; i < x; ++i)
        result *= 2;
    return result;
}


template<class TableWord>
struct table {
    static constexpr auto word_size = std::numeric_limits<TableWord>::digits;
    static constexpr auto table_length = power2(word_size);
    using array_type = std::array<int, table_length>;
    static const array_type& get_data() {
        static const array_type data = generate(std::integral_constant<std::size_t, word_size>(),
                                           std::make_index_sequence<table_length>());
        return data;
    };

};

template<class Word>
struct use_table_word {
};

template<class Word, class TableWord = std::uint8_t>
int bits_in(Word val, use_table_word<TableWord> = use_table_word<TableWord>()) {
    constexpr auto table_word_size = std::numeric_limits<TableWord>::digits;

    constexpr auto word_size = std::numeric_limits<Word>::digits;
    constexpr auto times = word_size / table_word_size;
    static_assert(times > 0, "incompatible");

    auto reduce = [val](auto times) {
        return (val >> (table_word_size * times)) & (power2(table_word_size) - 1);
    };

    auto const& data = table<TableWord>::get_data();
    auto result = 0;
    for (int i = 0; i < times; ++i) {
        result += data[reduce(i)];
    }
    return result;
}

volatile int results[3];

#include <iostream>

int main() {
    auto input = std::uint64_t(1023);
    results[0] = bits_in(input);
    results[0] = bits_in(input, use_table_word<std::uint16_t>());

    results[1] = bits_in(0x8000800080008000);
    results[2] = bits_in(34567890);

    for (int i = 0; i < 3; ++i) {
        std::cout << results[i] << std::endl;
    }
    return 0;
}

Final Update

This version allows the use of any number of bits in the lookup table and supports any input type, even if it’s smaller than the number of bits in the lookup table.

It also short-circuits if the high bits are zero.

#include <utility>
#include <cstdint>
#include <array>
#include <numeric>
#include <algorithm>

namespace detail {
    template<std::size_t bits, typename = void>
    struct smallest_word;

    template<std::size_t bits>
    struct smallest_word<bits, std::enable_if_t<(bits <= 8)>>
    {
        using type = std::uint8_t;
    };

    template<std::size_t bits>
    struct smallest_word<bits, std::enable_if_t<(bits > 8 and bits <= 16)>>
    {
        using type = std::uint16_t;
    };

    template<std::size_t bits>
    struct smallest_word<bits, std::enable_if_t<(bits > 16 and bits <= 32)>>
    {
        using type = std::uint32_t;
    };

    template<std::size_t bits>
    struct smallest_word<bits, std::enable_if_t<(bits > 32 and bits <= 64)>>
    {
        using type = std::uint64_t;
    };
}

template<std::size_t bits> using smallest_word = typename detail::smallest_word<bits>::type;

template<class WordType, std::size_t bits, std::size_t...Is>
constexpr auto generate(std::index_sequence<Is...>) {

    using word_type = WordType;

    struct popcount_type {
        constexpr auto operator()(word_type i) const {
            int result = 0;
            while (i) {
                i &= i - 1;
                ++result;
            }
            return result;
        }
    };
    constexpr auto popcnt = popcount_type();

    return std::array<word_type, sizeof...(Is)>
            {
                    {popcnt(Is)...}
            };
}

template<class T>
constexpr auto power2(T x) {
    return T(1) << x;
}

template<std::size_t word_size>
struct table {

    static constexpr auto table_length = power2(word_size);

    using word_type = smallest_word<word_size>;

    using array_type = std::array<word_type, table_length>;

    static const array_type& get_data() {
        static const array_type data = generate<word_type, word_size>(std::make_index_sequence<table_length>());
        return data;
    };

    template<class Type, std::size_t bits>
    static constexpr auto n_bits() {
        auto result = Type();
        auto b = bits;
        while(b--) {
            result = (result << 1) | Type(1);
        }
        return result;
    };

    template<class Uint>
    int operator()(Uint i) const {
        constexpr auto mask = n_bits<Uint, word_size>();
        return get_data()[i & mask];
    }

};

template<int bits>
struct use_bits {
    static constexpr auto digits = bits;
};

template<class T>
constexpr auto minimum(T x, T y) { return x < y ? x : y; }

template<class Word, class UseBits = use_bits<8>>
int bits_in(Word val, UseBits = UseBits()) {

    using word_type = std::make_unsigned_t<Word>;
    auto uval = static_cast<word_type>(val);


    constexpr auto table_word_size = UseBits::digits;
    constexpr auto word_size = std::numeric_limits<word_type>::digits;

    auto const& mytable = table<table_word_size>();
    int result = 0;
    while (uval)
    {
        result += mytable(uval);
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wshift-count-overflow"
                uval >>= minimum(table_word_size, word_size);
#pragma clang diagnostic pop
    }

    return result;
}

volatile int results[4];

#include <iostream>

int main() {
    auto input = std::uint8_t(127);
    results[0] = bits_in(input);
    results[1] = bits_in(input, use_bits<4>());
    results[2] = bits_in(input, use_bits<11>());
    results[3] = bits_in(input, use_bits<15>());

    for (auto&& i : results) {
        std::cout << i << std::endl;
    }

    auto input2 = 0xabcdef;
    results[0] = bits_in(input2);
    results[1] = bits_in(input2, use_bits<4>());
    results[2] = bits_in(input2, use_bits<11>());
    results[3] = bits_in(input2, use_bits<15>());

    for (auto&& i : results) {
        std::cout << i << std::endl;
    }

    auto input3 = -1;
    results[0] = bits_in(input3);
    results[1] = bits_in(input3, use_bits<4>());
    results[2] = bits_in(input3, use_bits<11>());
    results[3] = bits_in(input3, use_bits<15>());

    for (auto&& i : results) {
        std::cout << i << std::endl;
    }

    return 0;
}

example output:

The resulting assembly output for a call to bits_in(int, use_bits<11>()) for example, becomes:

.L16:
        mov     edx, edi
        and     edx, 2047
        movzx   edx, WORD PTR table<11ul>::get_data()::data[rdx+rdx]
        add     eax, edx
        shr     edi, 11
        jne     .L16

Which seems reasonable to me.

Update

Final Update

Leave a Comment Cancel reply