c++ - char 配列で最大 6 つの連続する 0 ビットを見つける最速の方法

Question

これは私が現在やっていることです：

int dataLen = 500;
char data[dataLen];
int desired = 1; // between 1 and 6, inclusive
...
char bits[dataLen * 8];
for (int32 j = 0; j < dataLen; j++) {
    for (int i = 0; i < 8; i++) {
        bits[j*8+i] = ( (data[j] & (1 << (7-i))) ? '1' : '0' );
    }
}
int offset = std::search_n(bits, bits + dataLen*8, desired, '0') - bits;

本当に厄介です、私は知っています、そしてそれはパフォーマンスを殺しています。

xchar 配列内の連続する 0 ビットの最初のセットのビットオフセットを見つける最速の方法は何0 < x < 7ですか? 私は SSE 4.2 で GCC を使用しているため、__builtin_ctz、__builtin_popcountl などのビルトインはオプションですが、それらを使用する最良の方法がわかりません。

score 5 · Accepted Answer

連続する 2 バイトを考慮しても、0 ビットが 6 つ連続する数字はいくつありますか?

Byte 1
XXXXXXXX
000000??             0/1/2/3
?000000?             0/1/128/129
??000000             0/64/128/192

したがって、一度に 1 バイトを考慮すると、0/1/2/3/64/128/192

Byte a   Byte b
XXXXXXXX XXXXXXXX
??100000 0???????    (a & 31 == 0) && (b & 128 == 0)
???10000 00??????    (a & 15 == 0) && (b & 192 == 0)
????1000 000?????    (a & 7  == 0) && (b & 224 == 0)
?????100 0000????    (a & 3  == 0) && (b & 240 == 0)
??????10 00000???    (a & 1  == 0) && (b & 248 == 0)

したがって、絶対最大 12 回のテストですべての組み合わせが得られます。
賢く比較すれば、それを減らすことができると確信しています。

テーブル駆動型アプローチを使用するために、以下の @Michael Burr ソリューションをスチールするとします。次に、バイトごとに 1 つまたは 2 つの比較を実行できるように構成します。

struct TestStruct
{
    bool    is6Consecutive;
    bool    hasTrailing;
    int     maskNextByte;
    int     offset;
};
TestStruct   testData[256];

std::size_t findOffsetOf_6ConsecutiveZero(char const* data, size_t size)
{
    for(int loop = 0;loop < (size-1); ++loop)
    {
        char const&           val  = data[loop];
        TestStructure const&  test = testData[val];

        if (test.is6Consecutive)
        {   return loop*8 + test.offset;
        }

        if (test.hasTrailing)
        {
            if ((data[loop + 1] & test.maskNextByte) == 0)
            {   return loop*8 + test.offset;
            }
        }
    }
    // Test last byte
    TestStructure const&  test = testData[data[size-1]];
    if (test.is6Consecutive)
    {   return (size-1)*8 + test.offset;
    }

    return -1;
}

最初のいくつかのテーブルエントリ:

TestStruct   testData[256] =
{
    {true,  false, 0x00, 0},
    {true,  false, 0x00, 0},
    {true,  false, 0x00, 0},
    {true,  false, 0x00, 0},
    {false, true,  0xf0, 6},  // 4 => 00000100 <Next 4 bytes are zero we hit>
    {false, false, 0x00, 0},  // 5 => 00000101 <Ignore and move on>
    {false, true,  0xf8, 7},  // 6 => 00000110 <Next 5 bytes are zero we hit>
    etc...

};

score 3 · Accepted Answer

これは、質問で提供されたものの出力と一致する関数です（少なくとも限定的なテストでは）。これは、1 回限りのスクリプトによって生成されたテーブルを使用して、テーブルルックアップを使用します。そのパフォーマンスがビットテストハッカーや GCC ビルトインを使用する提案と競合するかどうかは正直わかりませんが、そう遠くないことは間違いありません。

struct zeros {
    unsigned char leading;
    unsigned char internal;
    unsigned char trailing;
};

// forward declaration so the long, non-interesting table is at the 
//  end of this 
static struct zeros const zero_table[256];

int find_zero_bits_offset( char const* data, int datalen, int desired)
{
    int offset = -1;
    int found = 0;
    char const* dataptr = &data[0];
    char const* endptr  = &data[datalen];


    // first, find which byte the sequence of zeros begins

    while (!found && (dataptr != endptr)) {
        unsigned char ch1 = *dataptr++;
        unsigned char ch2 = (dataptr != endptr) ? *dataptr : 0xff;

        int internal = zero_table[ch1].internal;
        int trailing = zero_table[ch1].trailing;
        int leading =  zero_table[ch2].leading;

        found = (desired <= internal) || 
                (trailing && (desired <= (trailing + leading)));
    }


    // now zero in on where the sequence starts within the byte

    if (found) {
        char ch = 0;
        unsigned int mask = 0;

        --dataptr;

        // dataptr points to the byte where the sequence of zeros starts.
        //  figure out exactly where the sequence is...

        // there's possibly some opportunity for optimization, if neccesary,
        //  by testing if the sequence was found in the "leading", "internal", or
        //  "trailing" cases. But the explicit loop will only iterate at most
        //  8 times (and will early-out on the first iteration if the match is 
        //  for the "leading" case) that it didn't seem too important

        ch = *dataptr;
        offset = (dataptr - data) * 8;

        // figure out where the appropriate internal run starts
        // note that the offset we need to return isn't necessarily the
        //  offset for the run of zeros counted by zero_table[tmp].internal_offset
        //  since a sufficient shorter run might come first

        // there may be a more efficient bithack for this, but the
        //  loop will iterate at most 8 times...
        mask = ((1 << desired) - 1);
        mask <<= (8 - desired);

        while ((ch & mask) != 0) {
            ++offset;
            mask >>= 1;
        }
    }
    else {
        // however you want to handle the "not found" situation. 
        //  This is equivalent to what was in the question:
        offset = (endptr - data) * 8;
    }

    return offset;
}



static struct zeros const zero_table[256] = {
    { 8, 8, 8 },  // 0000 0000
    { 7, 7, 0 },  // 0000 0001
    { 6, 6, 1 },  // 0000 0010
    { 6, 6, 0 },  // 0000 0011
    { 5, 5, 2 },  // 0000 0100
    { 5, 5, 0 },  // 0000 0101
    { 5, 5, 1 },  // 0000 0110
    { 5, 5, 0 },  // 0000 0111
    { 4, 4, 3 },  // 0000 1000
    { 4, 4, 0 },  // 0000 1001
    { 4, 4, 1 },  // 0000 1010
    { 4, 4, 0 },  // 0000 1011
    { 4, 4, 2 },  // 0000 1100
    { 4, 4, 0 },  // 0000 1101
    { 4, 4, 1 },  // 0000 1110
    { 4, 4, 0 },  // 0000 1111
    { 3, 4, 4 },  // 0001 0000
    { 3, 3, 0 },  // 0001 0001
    { 3, 3, 1 },  // 0001 0010
    { 3, 3, 0 },  // 0001 0011
    { 3, 3, 2 },  // 0001 0100
    { 3, 3, 0 },  // 0001 0101
    { 3, 3, 1 },  // 0001 0110
    { 3, 3, 0 },  // 0001 0111
    { 3, 3, 3 },  // 0001 1000
    { 3, 3, 0 },  // 0001 1001
    { 3, 3, 1 },  // 0001 1010
    { 3, 3, 0 },  // 0001 1011
    { 3, 3, 2 },  // 0001 1100
    { 3, 3, 0 },  // 0001 1101
    { 3, 3, 1 },  // 0001 1110
    { 3, 3, 0 },  // 0001 1111
    { 2, 5, 5 },  // 0010 0000
    { 2, 4, 0 },  // 0010 0001
    { 2, 3, 1 },  // 0010 0010
    { 2, 3, 0 },  // 0010 0011
    { 2, 2, 2 },  // 0010 0100
    { 2, 2, 0 },  // 0010 0101
    { 2, 2, 1 },  // 0010 0110
    { 2, 2, 0 },  // 0010 0111
    { 2, 3, 3 },  // 0010 1000
    { 2, 2, 0 },  // 0010 1001
    { 2, 2, 1 },  // 0010 1010
    { 2, 2, 0 },  // 0010 1011
    { 2, 2, 2 },  // 0010 1100
    { 2, 2, 0 },  // 0010 1101
    { 2, 2, 1 },  // 0010 1110
    { 2, 2, 0 },  // 0010 1111
    { 2, 4, 4 },  // 0011 0000
    { 2, 3, 0 },  // 0011 0001
    { 2, 2, 1 },  // 0011 0010
    { 2, 2, 0 },  // 0011 0011
    { 2, 2, 2 },  // 0011 0100
    { 2, 2, 0 },  // 0011 0101
    { 2, 2, 1 },  // 0011 0110
    { 2, 2, 0 },  // 0011 0111
    { 2, 3, 3 },  // 0011 1000
    { 2, 2, 0 },  // 0011 1001
    { 2, 2, 1 },  // 0011 1010
    { 2, 2, 0 },  // 0011 1011
    { 2, 2, 2 },  // 0011 1100
    { 2, 2, 0 },  // 0011 1101
    { 2, 2, 1 },  // 0011 1110
    { 2, 2, 0 },  // 0011 1111
    { 1, 6, 6 },  // 0100 0000
    { 1, 5, 0 },  // 0100 0001
    { 1, 4, 1 },  // 0100 0010
    { 1, 4, 0 },  // 0100 0011
    { 1, 3, 2 },  // 0100 0100
    { 1, 3, 0 },  // 0100 0101
    { 1, 3, 1 },  // 0100 0110
    { 1, 3, 0 },  // 0100 0111
    { 1, 3, 3 },  // 0100 1000
    { 1, 2, 0 },  // 0100 1001
    { 1, 2, 1 },  // 0100 1010
    { 1, 2, 0 },  // 0100 1011
    { 1, 2, 2 },  // 0100 1100
    { 1, 2, 0 },  // 0100 1101
    { 1, 2, 1 },  // 0100 1110
    { 1, 2, 0 },  // 0100 1111
    { 1, 4, 4 },  // 0101 0000
    { 1, 3, 0 },  // 0101 0001
    { 1, 2, 1 },  // 0101 0010
    { 1, 2, 0 },  // 0101 0011
    { 1, 2, 2 },  // 0101 0100
    { 1, 1, 0 },  // 0101 0101
    { 1, 1, 1 },  // 0101 0110
    { 1, 1, 0 },  // 0101 0111
    { 1, 3, 3 },  // 0101 1000
    { 1, 2, 0 },  // 0101 1001
    { 1, 1, 1 },  // 0101 1010
    { 1, 1, 0 },  // 0101 1011
    { 1, 2, 2 },  // 0101 1100
    { 1, 1, 0 },  // 0101 1101
    { 1, 1, 1 },  // 0101 1110
    { 1, 1, 0 },  // 0101 1111
    { 1, 5, 5 },  // 0110 0000
    { 1, 4, 0 },  // 0110 0001
    { 1, 3, 1 },  // 0110 0010
    { 1, 3, 0 },  // 0110 0011
    { 1, 2, 2 },  // 0110 0100
    { 1, 2, 0 },  // 0110 0101
    { 1, 2, 1 },  // 0110 0110
    { 1, 2, 0 },  // 0110 0111
    { 1, 3, 3 },  // 0110 1000
    { 1, 2, 0 },  // 0110 1001
    { 1, 1, 1 },  // 0110 1010
    { 1, 1, 0 },  // 0110 1011
    { 1, 2, 2 },  // 0110 1100
    { 1, 1, 0 },  // 0110 1101
    { 1, 1, 1 },  // 0110 1110
    { 1, 1, 0 },  // 0110 1111
    { 1, 4, 4 },  // 0111 0000
    { 1, 3, 0 },  // 0111 0001
    { 1, 2, 1 },  // 0111 0010
    { 1, 2, 0 },  // 0111 0011
    { 1, 2, 2 },  // 0111 0100
    { 1, 1, 0 },  // 0111 0101
    { 1, 1, 1 },  // 0111 0110
    { 1, 1, 0 },  // 0111 0111
    { 1, 3, 3 },  // 0111 1000
    { 1, 2, 0 },  // 0111 1001
    { 1, 1, 1 },  // 0111 1010
    { 1, 1, 0 },  // 0111 1011
    { 1, 2, 2 },  // 0111 1100
    { 1, 1, 0 },  // 0111 1101
    { 1, 1, 1 },  // 0111 1110
    { 1, 1, 0 },  // 0111 1111
    { 0, 7, 7 },  // 1000 0000
    { 0, 6, 0 },  // 1000 0001
    { 0, 5, 1 },  // 1000 0010
    { 0, 5, 0 },  // 1000 0011
    { 0, 4, 2 },  // 1000 0100
    { 0, 4, 0 },  // 1000 0101
    { 0, 4, 1 },  // 1000 0110
    { 0, 4, 0 },  // 1000 0111
    { 0, 3, 3 },  // 1000 1000
    { 0, 3, 0 },  // 1000 1001
    { 0, 3, 1 },  // 1000 1010
    { 0, 3, 0 },  // 1000 1011
    { 0, 3, 2 },  // 1000 1100
    { 0, 3, 0 },  // 1000 1101
    { 0, 3, 1 },  // 1000 1110
    { 0, 3, 0 },  // 1000 1111
    { 0, 4, 4 },  // 1001 0000
    { 0, 3, 0 },  // 1001 0001
    { 0, 2, 1 },  // 1001 0010
    { 0, 2, 0 },  // 1001 0011
    { 0, 2, 2 },  // 1001 0100
    { 0, 2, 0 },  // 1001 0101
    { 0, 2, 1 },  // 1001 0110
    { 0, 2, 0 },  // 1001 0111
    { 0, 3, 3 },  // 1001 1000
    { 0, 2, 0 },  // 1001 1001
    { 0, 2, 1 },  // 1001 1010
    { 0, 2, 0 },  // 1001 1011
    { 0, 2, 2 },  // 1001 1100
    { 0, 2, 0 },  // 1001 1101
    { 0, 2, 1 },  // 1001 1110
    { 0, 2, 0 },  // 1001 1111
    { 0, 5, 5 },  // 1010 0000
    { 0, 4, 0 },  // 1010 0001
    { 0, 3, 1 },  // 1010 0010
    { 0, 3, 0 },  // 1010 0011
    { 0, 2, 2 },  // 1010 0100
    { 0, 2, 0 },  // 1010 0101
    { 0, 2, 1 },  // 1010 0110
    { 0, 2, 0 },  // 1010 0111
    { 0, 3, 3 },  // 1010 1000
    { 0, 2, 0 },  // 1010 1001
    { 0, 1, 1 },  // 1010 1010
    { 0, 1, 0 },  // 1010 1011
    { 0, 2, 2 },  // 1010 1100
    { 0, 1, 0 },  // 1010 1101
    { 0, 1, 1 },  // 1010 1110
    { 0, 1, 0 },  // 1010 1111
    { 0, 4, 4 },  // 1011 0000
    { 0, 3, 0 },  // 1011 0001
    { 0, 2, 1 },  // 1011 0010
    { 0, 2, 0 },  // 1011 0011
    { 0, 2, 2 },  // 1011 0100
    { 0, 1, 0 },  // 1011 0101
    { 0, 1, 1 },  // 1011 0110
    { 0, 1, 0 },  // 1011 0111
    { 0, 3, 3 },  // 1011 1000
    { 0, 2, 0 },  // 1011 1001
    { 0, 1, 1 },  // 1011 1010
    { 0, 1, 0 },  // 1011 1011
    { 0, 2, 2 },  // 1011 1100
    { 0, 1, 0 },  // 1011 1101
    { 0, 1, 1 },  // 1011 1110
    { 0, 1, 0 },  // 1011 1111
    { 0, 6, 6 },  // 1100 0000
    { 0, 5, 0 },  // 1100 0001
    { 0, 4, 1 },  // 1100 0010
    { 0, 4, 0 },  // 1100 0011
    { 0, 3, 2 },  // 1100 0100
    { 0, 3, 0 },  // 1100 0101
    { 0, 3, 1 },  // 1100 0110
    { 0, 3, 0 },  // 1100 0111
    { 0, 3, 3 },  // 1100 1000
    { 0, 2, 0 },  // 1100 1001
    { 0, 2, 1 },  // 1100 1010
    { 0, 2, 0 },  // 1100 1011
    { 0, 2, 2 },  // 1100 1100
    { 0, 2, 0 },  // 1100 1101
    { 0, 2, 1 },  // 1100 1110
    { 0, 2, 0 },  // 1100 1111
    { 0, 4, 4 },  // 1101 0000
    { 0, 3, 0 },  // 1101 0001
    { 0, 2, 1 },  // 1101 0010
    { 0, 2, 0 },  // 1101 0011
    { 0, 2, 2 },  // 1101 0100
    { 0, 1, 0 },  // 1101 0101
    { 0, 1, 1 },  // 1101 0110
    { 0, 1, 0 },  // 1101 0111
    { 0, 3, 3 },  // 1101 1000
    { 0, 2, 0 },  // 1101 1001
    { 0, 1, 1 },  // 1101 1010
    { 0, 1, 0 },  // 1101 1011
    { 0, 2, 2 },  // 1101 1100
    { 0, 1, 0 },  // 1101 1101
    { 0, 1, 1 },  // 1101 1110
    { 0, 1, 0 },  // 1101 1111
    { 0, 5, 5 },  // 1110 0000
    { 0, 4, 0 },  // 1110 0001
    { 0, 3, 1 },  // 1110 0010
    { 0, 3, 0 },  // 1110 0011
    { 0, 2, 2 },  // 1110 0100
    { 0, 2, 0 },  // 1110 0101
    { 0, 2, 1 },  // 1110 0110
    { 0, 2, 0 },  // 1110 0111
    { 0, 3, 3 },  // 1110 1000
    { 0, 2, 0 },  // 1110 1001
    { 0, 1, 1 },  // 1110 1010
    { 0, 1, 0 },  // 1110 1011
    { 0, 2, 2 },  // 1110 1100
    { 0, 1, 0 },  // 1110 1101
    { 0, 1, 1 },  // 1110 1110
    { 0, 1, 0 },  // 1110 1111
    { 0, 4, 4 },  // 1111 0000
    { 0, 3, 0 },  // 1111 0001
    { 0, 2, 1 },  // 1111 0010
    { 0, 2, 0 },  // 1111 0011
    { 0, 2, 2 },  // 1111 0100
    { 0, 1, 0 },  // 1111 0101
    { 0, 1, 1 },  // 1111 0110
    { 0, 1, 0 },  // 1111 0111
    { 0, 3, 3 },  // 1111 1000
    { 0, 2, 0 },  // 1111 1001
    { 0, 1, 1 },  // 1111 1010
    { 0, 1, 0 },  // 1111 1011
    { 0, 2, 2 },  // 1111 1100
    { 0, 1, 0 },  // 1111 1101
    { 0, 1, 1 },  // 1111 1110
    { 0, 0, 0 },  // 1111 1111
};

score 2 · Accepted Answer

配列を単語ごとに反復処理します (32 ビットまたは 64 ビットはアーキテクチャによって異なります)。とを使用__builtin_clzし__builtin_ctzて、各単語の先頭と末尾のゼロを計算します。

ゼロが連続する 2 つのケースがあります。

一言で
形容詞を越えて。

最初のケースは簡単に確認できます。2 番目のケースでは、このアイテムの先行ゼロ + 前のアイテムの後続ゼロが >= 6 であるかどうかを確認する必要があります。

score 1 · Accepted Answer

次の算術トリックに注意してください。

// remove the trailing one bits
y = x & (x + 1);

x       11100011
        +      1
        --------
x+1     11100100
x&(x+1) 11100000

// remove the trailing zero bits
z = y | (y - 1);

y       11100000
        -      1
        --------
y-1     11011111
y|(y-1) 11111111

// isolate the hole
hole = z - x;
hole = z ^ x;

z       11111111
x       11100011
        --------
z^x     00011100

// Now you can count the set bits of the hole.
length = bitcount(hole);

// Or you can make it by computing highbit only (log2).
length = highbit(z^y) - highbit(x^y);

したがって、考えられるアルゴリズムは、大きな整数演算でこれらのトリックを使用し、length==0 (穴がなくなる) または length>=n (x=z; で次のループを開始する) までループすることです。

大きな整数を自分でエミュレートし、バイトごとにコレクションに作用し、キャリーがなくなったら停止することができます。

x+1 は、byte==0xFF の場合にのみキャリーを持ちます
y-1 は、byte==0x00 の場合にのみキャリーを持ちます
highbit は 1 バイトで簡単にプログラムできます

これにより、次のような結果が得られます。

// return 1-based position of highest bit set in a byte
int highbit(unsigned char c)
{
    unsigned char n;
    int position = 0;
    n = c >> 4;
    if( n > 0 ) { c=n; position+=4; };
    n = c >> 2;
    if( n > 0 ) { c=n; position+=2; };
    n = c >> 1;
    if( n > 0 ) { c=n; position+=1; };
    position += c;
    return position;
}

int find_consecutive_zeros( unsigned char *bits , int nbytes , int nzero )
{
    int i,nTrailingOnes,nTrailingZero,sizeOfNextHole;
    unsigned char x,y;
    for(i=0 , x=bits[0]; 1; )
    {
        // perform y=x&(x+1) to count and remove trailing ones
        for(;x==0xFF && i<nbytes-1;x=bits[++i]);
        y = x&(x+1);
        nTrailingOnes = 8*i + highbit( x^y );
        // perform x=y|(y-1); to count and remove trailing zeros
        for(;y==0 && i<nbytes-1;y=bits[++i]);
        x = y|(y-1);
        nTrailingZero = 8*i + highbit( x^y );
        sizeOfNextHole = nTrailingZero - nTrailingOnes;
        // if size of hole is long enough, return it's low bit rank (0-based)
        if( sizeOfNextHole >= nzero ) return nTrailingOnes;
        // or return -1 if no more hole
        if( sizeOfNextHole == 0 ) return -1;
    }
}

基本コレクションに 1 バイトよりも長いものを使用することで、より効率的にすることができます...

編集: 6 のような nzero の固定サイズがある場合は高速化します
。上記のアルゴリズムはすべての穴で反復され、小さな穴で時間を失う可能性があります。
小さな穴が埋められた事前計算されたテーブルを使用すると、これを回避できます。

たとえば、10010101 には短すぎる穴が 3 つあるため、11111111 に置き換えることができます
。ただし、先頭と末尾のゼロは変更しないでください。

(x|(x-1))^xこのようなテーブルを初期化するには、末尾のゼロの代わりに1 ビットを含むマスクと先行ゼロの代わりに 1 ビットを含むマスクを使用して 0xFF と xor を取得するだけです((1<<highbit(x))-1)^0xFF。
10000001 の例外を追加します。これは、1 と 1 の間に 6 つのゼロを含む唯一のバイトです。

EDIT2：最初のバイトの最下位ビットを最初に使用してビットシーケンスを処理しました。これは、算術アプローチによく適合します。この質問では、最初のバイトの最上位ビットを最初に明示的に使用しています。そのため、質問に合わせてビットを逆にする必要があります。これは、テーブルの初期化中に行われます...

int reversebits(unsigned char c)
{
    c = ((c & 0x0F) << 4) | ((c & 0xF0) >> 4);
    c = ((c & 0x33) << 2) | ((c & 0xCC) >> 2);
    c = ((c & 0x55) << 1) | ((c & 0xAA) >> 1);
    return c;
}
void initializeFillShortHoles(unsigned char fillShortHoles[256])
    for(unsigned int x=0;x<256;x++) {
        fillShortHoles[reversebits(x)] = ((1<<highbit(x))-1) ^ (x|(x-1)) ^ x;
    }
    fillShortHoles[0]=0;     // no use to reverse bits for those two
    fillShortHoles[129]=129;
}

x=bits[ i ]次に、出現するをに置き換えるだけでx=fillShortHoles[ bits[ i ] ]完了です。

int find_6_consecutive_zeros( unsigned char *bits , int nbytes )
{
    static unsigned char fillShortHoles[256];
    static unsigned char highbitTable[256];
    static first=1;
    int i,nTrailingOnes,nTrailingZero,sizeOfNextHole;
    unsigned char x,y;

    if (first)
    {
        first = 0;
        initializeFillShortHoles( fillShortHoles );
        for(i=0;i<256;i++) highbitTable[i]=highbit(i);
    }
    for(x=fillShortHoles[bits[i=0]]; 1; )
    {
        // perform y=x&(x+1) to count trailing ones
        for(;x==0xFF && i<nbytes-1;x=fillShortHoles[bits[++i]]);
        y = x&(x+1);
        nTrailingOnes = 8*i + highbitTable[ x^y ];
        // perform z=y|(y-1); to count trailing zeros
        for(;y==0 && i<nbytes-1;y=fillShortHoles[bits[++i]]);
        x = y|(y-1);
        nTrailingZero = 8*i + highbitTable[ x^y ];
        sizeOfNextHole = nTrailingZero - nTrailingOnes;
        // if size of hole is long enough, return it's low bit rank (0-based)
        if( sizeOfNextHole >= 6 ) return nTrailingOnes;
        // or return -1 if no more hole
        if( sizeOfNextHole == 0 ) return -1;
    }
}

EDIT3:最後に、nzero<=9 の場合、より高速な方法は、バイトの各ペアの位置をキャッシュすることです。

int find_n_consecutive_zeros_bypair( unsigned char *bits , int nbytes , int nzero)
{
    static int first=1;
    static signed char holepositionbypair[8][65536];
    signed char position;
    int i;
    unsigned short x;
    if (first)
    {
        first = 0;
        for(i=0;i<8;i++) {
            initializeHolePositionByPair( holepositionbypair[i] , i+1 );
        }
    }
    for (i=0 , x=0xFF; i<nbytes; i++) {
        x = (x << 8) + bits[i];
        if( (position = holepositionbypair[nzero-1][x]) >= 0) return (i-1)*8 + position;
    }
    return -1;
}

初期化 x=0xFF は nbytes<2 の場合を処理することに注意してください。

キャッシュホールの位置をペアで埋める方法に関係なく、初期化時にのみ呼び出されます。もちろん、算術的な方法を提案します：

int highbit16(unsigned short c)
{
    unsigned short n;
    int position = 0;
    n = c >> 8;
    if( n ) { c=n; position+=8; };
    n = c >> 4;
    if( n ) { c=n; position+=4; };
    n = c >> 2;
    if( n ) { c=n; position+=2; };
    n = c >> 1;
    if( n ) { c=n; position+=1; };
    position += c;
    return position;
}
unsigned short reversebits16(unsigned short c)
{
    c = ((c & 0x00FF) << 8) | ((c & 0xFF00) >> 8);
    c = ((c & 0x0F0F) << 4) | ((c & 0xF0F0) >> 4);
    c = ((c & 0x3333) << 2) | ((c & 0xCCCC) >> 2);
    c = ((c & 0x5555) << 1) | ((c & 0xAAAA) >> 1);
    return c;
}
initializeHolePositionByPair(signed char holepositionbypair[65536],int n)
{
    int i,n1,n0;
    unsigned short x,y;
    signed char position;
    for(i=0;i<65536;i++) {
        position = -1;
        x = i;
        while(x != 0xFFFF) {
            /* remove trailing ones */
            y = x&(x+1);
            n1 = highbit16(x^y);
            /* remove trailing zeros */
            x = y|(y-1);
            n0 = highbit16(x^y);
            if(n0-n1>=n) {
                position = n1; break;
            }
        }
        holepositionbypair[reversebits16(i)] = position;
    }
}

score 0 · Accepted Answer

1バイトの場合、3つのテストを実行するだけで済みます。

if( (byte & 0x3F) == 0) { /* Found */ }
if( (byte & 0x7E) == 0) { /* Found */ }
if( (byte & 0xFC) == 0) { /* Found */ }

これをより広い値に拡張するのは比較的簡単なはずです。たとえば、次の場合uint32_t：

tempValue1 = value & 0x3F3F3F3F;
tempValue2 = value & 0x7E7E7E7E;
tempValue3 = value & 0xFCFCFCFC;
if( ( (uint8_t)tempValue1 == 0) { /* Found */ }
if( ( (uint8_t)tempValue2 == 0) { /* Found */ }
if( ( (uint8_t)tempValue3 == 0) { /* Found */ }
tempValue1 >>= 8;
tempValue2 >>= 8;
tempValue3 >>= 8;
if( ( (uint8_t)tempValue1 == 0) { /* Found */ }
if( ( (uint8_t)tempValue2 == 0) { /* Found */ }
if( ( (uint8_t)tempValue3 == 0) { /* Found */ }
tempValue1 >>= 8;
tempValue2 >>= 8;
tempValue3 >>= 8;
if( ( (uint8_t)tempValue1 == 0) { /* Found */ }
if( ( (uint8_t)tempValue2 == 0) { /* Found */ }
if( ( (uint8_t)tempValue3 == 0) { /* Found */ }
tempValue1 >>= 8;
tempValue2 >>= 8;
tempValue3 >>= 8;
if( ( (uint8_t)tempValue1 == 0) { /* Found */ }
if( ( (uint8_t)tempValue2 == 0) { /* Found */ }
if( ( (uint8_t)tempValue3 == 0) { /* Found */ }

上記のコードは、それを行うための最良の方法のようには見えません（おそらくそうではないため）。SSEでどのように実行できるかをわかりやすくするために、このように意図的に作成しました。SSEの場合、上記と同様です（ただし、幅が広くなります）。

ただし、SSEの場合、すべての比較を並行して実行し、多くのブランチを取り除くことができます。たとえば、3つのマスクのそれぞれとANDを使用してPCMPEQB3回使用し、次にこれらの結果をORしてから、1つ実行することができPMOVMSKBます。これにより、単一のでテストできる16ビット値（16バイトを表す-ソースバイトごとに1ビット）が得られますif(result_flags == 0) { /* None of the 16 bytes matched */ }。この最終テストは、「do/while」ループの最後で行うことができます。

score 0 · Accepted Answer

試してみましょう - どうですか:

class bitIterator{
    unsigned char* farray;
    int foffset;
public:
    bitIterator(unsigned char* array, int offset)
        :farray(array), foffset(offset)
    {}

    bool operator*() const {
        return (farray[foffset/8] >> (7 - foffset%8)) & 1;
    }

    bitIterator& operator++(){
        foffset++;
        return *this;
    }

    int offset() const {
        return foffset; 
    }

};

// Just to demonstrate how to call it
int find_first_n(unsigned char* buffer, int length, int n){
    return std::search_n(bitIterator(buffer, 0), bitIterator(buffer, length*8), n, 0).offset();
}

それはただの楽しみでした...

ここで、本当にパフォーマンスを絞り出したい場合は、次のことをお勧めします

int find_first_n(unsigned char* buffer, int length, int n){
    int prev_trail = 0;
    unsigned int* buf = reinterpret_cast<unsigned int*>(buffer);
    int len = length/sizeof(int); 
    // Last few bytes will need special treatment if your array is not multple of int-size length.
    // However last bytes should not influence overall performance of code - assuming it is used on rather long arrays.
    // If you plan using rather short arrays (20-50 bytes?) you will probably be better off just using plain char.
    for (int i=0; i<len; i++){
        if (!buf[i]) return i*sizeof(int)*8-prev_trail; // __builtin_clz and __builtin_ctz are undefined on zero ;
        // EDIT:
        if (!~buf[i]){
            prev_trail = 0;
            continue;
        }
        // END EDIT!


        int shft = __builtin_clz(buf[i]);
        if (shft + prev_trail >= n) return i*sizeof(int)*8-prev_trail; // Leading + previous trailing <= n
                                                           // Not enough leading zeros, continue search.
        prev_trail = __builtin_ctz(buf[i]); // Store it to use for next int in array
        unsigned int tmp =0;               
        while(shft < sizeof(int)*8-prev_trail){   // While we haven't got to trailing zeros in this uint
            tmp = buf[i] << shft;               // Shift-out leading zeros;
            shft += (__builtin_clz(~tmp));
            tmp = buf[i] << shft;               // Shift-out leading ones;

            lead_zeros = __builtin_clz(tmp);
            if (lead_zeros >= n)                // and see if have enough leading zeros.
                return i*sizeof(int)*8+shft;

            shft += lead_zeros;
        }
        return length*8; // Not found :(
    }

これはかなり読みにくいですが、概念は単純です。すべての int サイズのブロックを繰り返し処理し、前のブロックの先頭のゼロ + 後続のゼロが >= n であるかどうかを確認します。先頭のゼロと後続のゼロを繰り返しシフトアウトするだけでなく（ビットを設定）、後続のバイトに到達しない限り、後続のゼロ> = nを再度チェックします。

そして、もう1つのアイデア：

int find_first_n(unsigned char* buffer, int length, int n){
    union {
        unsigned char chr[2];
        unsigned int uit;
    };

    unsigned int onemask = 0x8000;
    for (int i = 1 ; i<n; i++) onemask = onemask | (onemask >> 1);

    int* masks = new int[8];
    for (int i = 0; i<8; i++){
        masks[i]=onemask >> i;
    }

    // generating masks - eg. for n == 3 would be:
    // [ 1110 0000 0000 0000 ]
    // [ 0111 0000 0000 0000 ]
    // [ 0011 1000 0000 0000 ]
    // [ 0001 1100 0000 0000 ]
    // [ ... ]
    // [ 0000 0001 1100 0000 ]


    uit = 0;
    for (int i=0; i<length-1; i++){
        chr[0] = buffer[i];
        chr[1] = buffer[i+1];
        for (int j=0; j<8; j++){
            if (!(uit & masks[j])) return i*8+j;
        }
    }
    chr[0] = buffer[length-1];
    chr[1] = 0xff; // Fill with ones at the end.
    for (int j=0; j<8; j++){
        if (!(uit & masks[j])) return (length-1)*8+j;
    }
    return length*8; // Not found :(
}

reinterpret_cast<unsigned short int*>(buffer[i])まだバッファ内にある変数へのポインタを word ( ) に直接キャストし、union を使用せずにマスクを適用したくなるかもしれません。ただし、そのような操作の半分はアライメントされていない変数を使用するため、パフォーマンスが低下する可能性があり、一部のプラットフォームでは例外が生成されることさえあることに注意してください。

score 0 · Accepted Answer

ここで、このコードを試してください。

int dataLen = 500;
char data[dataLen];
//Get the data in place with whatever is your algorithm.
int i,j;
unsigned int dataSample;
unsigned int mask;
for(i=0;i<dataLen-1;i++){
    dataSample=((unsigned int)(data[i+1]) << 8) | (unsigned int) (data[i]);
    mask=(1<<6) - 1 ; //6 consecutive 1's
    for(j=0;j<8;j++){
        if((dataSample & (mask << j)) == 0){
            printf("Found consecutive 6 zeros at byte %d offset %d\n",i,j);
            j+=5; // Followed by j++ makes it j+=6.
        }
    }
}

score 0 · Accepted Answer

ちょうど6 個の連続するゼロを探しているとします。次のようなコードスニペットを使用できます。

unsigned d1 = 0xff;
for (int i = 0; i < dataLen; i += 3) {
  unsigned d1 = (d1 << 24) | (data[i] << 16) | (data [i+1] << 8) | (data[i+2]);
  unsigned d2 = ~d1; // ones
  unsigned d3 = d2 & (d2 << 2) & (d2 << 4);
  unsigned d4 = d3 & (d3 << 1);
  if (!d4) continue;
  doSomethingWithTheSequence(i, d4);
}

d1前の実行の最後のバイトを 3 つの新しいバイトと結合します。したがって、データを 3 の倍数で反復処理できます。データを 16 ビットの原子量として扱うことができるため、2 の倍数を試すことができます。特に 64 ビットプラットフォームを使用している場合は、4 の倍数を試して、64 ビット数値で後続の計算を行うこともできます。または、バイトにまたがるゼロシーケンスの特別なケースを導入します。

d2ビットパターンを逆にします。これは、シフトによって人為的なゼロが導入されますが、人為的なゼロは導入されないため便利です。d3は、オフセット 0、2、および 4 で 3 つの一致する 1 を探します。d4次に、オフセットの別のビットを追加して、0 から 5 までのすべてのオフセットを結合します。したがってd4、に 6 つの連続するゼロがある場合に限り、は非ゼロになりd1ます。を使用__builtin_clzして、の最上位ビットの位置を特定できますd4。これは、のこれらの 6 ビットの最初の位置でもありますd1。そこからの位置を取得できますdata。

ループを追加してコンパイラがそれを最適化して取り除くことを期待するか、目的の実行の長さに適した方法で計算するインラインテンプレート関数を提供することで、コードを他の実行の長さに適合させることができますd4。d2

score 0 · Accepted Answer

これには、x86 のリトルエンディアンとアライメントされていないメモリアクセス機能を利用することでアプローチします。popcount で候補の単語を探し、__builtin_ctz を使用してループで位置を見つけます。これにより、コードから 1 つのループが排除され、単語が候補でない場合のビット検索ループが回避されます。ビッグエンディアンのマシンでは、 htons(*(unsigned short *)p) を使用してバイトを交換する必要があります。これには、アラインされていないワードアクセスを許可するマシンが必要です。また、配列の最後に余分な 0xFF バイトのフェンスが必要になります。

unsigned char *p;
unsigned short v;
int c;
for (p = data; p < data + sizeof(data); p++)
  if (__builtin_popcount(*(unsigned short *)p) <= 16 - desired)
  { for (v = *(unsigned short *)p, c = 0; __builtin_ctz(v) < desired; v>>=1, c++) ;
    if (c < 16) // Gotcha @ p starting at bit c
      break;
  }
if (c == 16 || p >= data + sizeof(data)) // didn't find
else // found

c++ - char 配列で最大 6 つの連続する 0 ビットを見つける最速の方法

9 に答える 9

Related

Reference