MMX SSE を同等の C コードに変換しています。ほとんど変換しましたが、画質が適切ではないか、画像にノイズが入っていることがわかります。過去 5 日間のコードをデバッグしていますが、その理由がわかりません。皆さんが問題を調べて私を助けてくれたら、とてもうれしいです。
元の SSE コード:
void unpack_8bit_to_16bit( __m128i *a, __m128i* b0, __m128i* b1 )
{
__m128i zero = _mm_setzero_si128();
b0 = _mm_unpacklo_epi8( a, zero );
b1 = _mm_unpackhi_epi8( a, zero );
}
void convolve_cols_3x3( const unsigned char* in, int16_t* out_v, int16_t* out_h, int w, int h )
{
using namespace std;
assert( w % 16 == 0 && "width must be multiple of 16!" );
const int w_chunk = w/16;
__m128i* i0 = (__m128i*)( in );
__m128i* i1 = (__m128i*)( in ) + w_chunk*1;
__m128i* i2 = (__m128i*)( in ) + w_chunk*2;
__m128i* result_h = (__m128i*)( out_h ) + 2*w_chunk;
__m128i* result_v = (__m128i*)( out_v ) + 2*w_chunk;
__m128i* end_input = (__m128i*)( in ) + w_chunk*h;
for( ; i2 != end_input; i0++, i1++, i2++, result_v+=2, result_h+=2 )
{
*result_h = _mm_setzero_si128();
*(result_h+1) = _mm_setzero_si128();
*result_v = _mm_setzero_si128();
*(result_v+1) = _mm_setzero_si128();
__m128i ilo, ihi;
unpack_8bit_to_16bit( *i0, ihi, ilo );
*result_h = _mm_add_epi16( ihi, *result_h );
*(result_h+1) = _mm_add_epi16( ilo, *(result_h+1) );
*result_v = _mm_add_epi16( *result_v, ihi );
*(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
unpack_8bit_to_16bit( *i1, ihi, ilo );
*result_v = _mm_add_epi16( *result_v, ihi );
*(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
*result_v = _mm_add_epi16( *result_v, ihi );
*(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
unpack_8bit_to_16bit( *i2, ihi, ilo );
*result_h = _mm_sub_epi16( *result_h, ihi );
*(result_h+1) = _mm_sub_epi16( *(result_h+1), ilo );
*result_v = _mm_add_epi16( *result_v, ihi );
*(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
}
}
変換したコードを以下に示します
void convolve_cols_3x3( const unsigned char* in, int16_t* out_v, int16_t* out_h, int w, int h )
{
using namespace std;
assert( w % 16 == 0 && "width must be multiple of 16!" );
const int w_chunk = w/16;
uint8_t* i0 = (uint8_t*)( in );
uint8_t* i1 = (uint8_t*)( in ) + w_chunk*1*16;
uint8_t* i2 = (uint8_t*)( in ) + w_chunk*2*16;
int16_t* result_h = (int16_t*)( out_h ) + 2*w_chunk*16;
int16_t* result_v = (int16_t*)( out_v ) + 2*w_chunk*16;
uint8_t* end_input = (uint8_t*)( in ) + w_chunk*h*16;
for( ; i2 != end_input; i0+= 16, i1+= 16, i2+= 16, result_v+= 16, result_h+= 16 )
{
for (int i=0; i<8;i++)
{
result_h[i] = 0;
result_h[i + 8] = 0;
result_v[i] = 0;
result_v[i + 8] = 0;
result_h[i] = (int16_t)(i0[i]) + result_h[i] ;
result_h[i + 8] = (int16_t)(i0[i + 8]) + result_h[i + 8] ;
result_v[i] = (int16_t)(i0[i]) + result_v[i] ;
result_v[i + 8] = (int16_t)(i0[i + 8]) + result_v[i + 8] ;
result_v[i] = (int16_t)(i1[i]) + result_v[i] ;
result_v[i + 8] = (int16_t)(i1[i + 8]) + result_v[i + 8] ;
result_v[i] = (int16_t)(i1[i]) + result_v[i] ;
result_v[i + 8] = (int16_t)(i1[i + 8]) + result_v[i + 8] ;
result_h[i] = result_h[i] - (int16_t)(i2[i]);
result_h[i + 8] = result_h[i + 8] - (int16_t)(i2[i + 8]);
result_v[i] = (int16_t)(i2[i]) + result_v[i] ;
result_v[i + 8] = (int16_t)(i2[i + 8]) + result_v[i + 8] ;
}
}
}
コードが読みにくい場合は申し訳ありません。幅と高さを表しますw
。とは、後で他の目的に使用される 2 つのパラメーターです。h
out_h
out_v