スケーリングやスキューなどの画像変換にバイキュービック補間を実装しようとしていますが、画像出力が不正確に見えます。補間されたピクセルが 255 を超えてオーバーフローすることがあります。
コードは次のとおりです。
#include <algorithm>
#include <sal.h>
#define ASSERT _ASSERTE
template<typename T>
class bicubic_sampler
{
// use to offset into int 4x4 array to get individual channel
#pragma region constants
static unsigned constexpr x00 = 0;
static unsigned constexpr x01 = 4;
static unsigned constexpr x02 = 8;
static unsigned constexpr x03 = 12;
static unsigned constexpr x10 = 16;
static unsigned constexpr x11 = 20;
static unsigned constexpr x12 = 24;
static unsigned constexpr x13 = 28;
static unsigned constexpr x20 = 32;
static unsigned constexpr x21 = 36;
static unsigned constexpr x22 = 40;
static unsigned constexpr x23 = 44;
static unsigned constexpr x30 = 48;
static unsigned constexpr x31 = 52;
static unsigned constexpr x32 = 56;
static unsigned constexpr x33 = 60;
#pragma endregion
T a00, a01, a02, a03,
a10, a11, a12, a13,
a20, a21, a22, a23,
a30, a31, a32, a33;
public:
void sample(_In_ UINT32(&pix)[4][4], _In_ T x, _In_ T y, _Inout_ BYTE(&output)[4])
{
auto channel_count = 3u; // skip alpha
for (auto c = 0; c != channel_count; ++c)
{
auto p = reinterpret_cast<BYTE*>(pix) + c;
a00 = p[x11];
a01 = -.5*p[x10] + .5*p[x12];
a02 = p[x10] - 2.5*p[x11] + 2 * p[x12] - .5*p[x13];
a03 = -.5*p[x10] + 1.5*p[x11] - 1.5*p[x12] + .5*p[x13];
a10 = -.5*p[x01] + .5*p[x21];
a11 = .25*p[x00] - .25*p[x02] - .25*p[x20] + .25*p[x22];
a12 = -.5*p[x00] + 1.25*p[x01] - p[x02] + .25*p[x03] + .5*p[x20] - 1.25*p[x21] + p[x22] - .25*p[x23];
a13 = .25*p[x00] - .75*p[x01] + .75*p[x02] - .25*p[x03] - .25*p[x20] + .75*p[x21] - .75*p[x22] + .25*p[x23];
a20 = p[x01] - 2.5*p[x11] + 2 * p[x21] - .5*p[x31];
a21 = -.5*p[x00] + .5*p[x02] + 1.25*p[x10] - 1.25*p[x12] - p[x20] + p[x22] + .25*p[x30] - .25*p[x32];
a22 = p[x00] - 2.5*p[x01] + 2 * p[x02] - .5*p[x03] - 2.5*p[x10] + 6.25*p[x11] - 5 * p[x12] + 1.25*p[x13] + 2 * p[x20] - 5 * p[x21] + 4 * p[x22] - p[x23] - .5*p[x30] + 1.25*p[x31] - p[x32] + .25*p[x33];
a23 = -.5*p[x00] + 1.5*p[x01] - 1.5*p[x02] + .5*p[x03] + 1.25*p[x10] - 3.75*p[x11] + 3.75*p[x12] - 1.25*p[x13] - p[x20] + 3 * p[x21] - 3 * p[x22] + p[x23] + .25*p[x30] - .75*p[x31] + .75*p[x32] - .25*p[x33];
a30 = -.5*p[x01] + 1.5*p[x11] - 1.5*p[x21] + .5*p[x31];
a31 = .25*p[x00] - .25*p[x02] - .75*p[x10] + .75*p[x12] + .75*p[x20] - .75*p[x22] - .25*p[x30] + .25*p[x32];
a32 = -.5*p[x00] + 1.25*p[x01] - p[x02] + .25*p[x03] + 1.5*p[x10] - 3.75*p[x11] + 3 * p[x12] - .75*p[x13] - 1.5*p[x20] + 3.75*p[x21] - 3 * p[x22] + .75*p[x23] + .5*p[x30] - 1.25*p[x31] + p[x32] - .25*p[x33];
a33 = .25*p[x00] - .75*p[x01] + .75*p[x02] - .25*p[x03] - .75*p[x10] + 2.25*p[x11] - 2.25*p[x12] + .75*p[x13] + .75*p[x20] - 2.25*p[x21] + 2.25*p[x22] - .75*p[x23] - .25*p[x30] + .75*p[x31] - .75*p[x32] + .25*p[x33];
auto x2 = x * x;
auto x3 = x2 * x;
auto y2 = y * y;
auto y3 = y2 * y;
auto dd = (a00 + a01 * y + a02 * y2 + a03 * y3) +
(a10 + a11 * y + a12 * y2 + a13 * y3) * x +
(a20 + a21 * y + a22 * y2 + a23 * y3) * x2 +
(a30 + a31 * y + a32 * y2 + a33 * y3) * x3;
//ASSERT(dd <= 0xff); // this is overflowing beyond 255
auto finalValue = (std::min)(255.0, dd);
output[c] = static_cast<BYTE>(finalValue);
}
}
};
template<typename T, typename Matrix>
void transform_pixels(_In_ T* src, _Inout_ T* dest, _In_ const int width, _In_ const int height, _In_ const Matrix & mat)
{
auto bc_sampler = bicubic_sampler<double>{};
const ptrdiff_t channelCount = 4;
for (auto y = 0; y != height; ++y)
{
for (auto x = 0; x != width; ++x)
{
auto p0 = point<double>(x, y); //original point
auto p = transform_point(mat, p0); // calculate the tranform point after applying matrix mul, like scale, skewing, rotation
auto pf = point < std::ptrdiff_t >(pt_floor(p));
auto frac = point < double >{ p.x - pf.x, p.y - pf.y };
if (pf.x < 0 || pf.y < 0 || pf.x >= width || pf.y >= height)
{
continue;
}
BYTE mp[4]{}; // one pixel transformed output
auto loc = (src + (pf.y * width + pf.x) * channelCount);
auto stride = width * channelCount;
if (pf.x - 1 >= 0 && pf.y - 1 >= 0 && pf.x + 2 < width && pf.y + 2 < width)
{
UINT32 neig4x4[4][4] = {};
// store the 16 neighbours
neig4x4[0][0] = *reinterpret_cast<INT32*>(loc - (1 * stride) - channelCount);
neig4x4[0][1] = *reinterpret_cast<INT32*>(loc - (1 * stride));
neig4x4[0][2] = *reinterpret_cast<INT32*>(loc - (1 * stride) + channelCount);
neig4x4[0][3] = *reinterpret_cast<INT32*>(loc - (1 * stride) + 2 * channelCount);
neig4x4[1][0] = *reinterpret_cast<INT32*>(loc + (1 * stride) - channelCount);
neig4x4[1][1] = *reinterpret_cast<INT32*>(loc + (1 * stride));
neig4x4[1][2] = *reinterpret_cast<INT32*>(loc + (1 * stride) + channelCount);
neig4x4[1][3] = *reinterpret_cast<INT32*>(loc + (1 * stride) + 2 * channelCount);
neig4x4[2][0] = *reinterpret_cast<INT32*>(loc + (2 * stride) - channelCount);
neig4x4[2][1] = *reinterpret_cast<INT32*>(loc + (2 * stride));
neig4x4[2][2] = *reinterpret_cast<INT32*>(loc + (2 * stride) + channelCount);
neig4x4[2][3] = *reinterpret_cast<INT32*>(loc + (2 * stride) + 2 * channelCount);
neig4x4[3][0] = *reinterpret_cast<INT32*>(loc + (3 * stride) - channelCount);
neig4x4[3][1] = *reinterpret_cast<INT32*>(loc + (3 * stride));
neig4x4[3][2] = *reinterpret_cast<INT32*>(loc + (3 * stride) + channelCount);
neig4x4[3][3] = *reinterpret_cast<INT32*>(loc + (3 * stride) + 2 * channelCount);
// mp is interoplated pixel
bc_sampler.sample(neig4x4, frac.x, frac.y, mp);
auto dst = reinterpret_cast<void*>(dest + (y * width + x) * channelCount); // at what location in dest to copy pixel from the source
memcpy_s(dst, channelCount, mp, channelCount);
}
else
{
auto dst = reinterpret_cast<void*>(dest + (y * width + x) * channelCount);
memcpy_s(dst, 4, loc, 4); // copy the original
}
} // for x
} // for y
}
出力はズーム変換用です
バイリニア出力:
バイキュービック出力