Intrinsic kullanmaya alışmak için bir alıştırma olarak, her seferinde yalnızca tek bir filtrelenmiş örnek üreten, makul ölçüde hızlı bir bilinear filtreleme fonksiyonu yazmaya çalışıyorum; şimdilik SSE4.1'e kadar olan komut setlerini kullanmak benim için sorun değil.
Başlık: SSE4.1 intrinsics ile bilinear filtre
/*
 * Divides each of the eight unsigned 16-bit lanes of `value` by 255, rounded
 * to nearest, using Jim Blinn's trick: (x + 128 + ((x + 128) >> 8)) >> 8.
 *
 * Lanes must hold values in 0..65025 (255 * 255) so that neither addition
 * overflows 16 bits. The shifts are logical (srli) because the lanes are
 * unsigned: an arithmetic shift would smear the top bit of any lane >= 32768.
 *
 * static: a plain C `inline` definition emits no out-of-line body, so a call
 * that the compiler chooses not to inline would fail to link.
 */
static inline __m128i DivideBy255_8xUint16(const __m128i value)
{
    const __m128i biased = _mm_add_epi16(value, _mm_set1_epi16(128));
    const __m128i correction = _mm_srli_epi16(biased, 8);
    return _mm_srli_epi16(_mm_add_epi16(biased, correction), 8);
}

/*
 * Returns one bilinearly filtered 32-bit texel (four 8-bit channels) sampled
 * at texel coordinates (u, v).
 *
 * data   - first byte of the image; texels are 4 bytes each
 * pitch  - byte distance between consecutive rows
 * width  - addressable texels per row (must be >= 1)
 * height - addressable rows (must be >= 1)
 * u, v   - sample position in texel units. Negative values clamp to 0. The
 *          integer part is clamped to width - 1 / height - 1; the fractional
 *          part is kept as-is, so positions past the last texel blend with
 *          the padding column/row.
 *
 * The 2x2 footprint is fetched with two 8-byte loads at (u0, v0) and
 * (u0, v0 + 1), so the image must provide one extra column and one extra row
 * beyond width x height.
 */
static inline uint32_t BilinearSSE41(const uint8_t* data, uint32_t pitch, uint32_t width, uint32_t height, float u, float v)
{
    // pshufb control bytes: an index with the high bit set writes zero.
    const char zero_byte = (char)0x80;
    // After the float->int conversion fraction_u*255 lives in byte 0 (32-bit
    // lane 0) and fraction_v*255 in byte 4 (32-bit lane 1). Each mask splats
    // that byte into the low half of all eight 16-bit words.
    const __m128i splat_fraction_u_mask = _mm_set_epi8(zero_byte, 0, zero_byte, 0, zero_byte, 0, zero_byte, 0,
                                                       zero_byte, 0, zero_byte, 0, zero_byte, 0, zero_byte, 0);
    const __m128i splat_fraction_v_mask = _mm_set_epi8(zero_byte, 4, zero_byte, 4, zero_byte, 4, zero_byte, 4,
                                                       zero_byte, 4, zero_byte, 4, zero_byte, 4, zero_byte, 4);

    // Only the two low lanes (u, v) carry data; the upper two stay zero.
    const __m128i max_index = _mm_set_epi32(0, 0, (int)(height - 1), (int)(width - 1));

    // Clamp below zero in the float domain so that both the integer part and
    // the fraction become 0 for negative coordinates.
    const __m128 uv = _mm_max_ps(_mm_set_ps(0.0f, 0.0f, v, u), _mm_setzero_ps());
    const __m128 floor_uv_f = _mm_floor_ps(uv);
    const __m128 fraction_uv_f = _mm_sub_ps(uv, floor_uv_f);

    // Fixed-point weights in 0..254 (the conversion truncates). A weight and
    // its complement always sum to 255.
    const __m128i fraction255_uv_i = _mm_cvttps_epi32(_mm_mul_ps(fraction_uv_f, _mm_set_ps1(255.0f)));
    const __m128i weight_u_far = _mm_shuffle_epi8(fraction255_uv_i, splat_fraction_u_mask);
    const __m128i weight_v_far = _mm_shuffle_epi8(fraction255_uv_i, splat_fraction_v_mask);
    const __m128i weight_u_near = _mm_sub_epi16(_mm_set1_epi16(255), weight_u_far);
    const __m128i weight_v_near = _mm_sub_epi16(_mm_set1_epi16(255), weight_v_far);

    // Unsigned min: the inputs are non-negative here, and a float->int
    // overflow (0x80000000) compares as "huge" and clamps to the last index.
    const __m128i floor_uv_i = _mm_cvttps_epi32(floor_uv_f);
    const __m128i clamped_uv_i = _mm_min_epu32(floor_uv_i, max_index);
    const uintptr_t u0 = (uint32_t)_mm_extract_epi32(clamped_uv_i, 0);
    const uintptr_t v0 = (uint32_t)_mm_extract_epi32(clamped_uv_i, 1);
    const uint8_t* row = data + (u0 << 2) + (uintptr_t)pitch * v0;

    // Load two adjacent texels from each row and widen every channel to
    // 16 bits: words 0-3 hold the left texel, words 4-7 the right texel.
    const __m128i row0 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*)row));
    const __m128i row1 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*)(row + pitch)));

    // Vertical lerp. The weighted sum is at most 255 * 255, so it fits in
    // 16 bits and a single rounded divide is enough.
    const __m128i vsum = _mm_add_epi16(_mm_mullo_epi16(row0, weight_v_near),
                                       _mm_mullo_epi16(row1, weight_v_far));
    const __m128i vlerp = DivideBy255_8xUint16(vsum);

    // Horizontal lerp. Shift the weighted right texel down by 8 bytes so it
    // lines up with the weighted left texel in words 0-3.
    const __m128i left = _mm_mullo_epi16(vlerp, weight_u_near);
    const __m128i right = _mm_srli_si128(_mm_mullo_epi16(vlerp, weight_u_far), 8);
    const __m128i hlerp = DivideBy255_8xUint16(_mm_add_epi16(left, right));

    // Pack the 16-bit channels back to 8 bits; the low 32 bits are the texel.
    return (uint32_t)_mm_cvtsi128_si32(_mm_packus_epi16(hlerp, hlerp));
}
Kod, görüntü verisinin ARGB8 biçiminde olduğunu ve kenar durumlarını dallanmaya gerek kalmadan ele alabilmek için fazladan bir sütun ile bir satır içerdiğini varsayar.
Şimdiye kadar şu var.
Bu çirkin karmaşanın boyutunu azaltmak için hangi komutları kullanabileceğime ve elbette daha hızlı çalışması için kodun nasıl geliştirilebileceğine dair önerilerinizi bekliyorum!
Teşekkürler :)