//*@@@+++@@@@******************************************************************
//
// Copyright © Microsoft Corp.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// • Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// • Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
//*@@@---@@@@******************************************************************
#include "strcodec.h"
#include "decode.h"
#if defined(WMP_OPT_SSE2)
#include <emmintrin.h>
#include <windows.h>
//================================================================
static __m128i g_const_d0;
static __m128i g_const_d1;
__m128i g_const_d3;
__m128i g_const_d4;
__m128i g_const_d0x80;
__m128i g_const_w0x80;
__m128i g_const_b0x80;
//================================================================
#if defined(WMP_OPT_CC_DEC)
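//
// storeRGB24_5: lossless fast path for one 8-scanline band of a macroblock
// row. pbYCoCg points at the Y plane of the decoded macroblock buffer,
// cbYCoCg is the byte distance between channel planes (pbU - pbY), pbRGB and
// cbRGB describe the output scanlines, and cmb is the macroblock count of
// the row. The prologue aligns esp to 128 bytes for the movdqa scratch area;
// each loop iteration color-converts two 4x4 blocks and splices the
// resulting 32-bit RGBX words into packed 24-bit RGB with shld. The "ICC"
// sections are the reversible inverse color conversion; in scalar form
// (a reading aid echoing the asm's own comments, not part of the build):
//
//     g -= r >> 1;
//     r -= ((b + 1) >> 1) - g;
//     b += r;
//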
__declspec(naked) void __stdcall storeRGB24_5(
U8* pbYCoCg,
size_t cbYCoCg,
const U8* pbRGB,
size_t cbRGB,
size_t cmb)
{
#define DISP 8
UNREFERENCED_PARAMETER( pbYCoCg );
UNREFERENCED_PARAMETER( cbYCoCg );
UNREFERENCED_PARAMETER( pbRGB );
UNREFERENCED_PARAMETER( cbRGB );
UNREFERENCED_PARAMETER( cmb );
__asm {
push ebp
push ebx
push esi
push edi
mov ebx, [esp + 36] // $ebx = cmb
mov edi, [esp + 28] // $edi = pbRGB
lea ebx, [ebx + ebx * 2] // $ebx = cmb * 3
mov edx, [esp + 32] // $edx = cbRGB
shl ebx, 4 // $ebx = cmb * 3 * 16
mov esi, [esp + 20] // $esi = pbYCoCg
add edi, ebx // $edi = pbRGB + 3 * 16 * cmb
mov ebp, [esp + 24] // $ebp = cbYCoCg
neg ebx
mov eax, esp
and esp, 0xffffff80
sub esp, 64 * 4 + DISP
mov [esp], eax // original $esp
mov [esp + 4], edi
}
Loop0:
__asm {
mov edi, [esp + 4] // $edi = pbRGB + 3 * 16 * cmb
// first 8 pixels
pxor xmm1, xmm1
pxor xmm5, xmm5
movdqa xmm0, [esi]
movdqa xmm4, [esi + 16]
psubd xmm1, [esi + ebp]
psubd xmm5, [esi + ebp + 16]
movdqa xmm2, [esi + ebp * 2]
movdqa xmm6, [esi + ebp * 2 + 16]
paddd xmm0, [g_const_d0x80]
paddd xmm4, [g_const_d0x80]
// ICC
movdqa xmm3, xmm1 // g -= r >> 1
movdqa xmm7, xmm5
psrad xmm3, 1
psrad xmm7, 1
psubd xmm0, xmm3
psubd xmm4, xmm7
movdqa xmm3, [g_const_d1] // r -= ((b + 1) >> 1) - g
movdqa xmm7, [g_const_d1]
paddd xmm3, xmm2
paddd xmm7, xmm6
paddd xmm1, xmm0
paddd xmm5, xmm4
psrad xmm3, 1
psrad xmm7, 1
psubd xmm1, xmm3
psubd xmm5, xmm7
paddd xmm2, xmm1 // b += r
paddd xmm6, xmm5
pslld xmm0, 8
pslld xmm2, 16
pslld xmm4, 8
pslld xmm6, 16
por xmm0, xmm1
por xmm4, xmm5
por xmm0, xmm2
por xmm4, xmm6
movdqa [esp + DISP + 64 * 0 + 16 * 0], xmm0
pslld xmm0, 8
movdqa [esp + DISP + 64 * 0 + 16 * 1], xmm4
pslld xmm4, 8
movdqa [esp + DISP + 64 * 0 + 16 * 2], xmm0
movdqa [esp + DISP + 64 * 0 + 16 * 3], xmm4
// second 8 pixels
pxor xmm1, xmm1
pxor xmm5, xmm5
movdqa xmm0, [esi + 32]
movdqa xmm4, [esi + 48]
psubd xmm1, [esi + ebp + 32]
psubd xmm5, [esi + ebp + 48]
movdqa xmm2, [esi + ebp * 2 + 32]
movdqa xmm6, [esi + ebp * 2 + 48]
paddd xmm0, [g_const_d0x80]
paddd xmm4, [g_const_d0x80]
// ICC
movdqa xmm3, xmm1 // g -= r >> 1
movdqa xmm7, xmm5
psrad xmm3, 1
psrad xmm7, 1
psubd xmm0, xmm3
psubd xmm4, xmm7
movdqa xmm3, [g_const_d1] // r -= ((b + 1) >> 1) - g
movdqa xmm7, [g_const_d1]
paddd xmm3, xmm2
paddd xmm7, xmm6
paddd xmm1, xmm0
paddd xmm5, xmm4
psrad xmm3, 1
psrad xmm7, 1
psubd xmm1, xmm3
psubd xmm5, xmm7
paddd xmm2, xmm1 // b += r
paddd xmm6, xmm5
pslld xmm0, 8
pslld xmm2, 16
pslld xmm4, 8
pslld xmm6, 16
por xmm0, xmm1
por xmm4, xmm5
por xmm0, xmm2
por xmm4, xmm6
movdqa [esp + DISP + 64 * 1 + 16 * 0], xmm0
pslld xmm0, 8
movdqa [esp + DISP + 64 * 1 + 16 * 1], xmm4
pslld xmm4, 8
movdqa [esp + DISP + 64 * 1 + 16 * 2], xmm0
movdqa [esp + DISP + 64 * 1 + 16 * 3], xmm4
//================
add esi, 64
// first 8 pixels
pxor xmm1, xmm1
pxor xmm5, xmm5
movdqa xmm0, [esi]
movdqa xmm4, [esi + 16]
psubd xmm1, [esi + ebp]
psubd xmm5, [esi + ebp + 16]
movdqa xmm2, [esi + ebp * 2]
movdqa xmm6, [esi + ebp * 2 + 16]
paddd xmm0, [g_const_d0x80]
paddd xmm4, [g_const_d0x80]
// ICC
movdqa xmm3, xmm1 // g -= r >> 1
movdqa xmm7, xmm5
psrad xmm3, 1
psrad xmm7, 1
psubd xmm0, xmm3
psubd xmm4, xmm7
movdqa xmm3, [g_const_d1] // r -= ((b + 1) >> 1) - g
movdqa xmm7, [g_const_d1]
paddd xmm3, xmm2
paddd xmm7, xmm6
paddd xmm1, xmm0
paddd xmm5, xmm4
psrad xmm3, 1
psrad xmm7, 1
psubd xmm1, xmm3
psubd xmm5, xmm7
paddd xmm2, xmm1 // b += r
paddd xmm6, xmm5
pslld xmm0, 8
pslld xmm2, 16
pslld xmm4, 8
pslld xmm6, 16
por xmm0, xmm1
por xmm4, xmm5
por xmm0, xmm2
por xmm4, xmm6
movdqa [esp + DISP + 64 * 2 + 16 * 0], xmm0
pslld xmm0, 8
movdqa [esp + DISP + 64 * 2 + 16 * 1], xmm4
pslld xmm4, 8
movdqa [esp + DISP + 64 * 2 + 16 * 2], xmm0
movdqa [esp + DISP + 64 * 2 + 16 * 3], xmm4
// second 8 pixels
pxor xmm1, xmm1
pxor xmm5, xmm5
movdqa xmm0, [esi + 32]
movdqa xmm4, [esi + 48]
psubd xmm1, [esi + ebp + 32]
psubd xmm5, [esi + ebp + 48]
movdqa xmm2, [esi + ebp * 2 + 32]
movdqa xmm6, [esi + ebp * 2 + 48]
paddd xmm0, [g_const_d0x80]
paddd xmm4, [g_const_d0x80]
// ICC
movdqa xmm3, xmm1 // g -= r >> 1
movdqa xmm7, xmm5
psrad xmm3, 1
psrad xmm7, 1
psubd xmm0, xmm3
psubd xmm4, xmm7
movdqa xmm3, [g_const_d1] // r -= ((b + 1) >> 1) - g
movdqa xmm7, [g_const_d1]
paddd xmm3, xmm2
paddd xmm7, xmm6
paddd xmm1, xmm0
paddd xmm5, xmm4
psrad xmm3, 1
psrad xmm7, 1
psubd xmm1, xmm3
psubd xmm5, xmm7
paddd xmm2, xmm1 // b += r
paddd xmm6, xmm5
pslld xmm0, 8
pslld xmm2, 16
pslld xmm4, 8
pslld xmm6, 16
por xmm0, xmm1
por xmm4, xmm5
por xmm0, xmm2
por xmm4, xmm6
movdqa [esp + DISP + 64 * 3 + 16 * 0], xmm0
pslld xmm0, 8
movdqa [esp + DISP + 64 * 3 + 16 * 1], xmm4
pslld xmm4, 8
movdqa [esp + DISP + 64 * 3 + 16 * 2], xmm0
movdqa [esp + DISP + 64 * 3 + 16 * 3], xmm4
//================================
// RGBX32 -> RGB24
mov eax, [esp + DISP + 64 * 0 + 4] // ..B1G1R1
mov ecx, [esp + DISP + 64 * 0 + 32] // B0G0R0..
shld eax, ecx, 24 // R1B0G0R0
mov [edi + ebx + 0], eax
mov eax, [esp + DISP + 64 * 0 + 20] // ..B5G5R5
mov ecx, [esp + DISP + 64 * 0 + 36] // B1G1R1..
shld eax, ecx, 16 // G5R5B1G1
mov [edi + ebx + 4], eax
mov eax, [esp + DISP + 64 * 0 + 16] // ..B4G4R4
mov ecx, [esp + DISP + 64 * 0 + 52] // B5G5R5..
shld eax, ecx, 8 // B4G4R4B5
mov [edi + ebx + 8], eax
add edi, edx // $edi = pbRGB += cbRGB
mov eax, [esp + DISP + 64 * 0 + 4 + 8] // ..B3G3R3
mov ecx, [esp + DISP + 64 * 0 + 32 + 8] // B2G2R2..
shld eax, ecx, 24 // R3B2G2R2
mov [edi + ebx + 0], eax
mov eax, [esp + DISP + 64 * 0 + 20 + 8] // ..B7G7R7
mov ecx, [esp + DISP + 64 * 0 + 36 + 8] // B3G3R3..
shld eax, ecx, 16 // G7R7B3G3
mov [edi + ebx + 4], eax
mov eax, [esp + DISP + 64 * 0 + 16 + 8] // ..B6G6R6
mov ecx, [esp + DISP + 64 * 0 + 52 + 8] // B7G7R7..
shld eax, ecx, 8 // B6G6R6B7
mov [edi + ebx + 8], eax
add edi, edx // $edi = pbRGB += cbRGB
// RGBX32 -> RGB24
mov eax, [esp + DISP + 64 * 1 + 4 + 8] // ..B3G3R3
mov ecx, [esp + DISP + 64 * 1 + 32 + 8] // B2G2R2..
shld eax, ecx, 24 // R3B2G2R2
mov [edi + ebx + 0], eax
mov eax, [esp + DISP + 64 * 1 + 20 + 8] // ..B7G7R7
mov ecx, [esp + DISP + 64 * 1 + 36 + 8] // B3G3R3..
shld eax, ecx, 16 // G7R7B3G3
mov [edi + ebx + 4], eax
mov eax, [esp + DISP + 64 * 1 + 16 + 8] // ..B6G6R6
mov ecx, [esp + DISP + 64 * 1 + 52 + 8] // B7G7R7..
shld eax, ecx, 8 // B6G6R6B7
mov [edi + ebx + 8], eax
add edi, edx // $edi = pbRGB += cbRGB
mov eax, [esp + DISP + 64 * 1 + 4] // ..B1G1R1
mov ecx, [esp + DISP + 64 * 1 + 32] // B0G0R0..
shld eax, ecx, 24 // R1B0G0R0
mov [edi + ebx + 0], eax
mov eax, [esp + DISP + 64 * 1 + 20] // ..B5G5R5
mov ecx, [esp + DISP + 64 * 1 + 36] // B1G1R1..
shld eax, ecx, 16 // G5R5B1G1
mov [edi + ebx + 4], eax
mov eax, [esp + DISP + 64 * 1 + 16] // ..B4G4R4
mov ecx, [esp + DISP + 64 * 1 + 52] // B5G5R5..
shld eax, ecx, 8 // B4G4R4B5
mov [edi + ebx + 8], eax
add edi, edx // $edi = pbRGB += cbRGB
// RGBX32 -> RGB24
mov eax, [esp + DISP + 64 * 2 + 4] // ..B1G1R1
mov ecx, [esp + DISP + 64 * 2 + 32] // B0G0R0..
shld eax, ecx, 24 // R1B0G0R0
mov [edi + ebx + 0], eax
mov eax, [esp + DISP + 64 * 2 + 20] // ..B5G5R5
mov ecx, [esp + DISP + 64 * 2 + 36] // B1G1R1..
shld eax, ecx, 16 // G5R5B1G1
mov [edi + ebx + 4], eax
mov eax, [esp + DISP + 64 * 2 + 16] // ..B4G4R4
mov ecx, [esp + DISP + 64 * 2 + 52] // B5G5R5..
shld eax, ecx, 8 // B4G4R4B5
mov [edi + ebx + 8], eax
add edi, edx // $edi = pbRGB += cbRGB
mov eax, [esp + DISP + 64 * 2 + 4 + 8] // ..B3G3R3
mov ecx, [esp + DISP + 64 * 2 + 32 + 8] // B2G2R2..
shld eax, ecx, 24 // R3B2G2R2
mov [edi + ebx + 0], eax
mov eax, [esp + DISP + 64 * 2 + 20 + 8] // ..B7G7R7
mov ecx, [esp + DISP + 64 * 2 + 36 + 8] // B3G3R3..
shld eax, ecx, 16 // G7R7B3G3
mov [edi + ebx + 4], eax
mov eax, [esp + DISP + 64 * 2 + 16 + 8] // ..B6G6R6
mov ecx, [esp + DISP + 64 * 2 + 52 + 8] // B7G7R7..
shld eax, ecx, 8 // B6G6R6B7
mov [edi + ebx + 8], eax
add edi, edx // $edi = pbRGB += cbRGB
// RGBX32 -> RGB24
mov eax, [esp + DISP + 64 * 3 + 4 + 8] // ..B3G3R3
mov ecx, [esp + DISP + 64 * 3 + 32 + 8] // B2G2R2..
shld eax, ecx, 24 // R3B2G2R2
mov [edi + ebx + 0], eax
mov eax, [esp + DISP + 64 * 3 + 20 + 8] // ..B7G7R7
mov ecx, [esp + DISP + 64 * 3 + 36 + 8] // B3G3R3..
shld eax, ecx, 16 // G7R7B3G3
mov [edi + ebx + 4], eax
mov eax, [esp + DISP + 64 * 3 + 16 + 8] // ..B6G6R6
mov ecx, [esp + DISP + 64 * 3 + 52 + 8] // B7G7R7..
shld eax, ecx, 8 // B6G6R6B7
mov [edi + ebx + 8], eax
add edi, edx // $edi = pbRGB += cbRGB
mov eax, [esp + DISP + 64 * 3 + 4] // ..B1G1R1
mov ecx, [esp + DISP + 64 * 3 + 32] // B0G0R0..
shld eax, ecx, 24 // R1B0G0R0
mov [edi + ebx + 0], eax
mov eax, [esp + DISP + 64 * 3 + 20] // ..B5G5R5
mov ecx, [esp + DISP + 64 * 3 + 36] // B1G1R1..
shld eax, ecx, 16 // G5R5B1G1
mov [edi + ebx + 4], eax
mov eax, [esp + DISP + 64 * 3 + 16] // ..B4G4R4
mov ecx, [esp + DISP + 64 * 3 + 52] // B5G5R5..
shld eax, ecx, 8 // B4G4R4B5
mov [edi + ebx + 8], eax
//================================
add esi, 256 - 64
add ebx, 12
jnz Loop0
//================
pop esp
pop edi
pop esi
pop ebx
pop ebp
ret 20
}
}
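// outputMBRow_RGB24_Lossless_1: pSC->Load hook for the unscaled (lossless
// arithmetic) path. Emits one macroblock row (16 scanlines) of the image as
// packed RGB24 by calling storeRGB24_5 twice: once for scanlines 0-7 and
// once for scanlines 8-15. The asserts pin down the preconditions checked in
// StrDecOpt before this hook is installed.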
Int outputMBRow_RGB24_Lossless_1(CWMImageStrCodec* pSC)
{
    const size_t cbRGB = pSC->WMIBI.cbStride;
    const U8* const pbRGB = (U8*)pSC->WMIBI.pv + cbRGB * (pSC->cRow - 1) * 16;
    U8* const pbY = (U8*)pSC->a0MBbuffer[0];
    U8* const pbU = (U8*)pSC->a0MBbuffer[1];
    // U8* const pbV = (U8*)pSC->a0MBbuffer[2];
    const size_t cmbColumn = (pSC->WMII.cWidth + 15) / 16;

    assert(BD_8 == pSC->WMII.bdBitDepth);
    assert(CF_RGB == pSC->WMII.cfColorFormat);
    assert(24 == pSC->WMII.cBitsPerUnit);
    assert(pSC->WMII.bRGB);
    assert(O_NONE == pSC->WMII.oOrientation);
    assert(YUV_444 == pSC->m_param.cfColorFormat);
    assert(!pSC->m_param.bScaledArith);
    assert(pSC->m_Dparam->bDecodeFullFrame);

    storeRGB24_5(pbY + 64 * 0, pbU - pbY, pbRGB + cbRGB * 0, cbRGB, cmbColumn);
    storeRGB24_5(pbY + 64 * 2, pbU - pbY, pbRGB + cbRGB * 8, cbRGB, cmbColumn);
    return ICERR_OK;
}
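// storeRGB24_3: lossy-path counterpart of storeRGB24_5. The extra 16-byte
// aligned Shift argument holds the per-sample downshift (3 when scaled
// arithmetic is used, else 0); the bias added up front is (0x80 << shift)
// plus the shift vector itself. After the inverse color conversion the
// samples are shifted down and clamped to [0, 255] with the signed
// saturation trick: packssdw/packsswb, then subtracting the 0x80 word/byte
// constants to re-center.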
__declspec(naked) void __stdcall storeRGB24_3(
U8* pbYCoCg,
size_t cbYCoCg,
const U8* pbRGB,
size_t cbRGB,
size_t cmb,
const U8* Shift)
{
UNREFERENCED_PARAMETER( pbYCoCg );
UNREFERENCED_PARAMETER( cbYCoCg );
UNREFERENCED_PARAMETER( pbRGB );
UNREFERENCED_PARAMETER( cbRGB );
UNREFERENCED_PARAMETER( cmb );
UNREFERENCED_PARAMETER( Shift );
__asm {
push ebp
push ebx
push esi
push edi
mov ecx, [esp + 40] // $ecx = Shift
mov ebx, [esp + 36] // $ebx = cmb
mov edi, [esp + 28] // $edi = pbRGB
lea ebx, [ebx + ebx * 2] // $ebx = cmb * 3
mov edx, [esp + 32] // $edx = cbRGB
shl ebx, 4 // $ebx = cmb * 3 * 16
mov esi, [esp + 20] // $esi = pbYCoCg
add edi, ebx // $edi = pbRGB + 3 * 16 * cmb
mov ebp, [esp + 24] // $ebp = cbYCoCg
neg ebx
mov eax, esp
and esp, 0xffffff80
sub esp, 320
mov [esp], eax // original $esp
mov [esp + 4], edi
mov [esp + 8], ecx
}
Loop0:
__asm {
mov edi, [esp + 4] // $edi = pbRGB + 3 * 16 * cmb
//================
// first 8 pixels
movdqa xmm0, [esi]
movdqa xmm4, [esi + 16]
movdqa xmm3, [esi + ebp]
movdqa xmm7, [esi + ebp + 16]
movdqa xmm2, [esi + ebp * 2]
movdqa xmm6, [esi + ebp * 2 + 16]
mov ecx, [esp + 8]
movdqa xmm1, [ecx]
movdqa xmm5, [g_const_d0x80]
pslld xmm5, xmm1
paddd xmm5, xmm1
paddd xmm0, xmm5 // bias
paddd xmm4, xmm5 // bias
pxor xmm1, xmm1
pxor xmm5, xmm5
psubd xmm1, xmm3
psubd xmm5, xmm7
// ICC
movdqa xmm3, xmm1 // g -= r >> 1
movdqa xmm7, xmm5
psrad xmm3, 1
psrad xmm7, 1
psubd xmm0, xmm3
psubd xmm4, xmm7
movdqa xmm3, [g_const_d1] // r -= ((b + 1) >> 1) - g
movdqa xmm7, [g_const_d1]
paddd xmm3, xmm2
paddd xmm7, xmm6
paddd xmm1, xmm0
paddd xmm5, xmm4
psrad xmm3, 1
psrad xmm7, 1
psubd xmm1, xmm3
psubd xmm5, xmm7
paddd xmm2, xmm1 // b += r
paddd xmm6, xmm5
// clip
movdqa xmm3, [g_const_w0x80]
packssdw xmm0, xmm4
packssdw xmm1, xmm5
packssdw xmm2, xmm6
mov ecx, [esp + 8]
movdqa xmm4, [ecx]
psraw xmm0, xmm4
psraw xmm1, xmm4
psraw xmm2, xmm4
psubw xmm0, xmm3
psubw xmm1, xmm3
psubw xmm2, xmm3
movdqa [esp + 16], xmm0
movdqa [esp + 32], xmm1
movdqa [esp + 48], xmm2
//================
// second 8 pixels
movdqa xmm0, [esi + 32]
movdqa xmm4, [esi + 48]
movdqa xmm3, [esi + ebp + 32]
movdqa xmm7, [esi + ebp + 48]
movdqa xmm2, [esi + ebp * 2 + 32]
movdqa xmm6, [esi + ebp * 2 + 48]
mov ecx, [esp + 8]
movdqa xmm1, [ecx]
movdqa xmm5, [g_const_d0x80]
pslld xmm5, xmm1
paddd xmm5, xmm1
paddd xmm0, xmm5 // bias
paddd xmm4, xmm5 // bias
pxor xmm1, xmm1
pxor xmm5, xmm5
psubd xmm1, xmm3
psubd xmm5, xmm7
// ICC
movdqa xmm3, xmm1 // g -= r >> 1
movdqa xmm7, xmm5
psrad xmm3, 1
psrad xmm7, 1
psubd xmm0, xmm3
psubd xmm4, xmm7
movdqa xmm3, [g_const_d1] // r -= ((b + 1) >> 1) - g
movdqa xmm7, [g_const_d1]
paddd xmm3, xmm2
paddd xmm7, xmm6
paddd xmm1, xmm0
paddd xmm5, xmm4
psrad xmm3, 1
psrad xmm7, 1
psubd xmm1, xmm3
psubd xmm5, xmm7
paddd xmm2, xmm1 // b += r
paddd xmm6, xmm5
// clip
movdqa xmm3, [g_const_w0x80]
packssdw xmm0, xmm4
packssdw xmm1, xmm5
packssdw xmm2, xmm6
mov ecx, [esp + 8]
movdqa xmm4, [ecx]
psraw xmm0, xmm4
psraw xmm1, xmm4
psraw xmm2, xmm4
psubw xmm0, xmm3
psubw xmm1, xmm3
psubw xmm2, xmm3
//================
// 16 pixels
movdqa xmm3, [g_const_b0x80]
packsswb xmm0, [esp + 16]
packsswb xmm1, [esp + 32]
packsswb xmm2, [esp + 48]
psubb xmm0, xmm3
psubb xmm1, xmm3
psubb xmm2, xmm3
pxor xmm7, xmm7
movdqa xmm4, xmm0
movdqa xmm5, xmm1
movdqa xmm6, xmm2
punpckhbw xmm0, xmm7
punpckhbw xmm1, xmm7
punpckhbw xmm2, xmm7
punpcklbw xmm4, xmm7
punpcklbw xmm5, xmm7
punpcklbw xmm6, xmm7
// spill second 8 pixels
movdqa [esp + 16], xmm4
movdqa [esp + 32], xmm5
movdqa [esp + 48], xmm6
// first 8 pixels
movdqa xmm4, xmm0
movdqa xmm5, xmm1
movdqa xmm6, xmm2
punpcklwd xmm0, xmm7
punpcklwd xmm1, xmm7
punpcklwd xmm2, xmm7
punpckhwd xmm4, xmm7
punpckhwd xmm5, xmm7
punpckhwd xmm6, xmm7
pslld xmm0, 8
pslld xmm2, 16
pslld xmm4, 8
pslld xmm6, 16
por xmm0, xmm1
por xmm4, xmm5
por xmm0, xmm2
por xmm4, xmm6
movdqa [esp + 64], xmm0
pslld xmm0, 8
movdqa [esp + 80], xmm4
pslld xmm4, 8
movdqa [esp + 96], xmm0
movdqa [esp + 112], xmm4
// second 8 pixels
movdqa xmm0, [esp + 16]
movdqa xmm1, [esp + 32]
movdqa xmm2, [esp + 48]
movdqa xmm4, xmm0
movdqa xmm5, xmm1
movdqa xmm6, xmm2
punpcklwd xmm0, xmm7
punpcklwd xmm1, xmm7
punpcklwd xmm2, xmm7
punpckhwd xmm4, xmm7
punpckhwd xmm5, xmm7
punpckhwd xmm6, xmm7
pslld xmm0, 8
pslld xmm2, 16
pslld xmm4, 8
pslld xmm6, 16
por xmm0, xmm1
por xmm4, xmm5
por xmm0, xmm2
por xmm4, xmm6
movdqa [esp + 128], xmm0
pslld xmm0, 8
movdqa [esp + 144], xmm4
pslld xmm4, 8
movdqa [esp + 160], xmm0
movdqa [esp + 176], xmm4
//================================
add esi, 64
//================
// first 8 pixels
movdqa xmm0, [esi]
movdqa xmm4, [esi + 16]
movdqa xmm3, [esi + ebp]
movdqa xmm7, [esi + ebp + 16]
movdqa xmm2, [esi + ebp * 2]
movdqa xmm6, [esi + ebp * 2 + 16]
mov ecx, [esp + 8]
movdqa xmm1, [ecx]
movdqa xmm5, [g_const_d0x80]
pslld xmm5, xmm1
paddd xmm5, xmm1
paddd xmm0, xmm5 // bias
paddd xmm4, xmm5 // bias
pxor xmm1, xmm1
pxor xmm5, xmm5
psubd xmm1, xmm3
psubd xmm5, xmm7
// ICC
movdqa xmm3, xmm1 // g -= r >> 1
movdqa xmm7, xmm5
psrad xmm3, 1
psrad xmm7, 1
psubd xmm0, xmm3
psubd xmm4, xmm7
movdqa xmm3, [g_const_d1] // r -= ((b + 1) >> 1) - g
movdqa xmm7, [g_const_d1]
paddd xmm3, xmm2
paddd xmm7, xmm6
paddd xmm1, xmm0
paddd xmm5, xmm4
psrad xmm3, 1
psrad xmm7, 1
psubd xmm1, xmm3
psubd xmm5, xmm7
paddd xmm2, xmm1 // b += r
paddd xmm6, xmm5
// clip
movdqa xmm3, [g_const_w0x80]
packssdw xmm0, xmm4
packssdw xmm1, xmm5
packssdw xmm2, xmm6
mov ecx, [esp + 8]
movdqa xmm4, [ecx]
psraw xmm0, xmm4
psraw xmm1, xmm4
psraw xmm2, xmm4
psubw xmm0, xmm3
psubw xmm1, xmm3
psubw xmm2, xmm3
movdqa [esp + 16], xmm0
movdqa [esp + 32], xmm1
movdqa [esp + 48], xmm2
//================
// second 8 pixels
movdqa xmm0, [esi + 32]
movdqa xmm4, [esi + 48]
movdqa xmm3, [esi + ebp + 32]
movdqa xmm7, [esi + ebp + 48]
movdqa xmm2, [esi + ebp * 2 + 32]
movdqa xmm6, [esi + ebp * 2 + 48]
mov ecx, [esp + 8]
movdqa xmm1, [ecx]
movdqa xmm5, [g_const_d0x80]
pslld xmm5, xmm1
paddd xmm5, xmm1
paddd xmm0, xmm5 // bias
paddd xmm4, xmm5 // bias
pxor xmm1, xmm1
pxor xmm5, xmm5
psubd xmm1, xmm3
psubd xmm5, xmm7
// ICC
movdqa xmm3, xmm1 // g -= r >> 1
movdqa xmm7, xmm5
psrad xmm3, 1
psrad xmm7, 1
psubd xmm0, xmm3
psubd xmm4, xmm7
movdqa xmm3, [g_const_d1] // r -= ((b + 1) >> 1) - g
movdqa xmm7, [g_const_d1]
paddd xmm3, xmm2
paddd xmm7, xmm6
paddd xmm1, xmm0
paddd xmm5, xmm4
psrad xmm3, 1
psrad xmm7, 1
psubd xmm1, xmm3
psubd xmm5, xmm7
paddd xmm2, xmm1 // b += r
paddd xmm6, xmm5
// clip
movdqa xmm3, [g_const_w0x80]
packssdw xmm0, xmm4
packssdw xmm1, xmm5
packssdw xmm2, xmm6
mov ecx, [esp + 8]
movdqa xmm4, [ecx]
psraw xmm0, xmm4
psraw xmm1, xmm4
psraw xmm2, xmm4
psubw xmm0, xmm3
psubw xmm1, xmm3
psubw xmm2, xmm3
//================
// 16 pixels
movdqa xmm3, [g_const_b0x80]
packsswb xmm0, [esp + 16]
packsswb xmm1, [esp + 32]
packsswb xmm2, [esp + 48]
psubb xmm0, xmm3
psubb xmm1, xmm3
psubb xmm2, xmm3
pxor xmm7, xmm7
movdqa xmm4, xmm0
movdqa xmm5, xmm1
movdqa xmm6, xmm2
punpckhbw xmm0, xmm7
punpckhbw xmm1, xmm7
punpckhbw xmm2, xmm7
punpcklbw xmm4, xmm7
punpcklbw xmm5, xmm7
punpcklbw xmm6, xmm7
// spill second 8 pixels
movdqa [esp + 16], xmm4
movdqa [esp + 32], xmm5
movdqa [esp + 48], xmm6
// first 8 pixels
movdqa xmm4, xmm0
movdqa xmm5, xmm1
movdqa xmm6, xmm2
punpcklwd xmm0, xmm7
punpcklwd xmm1, xmm7
punpcklwd xmm2, xmm7
punpckhwd xmm4, xmm7
punpckhwd xmm5, xmm7
punpckhwd xmm6, xmm7
pslld xmm0, 8
pslld xmm2, 16
pslld xmm4, 8
pslld xmm6, 16
por xmm0, xmm1
por xmm4, xmm5
por xmm0, xmm2
por xmm4, xmm6
movdqa [esp + 192], xmm0
pslld xmm0, 8
movdqa [esp + 208], xmm4
pslld xmm4, 8
movdqa [esp + 224], xmm0
movdqa [esp + 240], xmm4
// second 8 pixels
movdqa xmm0, [esp + 16]
movdqa xmm1, [esp + 32]
movdqa xmm2, [esp + 48]
movdqa xmm4, xmm0
movdqa xmm5, xmm1
movdqa xmm6, xmm2
punpcklwd xmm0, xmm7
punpcklwd xmm1, xmm7
punpcklwd xmm2, xmm7
punpckhwd xmm4, xmm7
punpckhwd xmm5, xmm7
punpckhwd xmm6, xmm7
pslld xmm0, 8
pslld xmm2, 16
pslld xmm4, 8
pslld xmm6, 16
por xmm0, xmm1
por xmm4, xmm5
por xmm0, xmm2
por xmm4, xmm6
movdqa [esp + 256], xmm0
pslld xmm0, 8
movdqa [esp + 272], xmm4
pslld xmm4, 8
movdqa [esp + 288], xmm0
movdqa [esp + 304], xmm4
// RGBX32 -> RGB24
mov eax, [esp + 68] // ..B1G1R1
mov ecx, [esp + 96] // B0G0R0..
shld eax, ecx, 24 // R1B0G0R0
mov [edi + ebx + 0], eax
mov eax, [esp + 84] // ..B5G5R5
mov ecx, [esp + 100] // B1G1R1..
shld eax, ecx, 16 // G5R5B1G1
mov [edi + ebx + 4], eax
mov eax, [esp + 80] // ..B4G4R4
mov ecx, [esp + 116] // B5G5R5..
shld eax, ecx, 8 // B4G4R4B5
mov [edi + ebx + 8], eax
add edi, edx // $edi = pbRGB += cbRGB
mov eax, [esp + 76] // ..B3G3R3
mov ecx, [esp + 104] // B2G2R2..
shld eax, ecx, 24 // R3B2G2R2
mov [edi + ebx + 0], eax
mov eax, [esp + 92] // ..B7G7R7
mov ecx, [esp + 108] // B3G3R3..
shld eax, ecx, 16 // G7R7B3G3
mov [edi + ebx + 4], eax
mov eax, [esp + 88] // ..B6G6R6
mov ecx, [esp + 124] // B7G7R7..
shld eax, ecx, 8 // B6G6R6B7
mov [edi + ebx + 8], eax
add edi, edx // $edi = pbRGB += cbRGB
// RGBX32 -> RGB24
mov eax, [esp + 140] // ..B3G3R3
mov ecx, [esp + 168] // B2G2R2..
shld eax, ecx, 24 // R3B2G2R2
mov [edi + ebx + 0], eax
mov eax, [esp + 156] // ..B7G7R7
mov ecx, [esp + 172] // B3G3R3..
shld eax, ecx, 16 // G7R7B3G3
mov [edi + ebx + 4], eax
mov eax, [esp + 152] // ..B6G6R6
mov ecx, [esp + 188] // B7G7R7..
shld eax, ecx, 8 // B6G6R6B7
mov [edi + ebx + 8], eax
add edi, edx // $edi = pbRGB += cbRGB
mov eax, [esp + 132] // ..B1G1R1
mov ecx, [esp + 160] // B0G0R0..
shld eax, ecx, 24 // R1B0G0R0
mov [edi + ebx + 0], eax
mov eax, [esp + 148] // ..B5G5R5
mov ecx, [esp + 164] // B1G1R1..
shld eax, ecx, 16 // G5R5B1G1
mov [edi + ebx + 4], eax
mov eax, [esp + 144] // ..B4G4R4
mov ecx, [esp + 180] // B5G5R5..
shld eax, ecx, 8 // B4G4R4B5
mov [edi + ebx + 8], eax
add edi, edx // $edi = pbRGB += cbRGB
// RGBX32 -> RGB24
mov eax, [esp + 196] // ..B1G1R1
mov ecx, [esp + 224] // B0G0R0..
shld eax, ecx, 24 // R1B0G0R0
mov [edi + ebx + 0], eax
mov eax, [esp + 212] // ..B5G5R5
mov ecx, [esp + 228] // B1G1R1..
shld eax, ecx, 16 // G5R5B1G1
mov [edi + ebx + 4], eax
mov eax, [esp + 208] // ..B4G4R4
mov ecx, [esp + 244] // B5G5R5..
shld eax, ecx, 8 // B4G4R4B5
mov [edi + ebx + 8], eax
add edi, edx // $edi = pbRGB += cbRGB
mov eax, [esp + 204] // ..B3G3R3
mov ecx, [esp + 232] // B2G2R2..
shld eax, ecx, 24 // R3B2G2R2
mov [edi + ebx + 0], eax
mov eax, [esp + 220] // ..B7G7R7
mov ecx, [esp + 236] // B3G3R3..
shld eax, ecx, 16 // G7R7B3G3
mov [edi + ebx + 4], eax
mov eax, [esp + 216] // ..B6G6R6
mov ecx, [esp + 252] // B7G7R7..
shld eax, ecx, 8 // B6G6R6B7
mov [edi + ebx + 8], eax
add edi, edx // $edi = pbRGB += cbRGB
// RGBX32 -> RGB24
mov eax, [esp + 268] // ..B3G3R3
mov ecx, [esp + 296] // B2G2R2..
shld eax, ecx, 24 // R3B2G2R2
mov [edi + ebx + 0], eax
mov eax, [esp + 284] // ..B7G7R7
mov ecx, [esp + 300] // B3G3R3..
shld eax, ecx, 16 // G7R7B3G3
mov [edi + ebx + 4], eax
mov eax, [esp + 280] // ..B6G6R6
mov ecx, [esp + 316] // B7G7R7..
shld eax, ecx, 8 // B6G6R6B7
mov [edi + ebx + 8], eax
add edi, edx // $edi = pbRGB += cbRGB
mov eax, [esp + 260] // ..B1G1R1
mov ecx, [esp + 288] // B0G0R0..
shld eax, ecx, 24 // R1B0G0R0
mov [edi + ebx + 0], eax
mov eax, [esp + 276] // ..B5G5R5
mov ecx, [esp + 292] // B1G1R1..
shld eax, ecx, 16 // G5R5B1G1
mov [edi + ebx + 4], eax
mov eax, [esp + 272] // ..B4G4R4
mov ecx, [esp + 308] // B5G5R5..
shld eax, ecx, 8 // B4G4R4B5
mov [edi + ebx + 8], eax
add edi, edx // $edi = pbRGB += cbRGB
//================================
add esi, 256 - 64
add ebx, 12
jnz Loop0
//================
pop esp
pop edi
pop esi
pop ebx
pop ebp
ret 24
}
}
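// outputMBRow_RGB24_Lossy_3: pSC->Load hook for the scaled-arithmetic /
// overlapped path. Same banding as the lossless variant, but first stores
// the downshift vector (3 if bScaledArith, else 0) that storeRGB24_3 applies
// before clamping.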
Int outputMBRow_RGB24_Lossy_3(CWMImageStrCodec* pSC)
{
    const size_t cbRGB = pSC->WMIBI.cbStride;
    const U8* const pbRGB = (U8*)pSC->WMIBI.pv + cbRGB * (pSC->cRow - 1) * 16;
    U8* const pbY = (U8*)pSC->a0MBbuffer[0];
    U8* const pbU = (U8*)pSC->a0MBbuffer[1];
    // U8* const pbV = (U8*)pSC->a0MBbuffer[2];
    const size_t cmbColumn = (pSC->WMII.cWidth + 15) / 16;
    __declspec(align(16)) U8 Shift[16];

    assert(BD_8 == pSC->WMII.bdBitDepth);
    assert(CF_RGB == pSC->WMII.cfColorFormat);
    assert(24 == pSC->WMII.cBitsPerUnit);
    assert(pSC->WMII.bRGB);
    assert(O_NONE == pSC->WMII.oOrientation);
    assert(YUV_444 == pSC->m_param.cfColorFormat);
    assert(pSC->m_Dparam->bDecodeFullFrame);

    _mm_store_si128((__m128i *) Shift, pSC->m_param.bScaledArith ? g_const_d3 : g_const_d0);
    storeRGB24_3(pbY + 64 * 0, pbU - pbY, pbRGB + cbRGB * 0, cbRGB, cmbColumn, Shift);
    storeRGB24_3(pbY + 64 * 2, pbU - pbY, pbRGB + cbRGB * 8, cbRGB, cmbColumn, Shift);
    return ICERR_OK;
}
#endif
//================================================================
#if defined(WMP_OPT_TRFM_DEC)
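// Scalar lifting helpers for the first-stage inverse photo core transform.
// Each 4x4 block is inverted with exactly invertible integer lifting steps;
// the *_OPT variants below are the decoder's hand-scheduled versions.
// strDCT2x2up_OPT is the rounded 2x2 inverse butterfly (note the "+ 1" in
// the half-difference) applied to the even-even corner.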
FORCE_INLINE Void strDCT2x2up_OPT(PixelI *pa, PixelI *pb, PixelI *pc, PixelI *pd)
{
    PixelI a, b, c, d, C, t;
    a = *pa;
    b = *pb;
    C = *pc;
    d = *pd;

    a += d;
    b -= C;
    t = ((a - b + 1) >> 1);
    c = t - d;
    d = t - C;
    a -= d;
    b += c;

    *pa = a;
    *pb = b;
    *pc = c;
    *pd = d;
}
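// invOdd_OPT: inverse of the odd part of the 4x4 transform: butterflies,
// two -pi/8 lifting rotations (IROTATE2), then the closing butterflies.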
FORCE_INLINE Void invOdd_OPT(PixelI *pa, PixelI *pb, PixelI *pc, PixelI *pd)
{
    PixelI a, b, c, d;
    a = *pa;
    b = *pb;
    c = *pc;
    d = *pd;

    /** butterflies **/
    b += d;
    a -= c;
    d -= (b) >> 1;
    c += (a + 1) >> 1;

    /** rotate pi/8 **/
#define IROTATE2(a, b) (a) -= (((b)*3 + 4) >> 3), (b) += (((a)*3 + 4) >> 3)
    IROTATE2(a, b);
    IROTATE2(c, d);

    /** butterflies **/
    c -= (b + 1) >> 1;
    d = ((a + 1) >> 1) - d;
    b += c;
    a -= d;

    *pa = a;
    *pb = b;
    *pc = c;
    *pd = d;
}
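// invOddOdd_OPT: inverse of the odd-odd (high-high) part: butterflies, a
// three-step pi/4 lifting rotation, closing butterflies, and the final sign
// flips on b and c.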
FORCE_INLINE Void invOddOdd_OPT(PixelI* pa, PixelI* pb, PixelI* pc, PixelI* pd)
{
    PixelI a, b, c, d, t1, t2;
    a = *pa;
    b = *pb;
    c = *pc;
    d = *pd;

    /** butterflies **/
    d += a;
    c -= b;
    a -= (t1 = d >> 1);
    b += (t2 = c >> 1);

    /** rotate pi/4 **/
    a -= (b * 3 + 3) >> 3;
    b += (a * 3 + 3) >> 2;
    a -= (b * 3 + 4) >> 3;

    /** butterflies **/
    b -= t2;
    a += t1;
    c += b;
    d -= a;

    /** sign flips **/
    *pa = a;
    *pb = -b;
    *pc = -c;
    *pd = d;
}
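// strDCT2x2dn_SSE2_1: SSE2 form of the unrounded 2x2 down butterfly. p is
// treated as four __m128i rows, so the same lifting runs on four 32-bit
// lanes (four adjacent columns) at once. One lane in scalar form:
//
//     a += d; b -= C;        // C = original *pc
//     t = (a - b) >> 1;
//     c = t - d;             // d still holds the original *pd
//     d = t - C;
//     a -= d; b += c;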
FORCE_INLINE Void strDCT2x2dn_SSE2_1(PixelI* p)
{
    __m128i* const pdq = (__m128i*)p;
    __m128i a = pdq[0];
    __m128i b = pdq[1];
    const __m128i C = pdq[2];
    __m128i d = pdq[3];
    __m128i t;
    __m128i c;

    a = _mm_add_epi32(a, d);
    b = _mm_sub_epi32(b, C);
    t = _mm_sub_epi32(a, b);
    t = _mm_srai_epi32(t, 1);
    c = _mm_sub_epi32(t, d);
    d = _mm_sub_epi32(t, C);
    a = _mm_sub_epi32(a, d);
    b = _mm_add_epi32(b, c);

    pdq[0] = a;
    pdq[1] = b;
    pdq[2] = c;
    pdq[3] = d;
}
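// strIDCT4x4Stage1_OPT_H1: first half of the stage-1 inverse transform for
// one 4x4 block of 16 PixelI coefficients: strDCT2x2up_OPT on the even-even
// corner, invOdd_OPT on the two mixed corners, invOddOdd_OPT on the odd-odd
// corner (the permuted indices handle the coefficient layout). The closing
// 2x2 down butterfly is deferred to _H2.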
Void strIDCT4x4Stage1_OPT_H1(PixelI* p)
{
    /** top left corner, butterfly => butterfly **/
    strDCT2x2up_OPT(p + 0, p + 1, p + 2, p + 3);

    /** top right corner, -pi/8 rotation => butterfly **/
    invOdd_OPT(p + 5, p + 4, p + 7, p + 6);

    /** bottom left corner, butterfly => -pi/8 rotation **/
    invOdd_OPT(p + 10, p + 8, p + 11, p + 9);

    /** bottom right corner, -pi/8 rotation => -pi/8 rotation **/
    invOddOdd_OPT(p + 15, p + 14, p + 13, p + 12);
}
FORCE_INLINE Void strIDCT4x4Stage1_OPT_H2(PixelI* p)
{
    /** butterfly **/
    strDCT2x2dn_SSE2_1(p);
}
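// strIDCT4x4Stage1_OPT5: runs stage 1 over the sixteen 4x4 blocks that a
// macroblock contributes across the p0/p1 row buffers. All _H1 corner work
// is done first, with prefetches to pull upcoming blocks into cache, then
// the SSE2 _H2 butterflies sweep the same blocks.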
Void strIDCT4x4Stage1_OPT5(PixelI* p0, PixelI* p1)
{
    _mm_prefetch((char*)(p0 - 96 + 256), _MM_HINT_T0);
    strIDCT4x4Stage1_OPT_H1(p0 - 96);
    strIDCT4x4Stage1_OPT_H1(p0 - 80);
    strIDCT4x4Stage1_OPT_H1(p0 - 32);
    strIDCT4x4Stage1_OPT_H1(p0 - 16);
    _mm_prefetch((char*)(p0 - 32 + 256), _MM_HINT_T0);
    strIDCT4x4Stage1_OPT_H1(p0 + 32);
    strIDCT4x4Stage1_OPT_H1(p0 + 48);
    strIDCT4x4Stage1_OPT_H1(p0 + 96);
    strIDCT4x4Stage1_OPT_H1(p0 + 112);
    _mm_prefetch((char*)(p0 + 32 + 256), _MM_HINT_T0);
    strIDCT4x4Stage1_OPT_H1(p1 - 128);
    strIDCT4x4Stage1_OPT_H1(p1 - 112);
    strIDCT4x4Stage1_OPT_H1(p1 - 64);
    strIDCT4x4Stage1_OPT_H1(p1 - 48);
    _mm_prefetch((char*)(p0 + 96 + 256), _MM_HINT_T0);
    strIDCT4x4Stage1_OPT_H1(p1 + 0);
    strIDCT4x4Stage1_OPT_H1(p1 + 16);
    strIDCT4x4Stage1_OPT_H1(p1 + 64);
    strIDCT4x4Stage1_OPT_H1(p1 + 80);

    strIDCT4x4Stage1_OPT_H2(p0 - 96);
    strIDCT4x4Stage1_OPT_H2(p0 - 80);
    strIDCT4x4Stage1_OPT_H2(p0 - 32);
    strIDCT4x4Stage1_OPT_H2(p0 - 16);
    strIDCT4x4Stage1_OPT_H2(p0 + 32);
    strIDCT4x4Stage1_OPT_H2(p0 + 48);
    strIDCT4x4Stage1_OPT_H2(p0 + 96);
    strIDCT4x4Stage1_OPT_H2(p0 + 112);
    strIDCT4x4Stage1_OPT_H2(p1 - 128);
    strIDCT4x4Stage1_OPT_H2(p1 - 112);
    strIDCT4x4Stage1_OPT_H2(p1 - 64);
    strIDCT4x4Stage1_OPT_H2(p1 - 48);
    strIDCT4x4Stage1_OPT_H2(p1 + 0);
    strIDCT4x4Stage1_OPT_H2(p1 + 16);
    strIDCT4x4Stage1_OPT_H2(p1 + 64);
    strIDCT4x4Stage1_OPT_H2(p1 + 80);
}
//================================
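// strPost4x4Stage1_alternate_ASM5: first-level inverse overlap (post) filter
// across the 4x4 block seams spanning the p0/p1 macroblock rows. The
// prologue pushes a table of operand-pointer pairs onto the stack; each loop
// iteration pulls a pair, filters one corner in scalar code (the pshufd/movd
// section) and the rest in SSE2, then stores the filtered rows back.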
__declspec(naked) void __stdcall strPost4x4Stage1_alternate_ASM5(PixelI* p0, PixelI* p1)
{
UNREFERENCED_PARAMETER( p0 );
UNREFERENCED_PARAMETER( p1 );
__asm {
push ebp
push ebx
push esi
push edi
//================
// pointer array
mov eax, [esp + 20] // $eax = p0
mov edx, [esp + 24] // $edx = p1
mov ecx, 4 * 16
mov ebx, 4 * 48
prefetcht0 [eax + 512]
prefetcht0 [eax + 768]
prefetcht0 [eax + 1024]
prefetcht0 [eax + 1280]
add edx, ecx
add eax, ebx
push edx
sub edx, ecx
push edx
push edx
sub edx, ebx
push eax
push eax
sub eax, ecx
push eax
push eax
sub eax, ecx
push eax
sub eax, ecx
push edx
sub edx, ecx
push edx
sub eax, ecx
push edx
sub edx, ebx
push eax
push eax
sub eax, ecx
push eax
push eax
sub eax, ecx
push eax
sub eax, ecx
push edx
sub edx, ecx
push edx
sub eax, ecx
push edx
sub edx, ebx
push eax
push eax
sub eax, ecx
push eax
push eax
sub eax, ecx
push eax
sub eax, ecx
push edx
sub edx, ecx
push edx
sub eax, ecx
push edx
push eax
push eax
sub eax, ecx
push eax
push eax
sub eax, ecx
push eax
mov ebp, (4 + 4) * -16
push ebp
}
Loop0:
__asm {
mov esi, [esp + (4 + 4) * 16 + 4 + ebp ] // $esi = p0
mov edi, [esp + (4 + 4) * 16 + 4 + ebp + 4] // $edi = p1
//================
movdqa xmm2, [esi + 4 * 12] // a = xmm2
movdqa xmm1, [esi + 4 * 72] // b = xmm1
movdqa xmm6, [edi + 4 * 4] // c = xmm6
movdqa xmm7, [edi + 4 * 64] // d = xmm7
//================
// butterfly
paddd xmm2, xmm7
psubd xmm1, xmm6
movdqa xmm0, xmm2 // a = xmm0
psubd xmm2, xmm1
psrad xmm2, 1
movdqa xmm3, xmm2
psubd xmm2, xmm7 // c = xmm2
psubd xmm3, xmm6 // d = xmm3
paddd xmm1, xmm2
psubd xmm0, xmm3
//================
// bottom right corner: -pi/8 rotation => -pi/8 rotation
pshufd xmm7, xmm3, 0x3
movd eax, xmm3
movd edx, xmm7
pshufd xmm7, xmm3, 0x1
movd ebx, xmm7
pshufd xmm7, xmm3, 0x2
movd ecx, xmm7
add edx, eax
sub ecx, ebx
mov esi, edx
sar esi, 1
mov edi, ecx
sar edi, 1
sub eax, esi
add ebx, edi
lea ebp, [ebx + ebx * 2 + 6]
sar ebp, 3
sub eax, ebp
lea ebp, [eax + eax * 2 + 2]
sar ebp, 2
add ebx, ebp
lea ebp, [ebx + ebx * 2 + 4]
sar ebp, 3
sub eax, ebp
mov ebp, [esp]
sub ebx, edi
add eax, esi
add ecx, ebx
sub edx, eax
mov esi, [esp + (4 + 4) * 16 + 4 + ebp ] // $esi = p0
mov edi, [esp + (4 + 4) * 16 + 4 + ebp + 4] // $edi = p1
movd xmm3, eax
movd xmm4, ebx
movd xmm5, ecx
movd xmm6, edx
punpckldq xmm3, xmm4
punpckldq xmm5, xmm6
punpcklqdq xmm3, xmm5
//================
// anti diagonal corners: rotation by -pi/8
movdqa xmm5, [g_const_d1]
movdqa xmm6, [g_const_d1]
pshufd xmm2, xmm2, 0xd8 // 7, 5, 6, 4
movdqa xmm4, xmm1 // 75, 74, 73, 72
punpckhqdq xmm1, xmm2 // 7, 5, 75, 74
punpcklqdq xmm4, xmm2 // 6, 4, 73, 72
paddd xmm5, xmm1
psrad xmm5, 1
psubd xmm4, xmm5
paddd xmm6, xmm4
psrad xmm6, 1
paddd xmm1, xmm6
movdqa xmm2, xmm4 // 6, 4, 73, 72
punpckhqdq xmm4, xmm1 // 7, 5, 6, 4
punpcklqdq xmm2, xmm1 // 75, 74, 73, 72
pshufd xmm4, xmm4, 0xd8 // 7, 6, 5, 4
//================
// butterfly
// a = xmm0, b = xmm2, c = xmm4, d = xmm3
paddd xmm0, xmm3
movdqa xmm1, xmm0 // a = xmm1
psrad xmm0, 1
psubd xmm0, xmm3 // d = xmm0
movdqa xmm3, xmm0 // d = xmm3
paddd xmm0, xmm0
paddd xmm0, xmm3
psrad xmm0, 3
paddd xmm1, xmm0
movdqa xmm0, xmm1 // a = xmm0
paddd xmm1, xmm1
paddd xmm1, xmm0
psrad xmm1, 4
paddd xmm3, xmm1
movdqa xmm5, xmm0 // a
psrad xmm5, 7
paddd xmm3, xmm5 // d += (a >> 7)
psrad xmm5, 3
psubd xmm3, xmm5 // d -= (a >> 10)
movdqa xmm5, [g_const_d4]
movdqa xmm1, xmm3 // d = xmm1
psubd xmm2, xmm4
paddd xmm5, xmm3
paddd xmm3, xmm3
paddd xmm3, xmm5
psrad xmm3, 3
paddd xmm0, xmm3
movdqa xmm3, xmm2 // b = xmm3
psrad xmm2, 1
psubd xmm1, xmm2
movdqa xmm2, xmm0 // a = xmm2
psubd xmm0, xmm3
psrad xmm0, 1
psubd xmm0, xmm4 // c = xmm0
paddd xmm3, xmm1
psubd xmm2, xmm0
//================
movdqa [edi + 4 * 4], xmm1
movdqa [edi + 4 * 64], xmm0
movdqa [esi + 4 * 12], xmm2
movdqa [esi + 4 * 72], xmm3
add ebp, 8
mov [esp], ebp
jnz Loop0
//================
add esp, (4 + 4) * 16 + 4
pop edi
pop esi
pop ebx
pop ebp
ret 4 * 2
}
}
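// invTransformMacroblock_YUV444_Center5: optimized TransformCenter hook for
// interior macroblocks of a full-width YUV 4:4:4 decode. Per channel:
// stage-2 inverse transform (plus normalization when scaled arithmetic is
// on), stage-2 overlap filter if olOverlap >= OL_TWO, then the stage-1
// inverse transform and stage-1 overlap filter if olOverlap >= OL_ONE.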
Int invTransformMacroblock_YUV444_Center5(CWMImageStrCodec * pSC)
{
    const OVERLAP olOverlap = pSC->WMISCP.olOverlap;
    int i = 0;

    assert(0 < pSC->cRow && pSC->cRow < pSC->cmbHeight);
    assert(0 < pSC->cColumn && pSC->cColumn < pSC->cmbWidth);
    assert(0 == pSC->WMII.cPostProcStrength);
    assert(YUV_444 == pSC->m_param.cfColorFormat);
    assert(3 == pSC->m_param.cNumChannels);
    assert(pSC->m_Dparam->bDecodeFullWidth);
    assert(1 == pSC->m_Dparam->cThumbnailScale);

    for (i = 0; i < 3; ++i)
    {
        PixelI* const p0 = pSC->p0MBbuffer[i];
        PixelI* const p1 = pSC->p1MBbuffer[i];

        //================================
        // second level inverse transform
        strIDCT4x4Stage2(p1);
        if (pSC->m_param.bScaledArith) {
            strNormalizeDec(p1, (i != 0));
        }

        //================================
        // second level inverse overlap
        if (OL_TWO <= olOverlap)
        {
            strPost4x4Stage2Split_alternate(p0, p1);
        }

        //================================
        // first level inverse transform
        strIDCT4x4Stage1_OPT5(p0, p1);

        //================================
        // first level inverse overlap
        if (OL_ONE <= olOverlap)
        {
            strPost4x4Stage1_alternate_ASM5(p0, p1);
        }
    }

    return ICERR_OK;
}
#endif
#endif
//================================================================
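// StrDecOpt: runtime dispatch. When the OS reports SSE2 support
// (PF_XMMI64_INSTRUCTIONS_AVAILABLE), initialize the SIMD constants and, if
// the image matches the fast-path preconditions (8-bit full-frame RGB24
// output from YUV 4:4:4 with equidistant channel planes), install the
// optimized Load and TransformCenter hooks.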
void StrDecOpt(CWMImageStrCodec* pSC)
{
#if defined(WMP_OPT_SSE2)
    if (IsProcessorFeaturePresent(PF_XMMI64_INSTRUCTIONS_AVAILABLE))
    {
        CWMImageInfo* pII = &pSC->WMII;
        // CWMIStrCodecParam* pSCP = &pSC->WMISCP;

        g_const_d0 = _mm_setzero_si128();
        g_const_d1 = _mm_set1_epi32(1);
        // Only the low 64 bits of g_const_d3 feed pslld/psraw as a shift
        // count, so the upper lanes must stay zero.
        g_const_d3 = _mm_set_epi32(0, 0, 0, 3);
        g_const_d4 = _mm_set1_epi32(4);
        g_const_d0x80 = _mm_set1_epi32(0x80);
        g_const_w0x80 = _mm_set1_epi16(0x80);
        g_const_b0x80 = _mm_set1_epi8((char)0x80);

        if (pSC->WMII.fPaddedUserBuffer &&
            //pSC->m_Dparam->bDecodeFullFrame &&
            //((pII->cWidth & 0xf) == 0) &&
            //(((int) pSC->WMIBI.pv & 0xf) == 0) &&
            BD_8 == pII->bdBitDepth &&
            CF_RGB == pII->cfColorFormat &&
            24 == pII->cBitsPerUnit &&
            pII->bRGB &&
            O_NONE == pII->oOrientation &&
            YUV_444 == pSC->m_param.cfColorFormat &&
            pSC->p1MBbuffer[1] - pSC->p1MBbuffer[0] == pSC->p1MBbuffer[2] - pSC->p1MBbuffer[1] &&
            pSC->m_Dparam->bDecodeFullFrame &&
            1)
        {
#if defined(WMP_OPT_CC_DEC)
            if (pSC->m_param.bScaledArith || pSC->WMISCP.olOverlap != OL_NONE)
            {
                pSC->Load = outputMBRow_RGB24_Lossy_3;
            }
            else
            {
                pSC->Load = outputMBRow_RGB24_Lossless_1;
            }
#endif // WMP_OPT_CC_DEC
        }

        if (YUV_444 == pSC->m_param.cfColorFormat &&
            pSC->p1MBbuffer[1] - pSC->p1MBbuffer[0] == pSC->p1MBbuffer[2] - pSC->p1MBbuffer[1] &&
            pSC->m_Dparam->bDecodeFullWidth &&
            pSC->m_param.cSubVersion == CODEC_SUBVERSION_NEWSCALING_SOFT_TILES &&
            1 == pSC->m_Dparam->cThumbnailScale)
        {
#if defined(WMP_OPT_TRFM_DEC)
            pSC->TransformCenter = invTransformMacroblock_YUV444_Center5;
#endif
        }
    }
#else
    UNREFERENCED_PARAMETER( pSC );
#endif
}