This source file includes the following definitions.
- initSearch
- setLambdaFromQP
- invalidateContexts
- invalidateContexts
- codeSubdivCbfQTChroma
- codeCoeffQTChroma
- codeIntraLumaQT
- codeIntraLumaTSkip
- residualTransformQuantIntra
- extractIntraResultQT
- offsetCBFs
- offsetSubTUCBFs
- codeIntraChromaQt
- codeIntraChromaTSkip
- extractIntraResultChromaQT
- residualQTIntraChroma
- checkIntra
- checkIntraInInter
- encodeIntraInInter
- estIntraPredQT
- getBestIntraModeChroma
- estIntraPredChromaQT
- mergeEstimation
- getLowresMV
- selectMVP
- processTasks
- processPME
- singleMotionEstimation
- searchMV
- predInterSearch
- getBlkBits
- checkBestMVP
- setSearchRange
- encodeResAndCalcRdSkipCU
- encodeResAndCalcRdInterCU
- residualTransformQuantInter
- estimateNullCbfCost
- splitTU
- estimateResidualQT
- codeInterSubdivCbfQT
- saveResidualQTData
- getIntraRemModeBits
- updateCandList
- checkDQP
- checkDQPForSplitPred
#include "common.h"
#include "primitives.h"
#include "picyuv.h"
#include "cudata.h"
#include "search.h"
#include "entropy.h"
#include "rdcost.h"
#include "analysis.h"
#include "framedata.h"
using namespace X265_NS;
#if _MSC_VER
#pragma warning(disable: 4800)
#pragma warning(disable: 4244)
#pragma warning(disable: 4127)
#endif
#define MVP_IDX_BITS 1
/* Shared read-only buffer of MAX_CU_SIZE int16_t values, all zero (the single
 * explicit initializer zero-fills the remaining elements).
 * NOTE(review): ALIGN_VAR_32 presumably requests 32-byte alignment - confirm
 * against the macro definition. */
ALIGN_VAR_32(const int16_t, Search::zeroShort[MAX_CU_SIZE]) = { 0 };
/* Put the object into an "unallocated" state: the per-layer RQT records are
 * zero-filled and every owned buffer pointer is cleared.  Nothing is allocated
 * here; initSearch() performs the allocations. */
Search::Search()
{
    /* handles that are attached later */
    m_param = NULL;
    m_slice = NULL;
    m_frame = NULL;

    m_numLayers = 0;
    m_maxTUDepth = -1;

    memset(m_rqt, 0, sizeof(m_rqt));

    /* per-plane scratch arrays (Y, U, V) */
    m_qtTempCbf[0] = m_qtTempCbf[1] = m_qtTempCbf[2] = NULL;
    m_qtTempTransformSkipFlag[0] = m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[2] = NULL;

    /* intra prediction work buffers */
    m_intraPred = NULL;
    m_intraPredAngs = NULL;
    m_fencScaled = NULL;
    m_fencTransposed = NULL;

    /* transform-skip work buffers */
    m_tsCoeff = NULL;
    m_tsResidual = NULL;
    m_tsRecon = NULL;
}
bool Search::initSearch(const x265_param& param, ScalingList& scalingList)
{
uint32_t maxLog2CUSize = g_log2Size[param.maxCUSize];
m_param = ¶m;
m_bFrameParallel = param.frameNumThreads > 1;
m_numLayers = g_log2Size[param.maxCUSize] - 2;
m_rdCost.setPsyRdScale(param.psyRd);
m_rdCost.setSsimRd(param.bSsimRd);
m_me.init(param.internalCsp);
bool ok = m_quant.init(param.psyRdoq, scalingList, m_entropyCoder);
if (m_param->noiseReductionIntra || m_param->noiseReductionInter || m_param->rc.vbvBufferSize)
ok &= m_quant.allocNoiseReduction(param);
ok &= Predict::allocBuffers(param.internalCsp);
m_refLagPixels = m_bFrameParallel ? param.searchRange : param.sourceHeight;
uint32_t sizeL = 1 << (maxLog2CUSize * 2);
uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
uint32_t numPartitions = 1 << (maxLog2CUSize - LOG2_UNIT_SIZE) * 2;
m_limitTU = 0;
if (m_param->limitTU)
{
if (m_param->limitTU == 1)
m_limitTU = X265_TU_LIMIT_BFS;
else if (m_param->limitTU == 2)
m_limitTU = X265_TU_LIMIT_DFS;
else if (m_param->limitTU == 3)
m_limitTU = X265_TU_LIMIT_NEIGH;
else if (m_param->limitTU == 4)
m_limitTU = X265_TU_LIMIT_DFS + X265_TU_LIMIT_NEIGH;
}
if (param.internalCsp != X265_CSP_I400)
{
for (uint32_t i = 0; i <= m_numLayers; i++)
{
CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL + sizeC * 2);
m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[0] + sizeL;
m_rqt[i].coeffRQT[2] = m_rqt[i].coeffRQT[0] + sizeL + sizeC;
ok &= m_rqt[i].reconQtYuv.create(param.maxCUSize, param.internalCsp);
ok &= m_rqt[i].resiQtYuv.create(param.maxCUSize, param.internalCsp);
}
}
else
{
for (uint32_t i = 0; i <= m_numLayers; i++)
{
CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL);
m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[2] = NULL;
ok &= m_rqt[i].reconQtYuv.create(param.maxCUSize, param.internalCsp);
ok &= m_rqt[i].resiQtYuv.create(param.maxCUSize, param.internalCsp);
}
}
for (uint32_t i = 0; i <= m_param->maxCUDepth; i++)
{
int cuSize = param.maxCUSize >> i;
ok &= m_rqt[i].tmpResiYuv.create(cuSize, param.internalCsp);
ok &= m_rqt[i].tmpPredYuv.create(cuSize, param.internalCsp);
ok &= m_rqt[i].bidirPredYuv[0].create(cuSize, param.internalCsp);
ok &= m_rqt[i].bidirPredYuv[1].create(cuSize, param.internalCsp);
}
if (param.internalCsp != X265_CSP_I400)
{
CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions * 3);
m_qtTempCbf[1] = m_qtTempCbf[0] + numPartitions;
m_qtTempCbf[2] = m_qtTempCbf[0] + numPartitions * 2;
CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions * 3);
m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[0] + numPartitions;
m_qtTempTransformSkipFlag[2] = m_qtTempTransformSkipFlag[0] + numPartitions * 2;
}
else
{
CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions);
m_qtTempCbf[1] = m_qtTempCbf[2] = NULL;
CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions);
m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[2] = NULL;
}
CHECKED_MALLOC(m_intraPred, pixel, (32 * 32) * (33 + 3));
m_fencScaled = m_intraPred + 32 * 32;
m_fencTransposed = m_fencScaled + 32 * 32;
m_intraPredAngs = m_fencTransposed + 32 * 32;
CHECKED_MALLOC(m_tsCoeff, coeff_t, MAX_TS_SIZE * MAX_TS_SIZE);
CHECKED_MALLOC(m_tsResidual, int16_t, MAX_TS_SIZE * MAX_TS_SIZE);
CHECKED_MALLOC(m_tsRecon, pixel, MAX_TS_SIZE * MAX_TS_SIZE);
return ok;
fail:
return false;
}
/* Release everything initSearch() allocated.  Safe to run on an object whose
 * initSearch() was never called or failed part-way: the constructor clears all
 * pointers, and the per-depth buffers are only visited when m_param is set. */
Search::~Search()
{
    for (uint32_t i = 0; i <= m_numLayers; i++)
    {
        /* coeffRQT[1] and [2] point into the coeffRQT[0] allocation */
        X265_FREE(m_rqt[i].coeffRQT[0]);
        m_rqt[i].reconQtYuv.destroy();
        m_rqt[i].resiQtYuv.destroy();
    }

    /* m_param is NULL until initSearch() runs; without it no per-depth buffer
     * was ever created and the pointer must not be dereferenced */
    if (m_param)
    {
        for (uint32_t i = 0; i <= m_param->maxCUDepth; i++)
        {
            m_rqt[i].tmpResiYuv.destroy();
            m_rqt[i].tmpPredYuv.destroy();
            m_rqt[i].bidirPredYuv[0].destroy();
            m_rqt[i].bidirPredYuv[1].destroy();
        }
    }

    /* index 0 owns the allocation; indices 1 and 2 are offsets into it */
    X265_FREE(m_qtTempCbf[0]);
    X265_FREE(m_qtTempTransformSkipFlag[0]);

    /* m_fencScaled, m_fencTransposed and m_intraPredAngs share this block */
    X265_FREE(m_intraPred);

    X265_FREE(m_tsCoeff);
    X265_FREE(m_tsResidual);
    X265_FREE(m_tsRecon);
}
/* Propagate a QP to motion estimation, RD cost and quantization.
 *
 *   ctu      - CTU handed to the quantizer's QP setup
 *   qp       - QP for motion estimation and (after clipping) quantization
 *   lambdaQp - QP used for the RD cost; a negative value means "use qp"
 *
 * Returns qp clipped to [QP_MIN, QP_MAX_SPEC], the value given to m_quant. */
int Search::setLambdaFromQP(const CUData& ctu, int qp, int lambdaQp)
{
    X265_CHECK(qp >= QP_MIN && qp <= QP_MAX_MAX, "QP used for lambda is out of range\n");

    m_me.setQP(qp);

    int rdQp = lambdaQp;
    if (rdQp < 0)
        rdQp = qp;
    m_rdCost.setQP(*m_slice, rdQp);

    const int clippedQp = x265_clip3(QP_MIN, QP_MAX_SPEC, qp);
    m_quant.setQPforQuant(ctu, clippedQp);

    return clippedQp;
}
#if CHECKED_BUILD || _DEBUG
/* Checked/debug builds: mark the four saved entropy contexts of every RQT
 * layer from fromDepth up to NUM_FULL_DEPTH - 1 as invalid. */
void Search::invalidateContexts(int fromDepth)
{
    int depth = fromDepth;
    while (depth < NUM_FULL_DEPTH)
    {
        m_rqt[depth].cur.markInvalid();
        m_rqt[depth].rqtTemp.markInvalid();
        m_rqt[depth].rqtRoot.markInvalid();
        m_rqt[depth].rqtTest.markInvalid();
        depth++;
    }
}
#else
/* Other builds: intentionally does nothing. */
void Search::invalidateContexts(int)
{
}
#endif
void Search::codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx)
{
uint32_t subdiv = tuDepth < cu.m_tuDepth[absPartIdx];
uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
if (!(log2TrSize - m_hChromaShift < 2))
{
uint32_t parentIdx = absPartIdx & (0xFF << (log2TrSize + 1 - LOG2_UNIT_SIZE) * 2);
if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_U, tuDepth - 1))
m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !subdiv);
if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_V, tuDepth - 1))
m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !subdiv);
}
if (subdiv)
{
uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
codeSubdivCbfQTChroma(cu, tuDepth + 1, absPartIdx);
}
}
void Search::codeCoeffQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx, TextType ttype)
{
if (!cu.getCbf(absPartIdx, ttype, tuDepth))
return;
uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
if (tuDepth < cu.m_tuDepth[absPartIdx])
{
uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
codeCoeffQTChroma(cu, tuDepth + 1, absPartIdx, ttype);
return;
}
uint32_t tuDepthC = tuDepth;
uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
if (log2TrSizeC < 2)
{
X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
if (absPartIdx & 3)
return;
log2TrSizeC = 2;
tuDepthC--;
}
uint32_t qtLayer = log2TrSize - 2;
if (m_csp != X265_CSP_I422)
{
uint32_t shift = (m_csp == X265_CSP_I420) ? 2 : 0;
uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - shift);
coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset;
m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype);
}
else
{
uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - 1);
coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset;
uint32_t subTUSize = 1 << (log2TrSizeC * 2);
uint32_t tuNumParts = 2 << ((log2TrSizeC - LOG2_UNIT_SIZE) * 2);
if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype);
if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1))
m_entropyCoder.codeCoeffNxN(cu, coeff + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, ttype);
}
}
/* Evaluate intra luma coding of the TU at (tuDepth, absPartIdx): measure the
 * RD cost of coding it as one transform ("full") and/or of splitting it into
 * four sub-TUs (evaluated recursively), keep the cheaper alternative and add
 * its cost to outCost.
 *
 *   mode        - mode under test; its cu, predYuv and fencYuv are used
 *   cuGeom      - geometry (depth, log2 size, partition count) of the CU
 *   tuDepth     - TU depth relative to the CU
 *   absPartIdx  - partition index of this TU inside the CU
 *   bAllowSplit - when false, splitting is only tried if the unsplit TU is
 *                 not permitted at this size
 *   outCost     - accumulator; rdcost, distortion, bits and energy are added
 *   depthRange  - [0] = smallest, [1] = largest log2 TU size allowed
 *
 * When the unsplit TU is kept, its reconstruction is also copied into the
 * frame's reconstructed picture. */
void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, const uint32_t depthRange[2])
{
CUData& cu = mode.cu;
uint32_t fullDepth = cuGeom.depth + tuDepth;
uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
uint32_t qtLayer = log2TrSize - 2;
uint32_t sizeIdx = log2TrSize - 2;
// mightNotSplit: the TU is small enough to be coded whole
bool mightNotSplit = log2TrSize <= depthRange[1];
// mightSplit: the TU is above the minimum size, and splitting is either allowed or unavoidable
bool mightSplit = (log2TrSize > depthRange[0]) && (bAllowSplit || !mightNotSplit);
bool bEnableRDOQ = !!m_param->rdoqLevel;
// rdPenalty level 2: outside I slices a 32x32 TU is never coded whole when 16x16 is permitted
if (m_param->rdPenalty == 2 && m_slice->m_sliceType != I_SLICE && log2TrSize == 5 && depthRange[0] <= 4)
{
mightNotSplit = false;
mightSplit = true;
}
Cost fullCost;
uint32_t bCBF = 0;
pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_size;
if (mightNotSplit)
{
// remember the entropy state so the split evaluation can start from the same point
if (mightSplit)
m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);
const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx);
pixel* pred = mode.predYuv.getLumaAddr(absPartIdx);
int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
uint32_t stride = mode.fencYuv->m_size;
// build the intra prediction for this TU
uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
IntraNeighbors intraNeighbors;
initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors);
initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode);
predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
coeff_t* coeffY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
if (bEnableRDOQ)
m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
// residual -> transform/quantize -> reconstruct into this layer's recon buffer
primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);
uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
if (numSig)
{
m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
primitives.cu[sizeIdx].add_ps(reconQt, reconQtStride, pred, residual, stride, stride);
}
else
// no significant coefficients: the reconstruction equals the prediction
primitives.cu[sizeIdx].copy_pp(reconQt, reconQtStride, pred, stride);
// the CBF is stored as a bit at position tuDepth
bCBF = !!numSig << tuDepth;
cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
fullCost.distortion = primitives.cu[sizeIdx].sse_pp(reconQt, reconQtStride, fenc, stride);
// count the bits needed to signal this TU
m_entropyCoder.resetBits();
// CU-level syntax is counted only with the first partition
if (!absPartIdx)
{
if (!cu.m_slice->isIntra())
{
if (cu.m_slice->m_pps->bTransquantBypassEnabled)
m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
m_entropyCoder.codeSkipFlag(cu, 0);
m_entropyCoder.codePredMode(cu.m_predMode[0]);
}
m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
}
// luma prediction mode(s): once for 2Nx2N, otherwise once per quarter of the CU
if (cu.m_partSize[0] == SIZE_2Nx2N)
{
if (!absPartIdx)
m_entropyCoder.codeIntraDirLumaAng(cu, 0, false);
}
else
{
uint32_t qNumParts = cuGeom.numPartitions >> 2;
if (!tuDepth)
{
for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
m_entropyCoder.codeIntraDirLumaAng(cu, qIdx * qNumParts, false);
}
else if (!(absPartIdx & (qNumParts - 1)))
m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
}
// no subdivision flag is coded at the minimum TU size
if (log2TrSize != depthRange[0])
m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth);
if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
m_entropyCoder.codeCoeffNxN(cu, coeffY, absPartIdx, log2TrSize, TEXT_LUMA);
fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();
// rdPenalty: quadruple the bit count of 32x32 TUs outside I slices
if (m_param->rdPenalty && log2TrSize == 5 && m_slice->m_sliceType != I_SLICE)
fullCost.bits *= 4;
// RD cost using the psy-rd, ssim-rd or plain formula, in that priority
if (m_rdCost.m_psyRd)
{
fullCost.energy = m_rdCost.psyCost(sizeIdx, fenc, mode.fencYuv->m_size, reconQt, reconQtStride);
fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
}
else if(m_rdCost.m_ssimRd)
{
fullCost.energy = m_quant.ssimDistortion(cu, fenc, stride, reconQt, reconQtStride, log2TrSize, TEXT_LUMA, absPartIdx);
fullCost.rdcost = m_rdCost.calcSsimRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
}
else
fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits);
}
else
// the unsplit TU is not permitted: give it a cost the split always beats
fullCost.rdcost = MAX_INT64;
if (mightSplit)
{
if (mightNotSplit)
{
// keep the post-"full" entropy state in rqtTest and rewind to the state before it
m_entropyCoder.store(m_rqt[fullDepth].rqtTest);
m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
}
uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
// transform skip is tried on the children only if enabled, small enough, and not in bypass mode
int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && (log2TrSize - 1) <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
if (m_param->bEnableTSkipFast)
checkTransformSkip &= cu.m_partSize[0] != SIZE_2Nx2N;
Cost splitCost;
uint32_t cbf = 0;
// evaluate the four sub-TUs, accumulating their cost and OR-ing their CBFs
for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
if (checkTransformSkip)
codeIntraLumaTSkip(mode, cuGeom, tuDepth + 1, qPartIdx, splitCost);
else
codeIntraLumaQT(mode, cuGeom, tuDepth + 1, qPartIdx, bAllowSplit, splitCost, depthRange);
cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
}
// record at this depth whether any child has coefficients
cu.m_cbf[0][absPartIdx] |= (cbf << tuDepth);
if (mightNotSplit && log2TrSize != depthRange[0])
{
// add the cost of the subdivision flag and recompute the split RD cost
m_entropyCoder.resetBits();
m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
splitCost.bits += m_entropyCoder.getNumberOfWrittenBits();
if (m_rdCost.m_psyRd)
splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
else if(m_rdCost.m_ssimRd)
splitCost.rdcost = m_rdCost.calcSsimRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
else
splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);
}
if (splitCost.rdcost < fullCost.rdcost)
{
// the split wins: report its cost and keep the state the children left behind
outCost.rdcost += splitCost.rdcost;
outCost.distortion += splitCost.distortion;
outCost.bits += splitCost.bits;
outCost.energy += splitCost.energy;
return;
}
else
{
// the unsplit TU wins: restore its entropy state and the CU flags the children overwrote
m_entropyCoder.load(m_rqt[fullDepth].rqtTest);
cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
}
}
// unsplit TU kept: publish its reconstruction to the frame's recon picture
PicYuv* reconPic = m_frame->m_reconPic;
pixel* picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
intptr_t picStride = reconPic->m_stride;
primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);
outCost.rdcost += fullCost.rdcost;
outCost.distortion += fullCost.distortion;
outCost.bits += fullCost.bits;
outCost.energy += fullCost.energy;
}
/* Evaluate intra luma coding of one unsplit TU twice - with the regular
 * transform and with transform skip - keep the cheaper variant in RD cost, and
 * add its cost to outCost.  The chosen reconstruction is copied to the frame's
 * reconstructed picture.
 *
 *   mode       - mode under test; its cu, predYuv and fencYuv are used
 *   cuGeom     - geometry of the CU
 *   tuDepth    - TU depth relative to the CU
 *   absPartIdx - partition index of this TU inside the CU
 *   outCost    - accumulator; rdcost, distortion, bits and energy are added */
void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost)
{
uint32_t fullDepth = cuGeom.depth + tuDepth;
uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
uint32_t tuSize = 1 << log2TrSize;
bool bEnableRDOQ = !!m_param->rdoqLevel;
X265_CHECK(tuSize <= MAX_TS_SIZE, "transform skip is only possible at 4x4 TUs\n");
CUData& cu = mode.cu;
Yuv* predYuv = &mode.predYuv;
const Yuv* fencYuv = mode.fencYuv;
Cost fullCost;
// start from the maximum cost so the first evaluated variant is always accepted
fullCost.rdcost = MAX_INT64;
int bTSkip = 0;
uint32_t bCBF = 0;
const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
pixel* pred = predYuv->getLumaAddr(absPartIdx);
int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
uint32_t stride = fencYuv->m_size;
uint32_t sizeIdx = log2TrSize - 2;
// build the intra prediction for this TU
uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
IntraNeighbors intraNeighbors;
initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors);
initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode);
predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
uint32_t qtLayer = log2TrSize - 2;
uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
coeff_t* coeffY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_size;
// snapshot the entropy state so both variants are measured from the same starting point
m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);
if (bEnableRDOQ)
m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
int checkTransformSkip = 1;
// pass 0: regular transform; pass 1: transform skip
for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++)
{
uint64_t tmpCost;
uint32_t tmpEnergy = 0;
// the transform-skip pass works in the m_ts* scratch buffers so the regular result survives
coeff_t* coeff = (useTSkip ? m_tsCoeff : coeffY);
pixel* tmpRecon = (useTSkip ? m_tsRecon : reconQt);
uint32_t tmpReconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);
primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);
uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, useTSkip);
if (numSig)
{
m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSize, TEXT_LUMA, true, useTSkip, numSig);
primitives.cu[sizeIdx].add_ps(tmpRecon, tmpReconStride, pred, residual, stride, stride);
}
else if (useTSkip)
{
// transform skip produced no coefficients: abandon it and keep the regular result
checkTransformSkip = 0;
break;
}
else
// no significant coefficients: the reconstruction equals the prediction
primitives.cu[sizeIdx].copy_pp(tmpRecon, tmpReconStride, pred, stride);
sse_t tmpDist = primitives.cu[sizeIdx].sse_pp(tmpRecon, tmpReconStride, fenc, stride);
cu.setTransformSkipSubParts(useTSkip, TEXT_LUMA, absPartIdx, fullDepth);
cu.setCbfSubParts((!!numSig) << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
// rewind to the snapshot before counting the transform-skip pass
if (useTSkip)
m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
m_entropyCoder.resetBits();
// CU-level syntax is counted only with the first partition
if (!absPartIdx)
{
if (!cu.m_slice->isIntra())
{
if (cu.m_slice->m_pps->bTransquantBypassEnabled)
m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
m_entropyCoder.codeSkipFlag(cu, 0);
m_entropyCoder.codePredMode(cu.m_predMode[0]);
}
m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
}
// luma prediction mode(s): once for 2Nx2N, otherwise once per quarter of the CU
if (cu.m_partSize[0] == SIZE_2Nx2N)
{
if (!absPartIdx)
m_entropyCoder.codeIntraDirLumaAng(cu, 0, false);
}
else
{
uint32_t qNumParts = cuGeom.numPartitions >> 2;
if (!tuDepth)
{
for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
m_entropyCoder.codeIntraDirLumaAng(cu, qIdx * qNumParts, false);
}
else if (!(absPartIdx & (qNumParts - 1)))
m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
}
m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth);
if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSize, TEXT_LUMA);
uint32_t tmpBits = m_entropyCoder.getNumberOfWrittenBits();
// keep the entropy state after the regular pass in case it ends up winning
if (!useTSkip)
m_entropyCoder.store(m_rqt[fullDepth].rqtTemp);
// RD cost using the psy-rd, ssim-rd or plain formula, in that priority
if (m_rdCost.m_psyRd)
{
tmpEnergy = m_rdCost.psyCost(sizeIdx, fenc, fencYuv->m_size, tmpRecon, tmpReconStride);
tmpCost = m_rdCost.calcPsyRdCost(tmpDist, tmpBits, tmpEnergy);
}
else if(m_rdCost.m_ssimRd)
{
tmpEnergy = m_quant.ssimDistortion(cu, fenc, stride, tmpRecon, tmpReconStride, log2TrSize, TEXT_LUMA, absPartIdx);
tmpCost = m_rdCost.calcSsimRdCost(tmpDist, tmpBits, tmpEnergy);
}
else
tmpCost = m_rdCost.calcRdCost(tmpDist, tmpBits);
// remember the better variant so far
if (tmpCost < fullCost.rdcost)
{
bTSkip = useTSkip;
bCBF = !!numSig;
fullCost.rdcost = tmpCost;
fullCost.distortion = tmpDist;
fullCost.bits = tmpBits;
fullCost.energy = tmpEnergy;
}
}
if (bTSkip)
{
// transform skip won: move its coefficients and reconstruction into the layer buffers
memcpy(coeffY, m_tsCoeff, sizeof(coeff_t) << (log2TrSize * 2));
primitives.cu[sizeIdx].copy_pp(reconQt, reconQtStride, m_tsRecon, tuSize);
}
else if (checkTransformSkip)
{
// regular transform won after the skip pass ran: undo the flags and entropy state that pass left behind
cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
cu.setCbfSubParts(bCBF << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
m_entropyCoder.load(m_rqt[fullDepth].rqtTemp);
}
// publish the chosen reconstruction to the frame's recon picture
PicYuv* reconPic = m_frame->m_reconPic;
pixel* picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
intptr_t picStride = reconPic->m_stride;
primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);
outCost.rdcost += fullCost.rdcost;
outCost.distortion += fullCost.distortion;
outCost.bits += fullCost.bits;
outCost.energy += fullCost.energy;
}
/* Intra luma residual coding without any RD decision: the TU is coded whole
 * at the largest size depthRange permits, otherwise it is split into four and
 * each quarter handled recursively.  Coefficients are written to cu.m_trCoeff
 * and the reconstruction directly into the frame's reconstructed picture.
 *
 *   mode       - mode being coded; its cu, predYuv and fencYuv are used
 *   cuGeom     - geometry of the CU
 *   absPartIdx - partition index of this TU inside the CU
 *   tuDepth    - TU depth relative to the CU
 *   depthRange - [0] = smallest, [1] = largest log2 TU size allowed */
void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2])
{
    CUData& cu = mode.cu;
    const uint32_t fullDepth = cuGeom.depth + tuDepth;
    const uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;

    X265_CHECK(m_slice->m_sliceType != I_SLICE, "residualTransformQuantIntra not intended for I slices\n");

    /* rdPenalty level 2 refuses whole 32x32 TUs whenever 16x16 is permitted */
    const bool bForceSplit = m_param->rdPenalty == 2 && log2TrSize == 5 && depthRange[0] <= 4;
    const bool bCodeWhole = log2TrSize <= depthRange[1] && !bForceSplit;

    if (!bCodeWhole)
    {
        X265_CHECK(log2TrSize > depthRange[0], "intra luma split state failure\n");

        const uint32_t childParts = 1 << ((log2TrSize - 1 - LOG2_UNIT_SIZE) * 2);
        uint32_t childCbf = 0;
        uint32_t childIdx = absPartIdx;
        for (uint32_t child = 0; child < 4; child++)
        {
            residualTransformQuantIntra(mode, cuGeom, childIdx, tuDepth + 1, depthRange);
            childCbf |= cu.getCbf(childIdx, TEXT_LUMA, tuDepth + 1);
            childIdx += childParts;
        }

        /* note at this depth whether any quarter carries coefficients */
        cu.m_cbf[0][absPartIdx] |= (childCbf << tuDepth);
        return;
    }

    const uint32_t sizeIdx = log2TrSize - 2;
    const uint32_t stride = mode.fencYuv->m_size;
    const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx);
    pixel* pred = mode.predYuv.getLumaAddr(absPartIdx);
    int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);

    /* intra prediction for this TU */
    const uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
    IntraNeighbors intraNeighbors;
    initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors);
    initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode);
    predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);

    X265_CHECK(!cu.m_transformSkip[TEXT_LUMA][absPartIdx], "unexpected tskip flag in residualTransformQuantIntra\n");
    cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);

    coeff_t* coeffY = cu.m_trCoeff[0] + (absPartIdx << (LOG2_UNIT_SIZE * 2));

    primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);

    PicYuv* reconPic = m_frame->m_reconPic;
    pixel* picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
    const intptr_t picStride = reconPic->m_stride;

    const uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
    if (!numSig)
    {
        /* nothing survived quantization: reconstruction is the prediction */
        primitives.cu[sizeIdx].copy_pp(picReconY, picStride, pred, stride);
        cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
    }
    else
    {
        m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
        primitives.cu[sizeIdx].add_ps(picReconY, picStride, pred, residual, stride, stride);
        cu.setCbfSubParts(1 << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
    }
}
/* Copy the chosen intra luma result out of the per-layer RQT buffers: walk the
 * transform tree recorded in cu.m_tuDepth and, for every leaf TU, copy its
 * coefficients into cu.m_trCoeff[0] and its reconstruction into reconYuv.
 *
 *   cu         - CU whose TU depths steer the walk and which receives coefficients
 *   reconYuv   - destination for the reconstructed luma
 *   tuDepth    - depth of the current node relative to the CU
 *   absPartIdx - partition index of the current node inside the CU */
void Search::extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t tuDepth, uint32_t absPartIdx)
{
    const uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;

    if (tuDepth != cu.m_tuDepth[absPartIdx])
    {
        /* interior node: descend into the four quarters */
        const uint32_t childParts = 1 << ((log2TrSize - 1 - LOG2_UNIT_SIZE) * 2);
        for (uint32_t child = 0; child < 4; child++)
            extractIntraResultQT(cu, reconYuv, tuDepth + 1, absPartIdx + child * childParts);
        return;
    }

    /* leaf: the data lives in the layer matching this TU size */
    const uint32_t qtLayer = log2TrSize - 2;
    const uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
    const coeff_t* src = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
    coeff_t* dst = cu.m_trCoeff[0] + coeffOffsetY;

    memcpy(dst, src, sizeof(coeff_t) << (log2TrSize * 2));
    m_rqt[qtLayer].reconQtYuv.copyPartToPartLuma(reconYuv, absPartIdx, log2TrSize);
}
/* Shift both sub-TU CBF values up by one bit and place the OR of the two
 * original values in the vacated low bit of each.  subTUCBF is updated in
 * place. */
inline void offsetCBFs(uint8_t subTUCBF[2])
{
    const uint8_t eitherCoded = subTUCBF[0] | subTUCBF[1];

    for (int half = 0; half < 2; half++)
        subTUCBF[half] = static_cast<uint8_t>((subTUCBF[half] << 1) | eitherCoded);
}
/* Combine the CBFs of the two sub-TUs that share one chroma TU: read the CBF
 * of each half at tuDepth, run them through offsetCBFs(), and write the
 * resulting values (shifted to tuDepth) back over each half's partition range.
 *
 *   cu         - CU whose CBFs are read and rewritten
 *   ttype      - plane to operate on
 *   tuDepth    - depth of the TU relative to the CU
 *   absPartIdx - partition index of the first half inside the CU */
void Search::offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t tuDepth, uint32_t absPartIdx)
{
    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
    if (log2TrSize == 2)
    {
        X265_CHECK(m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
        /* size 2 is treated as the next size up */
        log2TrSize = 3;
    }

    /* each half covers half of the TU's partitions */
    const uint32_t halfParts = 1 << ((log2TrSize - LOG2_UNIT_SIZE) * 2 - 1);
    const uint32_t secondIdx = absPartIdx + halfParts;

    uint8_t halfCbf[2];
    halfCbf[0] = cu.getCbf(absPartIdx, ttype, tuDepth);
    halfCbf[1] = cu.getCbf(secondIdx, ttype, tuDepth);
    offsetCBFs(halfCbf);

    cu.setCbfPartRange(halfCbf[0] << tuDepth, ttype, absPartIdx, halfParts);
    cu.setCbfPartRange(halfCbf[1] << tuDepth, ttype, secondIdx, halfParts);
}
/* Code the intra chroma residual for the transform tree rooted at
 * (tuDepth, absPartIdx), following the TU split already recorded in
 * cu.m_tuDepth.  For each leaf, both chroma planes are predicted, transformed,
 * quantized and reconstructed; distortion (and psy/ssim energy, when enabled)
 * is added to outCost.  Bits are not counted here.  Leaves eligible for
 * transform skip are handed to codeIntraChromaTSkip() instead.
 *
 *   mode       - mode under test; its cu, predYuv and fencYuv are used
 *   cuGeom     - geometry of the CU
 *   tuDepth    - TU depth relative to the CU
 *   absPartIdx - partition index of this TU inside the CU
 *   outCost    - accumulator for distortion and energy */
void Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost)
{
CUData& cu = mode.cu;
uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
bool bEnableRDOQ = !!m_param->rdoqLevel;
// interior node: recurse into the four quarters and fold their CBFs into this depth's bit
if (tuDepth < cu.m_tuDepth[absPartIdx])
{
uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
uint32_t splitCbfU = 0, splitCbfV = 0;
for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
codeIntraChromaQt(mode, cuGeom, tuDepth + 1, qPartIdx, outCost);
splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
}
cu.m_cbf[1][absPartIdx] |= (splitCbfU << tuDepth);
cu.m_cbf[2][absPartIdx] |= (splitCbfV << tuDepth);
return;
}
uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
uint32_t tuDepthC = tuDepth;
// chroma would be smaller than 4x4: code one 4x4 chroma block per group of four
// partitions, on the first of the group, using the parent's depth
if (log2TrSizeC < 2)
{
X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
if (absPartIdx & 3)
return;
log2TrSizeC = 2;
tuDepthC--;
}
if (bEnableRDOQ)
m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
// transform skip is tried only if enabled in the PPS, the chroma TU is small enough, and the CU is not in bypass mode
bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && log2TrSizeC <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
// in fast mode, additionally require that the co-located luma TU chose transform skip
checkTransformSkip &= !m_param->bEnableTSkipFast || (log2TrSize <= MAX_LOG2_TS_SIZE && cu.m_transformSkip[TEXT_LUMA][absPartIdx]);
if (checkTransformSkip)
{
codeIntraChromaTSkip(mode, cuGeom, tuDepth, tuDepthC, absPartIdx, outCost);
return;
}
ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
uint32_t qtLayer = log2TrSize - 2;
uint32_t stride = mode.fencYuv->m_csize;
const uint32_t sizeIdxC = log2TrSizeC - 2;
uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
// 4:2:2 uses a VERTICAL_SPLIT iterator; other formats a single section
// NOTE(review): presumably two vertically stacked square sub-TUs for 4:2:2 - confirm against TURecurse
const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
TURecurse tuIterator(splitType, curPartNum, absPartIdx);
do
{
uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
IntraNeighbors intraNeighbors;
initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors);
// process U then V for this section
for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
TextType ttype = (TextType)chromaId;
const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC);
uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
coeff_t* coeffC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
PicYuv* reconPic = m_frame->m_reconPic;
pixel* picReconC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
intptr_t picStride = reconPic->m_strideC;
uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
// DM_CHROMA_IDX: take the mode from luma (co-located partition for 4:4:4, partition 0 otherwise)
if (chromaPredMode == DM_CHROMA_IDX)
chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
// 4:2:2 remaps the mode through a lookup table
if (m_csp == X265_CSP_I422)
chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);
predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC);
cu.setTransformSkipPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
// residual -> transform/quantize -> reconstruct into this layer's recon buffer
primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride);
uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
if (numSig)
{
m_quant.invtransformNxN(cu, residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
primitives.cu[sizeIdxC].add_ps(reconQt, reconQtStride, pred, residual, stride, stride);
cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
}
else
{
// no significant coefficients: the reconstruction equals the prediction
primitives.cu[sizeIdxC].copy_pp(reconQt, reconQtStride, pred, stride);
cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
}
// accumulate the scaled chroma distortion and, if enabled, psy/ssim energy
outCost.distortion += m_rdCost.scaleChromaDist(chromaId, primitives.cu[sizeIdxC].sse_pp(reconQt, reconQtStride, fenc, stride));
if (m_rdCost.m_psyRd)
outCost.energy += m_rdCost.psyCost(sizeIdxC, fenc, stride, reconQt, reconQtStride);
else if(m_rdCost.m_ssimRd)
outCost.energy += m_quant.ssimDistortion(cu, fenc, stride, reconQt, reconQtStride, log2TrSizeC, ttype, absPartIdxC);
// publish the reconstruction to the frame's recon picture
primitives.cu[sizeIdxC].copy_pp(picReconC, picStride, reconQt, reconQtStride);
}
}
while (tuIterator.isNextSection());
// 4:2:2: merge the CBFs of the two sections for each plane
if (splitType == VERTICAL_SPLIT)
{
offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
}
}
/* Intra chroma coding of 4x4 chroma TUs (log2TrSizeC is fixed at 2) with a per-TU,
 * per-plane rate-distortion choice between the regular transform and transform-skip.
 *
 *   mode       - candidate mode; its cu, fencYuv and predYuv are used
 *   cuGeom     - geometry of the CU being coded
 *   tuDepth    - TU depth; selects the CBF bit, the saved entropy state and the RQT layer
 *   tuDepthC   - chroma TU depth; used for neighbour setup and the partition count
 *   absPartIdx - first partition covered, relative to the CU
 *   outCost    - distortion and energy of each winning candidate are accumulated here
 *
 * The winning coefficients and reconstruction are left in m_rqt[qtLayer] and the
 * reconstruction is also copied into the frame's recon picture. The entropy coder is
 * returned in the state it had on entry. */
void Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, Cost& outCost)
{
    CUData& cu = mode.cu;
    uint32_t fullDepth = cuGeom.depth + tuDepth;
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
    const uint32_t log2TrSizeC = 2;
    uint32_t qtLayer = log2TrSize - 2;

    /* every bit estimate below is made from this saved state; it is reloaded on exit */
    m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);

    uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
    const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;

    TURecurse tuIterator(splitType, curPartNum, absPartIdx);
    do
    {
        uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;

        IntraNeighbors intraNeighbors;
        initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors);

        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
        {
            TextType ttype = (TextType)chromaId;

            const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
            pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
            int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC);
            uint32_t stride = mode.fencYuv->m_csize;
            const uint32_t sizeIdxC = log2TrSizeC - 2;

            uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
            coeff_t* coeffC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
            pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
            uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;

            /* prepare the neighbouring reference samples for this plane */
            initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);

            /* resolve the prediction mode (DM maps to the luma direction, 4:2:2 is remapped) */
            uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
            if (chromaPredMode == DM_CHROMA_IDX)
                chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
            if (m_csp == X265_CSP_I422)
                chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
            predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC);

            uint64_t bCost = MAX_INT64;
            sse_t bDist = 0;
            uint32_t bCbf = 0;
            uint32_t bEnergy = 0;
            int bTSkip = 0;

            /* pass 0 is the regular transform, pass 1 is transform-skip */
            int checkTransformSkip = 1;
            for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++)
            {
                /* the transform-skip candidate works in scratch buffers so the
                 * regular candidate's results in the RQT layer are not overwritten */
                coeff_t* coeff = (useTSkip ? m_tsCoeff : coeffC);
                pixel* recon = (useTSkip ? m_tsRecon : reconQt);
                uint32_t reconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);

                primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride);

                uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, useTSkip);
                if (numSig)
                {
                    m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSizeC, ttype, true, useTSkip, numSig);
                    primitives.cu[sizeIdxC].add_ps(recon, reconStride, pred, residual, stride, stride);
                    cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
                }
                else if (useTSkip)
                {
                    /* transform-skip left no coefficients: drop this candidate */
                    checkTransformSkip = 0;
                    break;
                }
                else
                {
                    /* no coefficients: the reconstruction is the prediction */
                    primitives.cu[sizeIdxC].copy_pp(recon, reconStride, pred, stride);
                    cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
                }
                sse_t tmpDist = primitives.cu[sizeIdxC].sse_pp(recon, reconStride, fenc, stride);
                tmpDist = m_rdCost.scaleChromaDist(chromaId, tmpDist);

                cu.setTransformSkipPartRange(useTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);

                uint32_t tmpBits = 0, tmpEnergy = 0;
                if (numSig)
                {
                    m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
                    m_entropyCoder.resetBits();
                    m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdxC, log2TrSizeC, (TextType)chromaId);
                    tmpBits = m_entropyCoder.getNumberOfWrittenBits();
                }

                /* Energy must be measured on this candidate's own reconstruction, the
                 * same buffer the distortion above was measured on. While the
                 * transform-skip candidate is evaluated, reconQt still holds the
                 * regular-transform reconstruction. */
                uint64_t tmpCost;
                if (m_rdCost.m_psyRd)
                {
                    tmpEnergy = m_rdCost.psyCost(sizeIdxC, fenc, stride, recon, reconStride);
                    tmpCost = m_rdCost.calcPsyRdCost(tmpDist, tmpBits, tmpEnergy);
                }
                else if (m_rdCost.m_ssimRd)
                {
                    tmpEnergy = m_quant.ssimDistortion(cu, fenc, stride, recon, reconStride, log2TrSizeC, ttype, absPartIdxC);
                    tmpCost = m_rdCost.calcSsimRdCost(tmpDist, tmpBits, tmpEnergy);
                }
                else
                    tmpCost = m_rdCost.calcRdCost(tmpDist, tmpBits);

                if (tmpCost < bCost)
                {
                    bCost = tmpCost;
                    bDist = tmpDist;
                    bTSkip = useTSkip;
                    bCbf = !!numSig;
                    bEnergy = tmpEnergy;
                }
            }

            /* transform-skip won: move its scratch results into the RQT layer */
            if (bTSkip)
            {
                memcpy(coeffC, m_tsCoeff, sizeof(coeff_t) << (log2TrSizeC * 2));
                primitives.cu[sizeIdxC].copy_pp(reconQt, reconQtStride, m_tsRecon, MAX_TS_SIZE);
            }

            /* the trial passes left their own flags in the CU; write the winner's */
            cu.setCbfPartRange(bCbf << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
            cu.setTransformSkipPartRange(bTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);

            /* publish the reconstruction to the frame's recon picture */
            PicYuv* reconPic = m_frame->m_reconPic;
            pixel* reconPicC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
            intptr_t picStride = reconPic->m_strideC;
            primitives.cu[sizeIdxC].copy_pp(reconPicC, picStride, reconQt, reconQtStride);

            outCost.distortion += bDist;
            outCost.energy += bEnergy;
        }
    }
    while (tuIterator.isNextSection());

    if (splitType == VERTICAL_SPLIT)
    {
        offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
        offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
    }

    m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
}
/* Walk the chroma transform tree rooted at (absPartIdx, tuDepth). At each leaf, copy
 * the U and V coefficients from the RQT layer buffers into cu.m_trCoeff and the chroma
 * reconstruction into reconYuv. A node is a leaf when the stored luma TU depth equals
 * tuDepth, or when the chroma transform has reached log2 size 2. */
void Search::extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t tuDepth)
{
    const uint32_t lumaTuDepth = cu.m_tuDepth[absPartIdx];
    const uint32_t log2Size = cu.m_log2CUSize[0] - tuDepth;
    const uint32_t log2SizeC = log2Size - m_hChromaShift;

    if (lumaTuDepth != tuDepth && log2SizeC != 2)
    {
        /* interior node: descend into the four quadrants in order */
        const uint32_t partsPerQuadrant = 1 << (log2Size - 1 - LOG2_UNIT_SIZE) * 2;
        for (uint32_t quadrant = 0; quadrant < 4; ++quadrant)
            extractIntraResultChromaQT(cu, reconYuv, absPartIdx + quadrant * partsPerQuadrant, tuDepth + 1);
        return;
    }

    /* leaf: the layer index accounts for chroma stopping above the luma depth */
    const uint32_t layer = log2Size - 2 - (lumaTuDepth - tuDepth);
    const uint32_t coeffCount = 1 << (log2SizeC * 2 + (m_csp == X265_CSP_I422));
    const uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));

    /* planes 1 and 2 are U and V */
    for (uint32_t plane = 1; plane <= 2; plane++)
    {
        const coeff_t* from = m_rqt[layer].coeffRQT[plane] + coeffOffset;
        coeff_t* to = cu.m_trCoeff[plane] + coeffOffset;
        memcpy(to, from, sizeof(coeff_t) * coeffCount);
    }

    m_rqt[layer].reconQtYuv.copyPartToPartChroma(reconYuv, absPartIdx, log2SizeC + m_hChromaShift);
}
/* Intra chroma prediction, transform and quantization along the transform tree already
 * stored in cu.m_tuDepth, without any rate-distortion decision. Coefficients go
 * directly into cu.m_trCoeff and the reconstruction directly into the frame's recon
 * picture. CBF flags are set per TU and merged upwards into the parent partition. */
void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth)
{
    CUData& cu = mode.cu;
    const uint32_t log2Size = cu.m_log2CUSize[absPartIdx] - tuDepth;

    if (tuDepth < cu.m_tuDepth[absPartIdx])
    {
        /* interior node: code the four children, then fold their CBFs into this depth */
        const uint32_t partsPerQuadrant = 1 << (log2Size - 1 - LOG2_UNIT_SIZE) * 2;
        uint32_t anyCbfU = 0;
        uint32_t anyCbfV = 0;
        uint32_t childIdx = absPartIdx;
        for (uint32_t quadrant = 0; quadrant < 4; ++quadrant)
        {
            residualQTIntraChroma(mode, cuGeom, childIdx, tuDepth + 1);
            anyCbfU |= cu.getCbf(childIdx, TEXT_CHROMA_U, tuDepth + 1);
            anyCbfV |= cu.getCbf(childIdx, TEXT_CHROMA_V, tuDepth + 1);
            childIdx += partsPerQuadrant;
        }
        cu.m_cbf[1][absPartIdx] |= (anyCbfU << tuDepth);
        cu.m_cbf[2][absPartIdx] |= (anyCbfV << tuDepth);
        return;
    }

    uint32_t log2SizeC = log2Size - m_hChromaShift;
    uint32_t chromaTuDepth = tuDepth;
    if (log2SizeC < 2)
    {
        X265_CHECK(log2Size == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");

        /* chroma is coded once, on the first of the four luma blocks, one level up */
        if (absPartIdx & 3)
            return;
        log2SizeC = 2;
        chromaTuDepth--;
    }

    ShortYuv& residualYuv = m_rqt[cuGeom.depth].tmpResiYuv;
    const uint32_t fencStride = mode.fencYuv->m_csize;
    const uint32_t blockIdx = log2SizeC - 2;
    const uint32_t partCount = cuGeom.numPartitions >> chromaTuDepth * 2;
    const SplitType split = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;

    PicYuv* const reconPic = m_frame->m_reconPic;
    const intptr_t picStride = reconPic->m_strideC;

    TURecurse section(split, partCount, absPartIdx);
    do
    {
        const uint32_t partIdxC = section.absPartIdxTURelCU;

        IntraNeighbors neighbors;
        initIntraNeighbors(cu, partIdxC, chromaTuDepth, false, &neighbors);

        for (uint32_t plane = TEXT_CHROMA_U; plane <= TEXT_CHROMA_V; plane++)
        {
            const TextType planeType = (TextType)plane;

            const pixel* source = mode.fencYuv->getChromaAddr(plane, partIdxC);
            pixel* prediction = mode.predYuv.getChromaAddr(plane, partIdxC);
            int16_t* resi = residualYuv.getChromaAddr(plane, partIdxC);
            coeff_t* coeffs = cu.m_trCoeff[planeType]
                            + (partIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift)));
            pixel* picRecon = reconPic->getChromaAddr(plane, cu.m_cuAddr, cuGeom.absPartIdx + partIdxC);

            /* resolve the prediction mode (DM maps to the luma direction, 4:2:2 is remapped) */
            uint32_t predMode = cu.m_chromaIntraDir[partIdxC];
            if (predMode == DM_CHROMA_IDX)
                predMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? partIdxC : 0];
            if (m_csp == X265_CSP_I422)
                predMode = g_chroma422IntraAngleMappingTable[predMode];

            initAdiPatternChroma(cu, cuGeom, partIdxC, neighbors, plane);
            predIntraChromaAng(predMode, prediction, fencStride, log2SizeC);

            X265_CHECK(!cu.m_transformSkip[planeType][0], "transform skip not supported at low RD levels\n");

            primitives.cu[blockIdx].calcresidual(source, prediction, resi, fencStride);
            const uint32_t sigCount = m_quant.transformNxN(cu, source, fencStride, resi, fencStride, coeffs, log2SizeC, planeType, partIdxC, false);
            if (!sigCount)
            {
                /* nothing survived quantization: reconstruction is the prediction */
                primitives.cu[blockIdx].copy_pp(picRecon, picStride, prediction, fencStride);
                cu.setCbfPartRange(0, planeType, partIdxC, section.absPartIdxStep);
            }
            else
            {
                m_quant.invtransformNxN(cu, resi, fencStride, coeffs, log2SizeC, planeType, true, false, sigCount);
                primitives.cu[blockIdx].add_ps(picRecon, picStride, prediction, resi, fencStride, fencStride);
                cu.setCbfPartRange(1 << tuDepth, planeType, partIdxC, section.absPartIdxStep);
            }
        }
    }
    while (section.isNextSection());

    if (split == VERTICAL_SPLIT)
    {
        offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
        offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
    }
}
/* Evaluate an intra candidate with the given partition size: run the luma and (unless
 * the encode is 4:0:0) chroma intra searches, then count the bits of the complete CU
 * and fill in the mode's distortion, bit and energy fields before computing its cost. */
void Search::checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize)
{
    CUData& cu = intraMode.cu;
    cu.setPartSizeSubParts(partSize);
    cu.setPredModeSubParts(MODE_INTRA);

    uint32_t tuDepthRange[2];
    cu.getIntraTUQtDepthRange(tuDepthRange, 0);

    /* prediction and residual search */
    intraMode.initCosts();
    intraMode.lumaDistortion += estIntraPredQT(intraMode, cuGeom, tuDepthRange);

    const bool hasChroma = m_csp != X265_CSP_I400;
    if (hasChroma)
        intraMode.chromaDistortion += estIntraPredChromaQT(intraMode, cuGeom);
    intraMode.distortion += hasChroma ? intraMode.lumaDistortion + intraMode.chromaDistortion
                                      : intraMode.lumaDistortion;
    cu.m_distortion[0] = intraMode.distortion;

    /* bit accounting for the whole CU */
    m_entropyCoder.resetBits();
    if (m_slice->m_pps->bTransquantBypassEnabled)
        m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);

    /* skip flag and prediction mode are only signalled outside intra slices */
    int skipFlagBits = 0;
    if (!m_slice->isIntra())
    {
        m_entropyCoder.codeSkipFlag(cu, 0);
        skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
        m_entropyCoder.codePredMode(cu.m_predMode[0]);
    }

    m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
    m_entropyCoder.codePredInfo(cu, 0);
    intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;

    bool bCodeDQP = m_slice->m_pps->bUseDQP;
    m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
    m_entropyCoder.store(intraMode.contexts);

    intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
    intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits - skipFlagBits;

    /* luma-only energy terms for psy-rd / ssim-rd, and the prediction error energy */
    const Yuv* source = intraMode.fencYuv;
    const Yuv& recon = intraMode.reconYuv;
    const Yuv& prediction = intraMode.predYuv;
    if (m_rdCost.m_psyRd)
    {
        intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, source->m_buf[0], source->m_size, recon.m_buf[0], recon.m_size);
    }
    else if (m_rdCost.m_ssimRd)
    {
        intraMode.ssimEnergy = m_quant.ssimDistortion(cu, source->m_buf[0], source->m_size, recon.m_buf[0], recon.m_size, cuGeom.log2CUSize, TEXT_LUMA, 0);
    }
    intraMode.resEnergy = primitives.cu[cuGeom.log2CUSize - 2].sse_pp(source->m_buf[0], source->m_size, prediction.m_buf[0], prediction.m_size);

    updateModeCost(intraMode);
    checkDQP(intraMode, cuGeom);
}
/* SA8D-based luma intra mode decision for a 2Nx2N intra candidate. No residual is
 * coded: each mode is ranked by calcRdSADCost(sa8d, mode bits). The winning mode is
 * written into the CU, and its bits, SA8D distortion and SA8D cost are stored in
 * intraMode (totalBits, distortion, sa8dCost, sa8dBits). */
void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
{
ProfileCUScope(intraMode.cu, intraAnalysisElapsedTime, countIntraAnalysis);
CUData& cu = intraMode.cu;
uint32_t depth = cuGeom.depth;
cu.setPartSizeSubParts(SIZE_2Nx2N);
cu.setPredModeSubParts(MODE_INTRA);
const uint32_t initTuDepth = 0;
uint32_t log2TrSize = cuGeom.log2CUSize - initTuDepth;
uint32_t tuSize = 1 << log2TrSize;
const uint32_t absPartIdx = 0;
// build the neighbouring reference sample buffers for the whole CU
IntraNeighbors intraNeighbors;
initIntraNeighbors(cu, absPartIdx, initTuDepth, true, &intraNeighbors);
initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, ALL_IDX);
const pixel* fenc = intraMode.fencYuv->m_buf[0];
uint32_t stride = intraMode.fencYuv->m_size;
// sad/bits/mode/cost hold the current trial; the b-prefixed copies hold the best so far
int sad, bsad;
uint32_t bits, bbits, mode, bmode;
uint64_t cost, bcost;
int scaleTuSize = tuSize;
int scaleStride = stride;
int costShift = 0;
int sizeIdx = log2TrSize - 2;
if (tuSize > 32)
{
// blocks larger than 32 are evaluated on a 64->32 downscaled source with
// downscaled neighbours; costShift (<< 2) rescales the measured cost
primitives.scale2D_64to32(m_fencScaled, fenc, stride);
fenc = m_fencScaled;
pixel nScale[129];
intraNeighbourBuf[1][0] = intraNeighbourBuf[0][0];
primitives.scale1D_128to64(nScale + 1, intraNeighbourBuf[0] + 1);
// both neighbour buffers receive the same scaled samples
memcpy(&intraNeighbourBuf[0][1], &nScale[1], 2 * 64 * sizeof(pixel));
memcpy(&intraNeighbourBuf[1][1], &nScale[1], 2 * 64 * sizeof(pixel));
scaleTuSize = 32;
scaleStride = 32;
costShift = 2;
sizeIdx = 5 - 2;
}
pixelcmp_t sa8d = primitives.cu[sizeIdx].sa8d;
int predsize = scaleTuSize * scaleTuSize;
m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);
// mpms is used below as a bitmask (bit N set => mode N is costed via
// bitsIntraModeMPM); every other mode costs rbits
uint64_t mpms;
uint32_t mpmModes[3];
uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, mpmModes, mpms);
// DC is evaluated first and seeds the best-so-far values
primitives.cu[sizeIdx].intra_pred[DC_IDX](m_intraPredAngs, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16));
bsad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleStride) << costShift;
bmode = mode = DC_IDX;
bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
bcost = m_rdCost.calcRdSADCost(bsad, bbits);
// PLANAR reads neighbour buffer [1] when tuSize is 8, 16 or 32, otherwise buffer [0]
pixel* planar = intraNeighbourBuf[0];
if (tuSize & (8 | 16 | 32))
planar = intraNeighbourBuf[1];
primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](m_intraPredAngs, scaleStride, planar, 0, 0);
sad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleStride) << costShift;
mode = PLANAR_IDX;
bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
cost = m_rdCost.calcRdSADCost(sad, bits);
COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
// when an all-angles primitive exists, every angular prediction is generated at
// once into m_intraPredAngs (predsize pixels per mode, starting at mode 2) and a
// transposed copy of the source is prepared for modes below 18
bool allangs = true;
if (primitives.cu[sizeIdx].intra_pred_allangs)
{
primitives.cu[sizeIdx].transpose(m_fencTransposed, fenc, scaleStride);
primitives.cu[sizeIdx].intra_pred_allangs(m_intraPredAngs, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16));
}
else
allangs = false;
// TRY_ANGLE(angle) leaves the trial's sad, bits and cost in the locals of the same
// name. With allangs it reads the pre-generated prediction, otherwise it predicts
// the single angle on demand, choosing the neighbour buffer via g_intraFilterFlags.
#define TRY_ANGLE(angle) \
if (allangs) { \
if (angle < 18) \
sad = sa8d(m_fencTransposed, scaleTuSize, &m_intraPredAngs[(angle - 2) * predsize], scaleTuSize) << costShift; \
else \
sad = sa8d(fenc, scaleStride, &m_intraPredAngs[(angle - 2) * predsize], scaleTuSize) << costShift; \
bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, angle) : rbits; \
cost = m_rdCost.calcRdSADCost(sad, bits); \
} else { \
int filter = !!(g_intraFilterFlags[angle] & scaleTuSize); \
primitives.cu[sizeIdx].intra_pred[angle](m_intraPredAngs, scaleTuSize, intraNeighbourBuf[filter], angle, scaleTuSize <= 16); \
sad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleTuSize) << costShift; \
bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, angle) : rbits; \
cost = m_rdCost.calcRdSADCost(sad, bits); \
}
if (m_param->bEnableFastIntra)
{
// fast search: a-prefixed locals track the best angular mode found so far
int asad = 0;
uint32_t lowmode, highmode, amode = 5, abits = 0;
uint64_t acost = MAX_INT64;
// coarse pass over angles 5, 10, ..., 30
for (mode = 5; mode < 35; mode += 5)
{
TRY_ANGLE(mode);
COPY4_IF_LT(acost, cost, amode, mode, asad, sad, abits, bits);
}
// refine around the current best at distance 2, then distance 1
for (uint32_t dist = 2; dist >= 1; dist--)
{
lowmode = amode - dist;
highmode = amode + dist;
X265_CHECK(lowmode >= 2 && lowmode <= 34, "low intra mode out of range\n");
TRY_ANGLE(lowmode);
COPY4_IF_LT(acost, cost, amode, lowmode, asad, sad, abits, bits);
X265_CHECK(highmode >= 2 && highmode <= 34, "high intra mode out of range\n");
TRY_ANGLE(highmode);
COPY4_IF_LT(acost, cost, amode, highmode, asad, sad, abits, bits);
}
// mode 34 is only reachable from 33, so it gets an explicit trial
if (amode == 33)
{
TRY_ANGLE(34);
COPY4_IF_LT(acost, cost, amode, 34, asad, sad, abits, bits);
}
// best angular mode competes with the DC/PLANAR winner
COPY4_IF_LT(bcost, acost, bmode, amode, bsad, asad, bbits, abits);
}
else
{
// exhaustive search over all angular modes 2..34
for (mode = 2; mode < 35; mode++)
{
TRY_ANGLE(mode);
COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
}
}
// record the decision; initCosts() runs first, then the SA8D fields are filled in
cu.setLumaIntraDirSubParts((uint8_t)bmode, absPartIdx, depth + initTuDepth);
intraMode.initCosts();
intraMode.totalBits = bbits;
intraMode.distortion = bsad;
intraMode.sa8dCost = bcost;
intraMode.sa8dBits = bbits;
}
/* Full encode of a 2Nx2N intra candidate in a non-intra slice, using the luma
 * direction already stored in the CU: code the luma residual quadtree, search chroma
 * (unless the encode is 4:0:0), count the bits of the whole CU and compute the cost. */
void Search::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
{
    ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]);

    CUData& cu = intraMode.cu;
    Yuv& recon = intraMode.reconYuv;

    X265_CHECK(cu.m_partSize[0] == SIZE_2Nx2N, "encodeIntraInInter does not expect NxN intra\n");
    X265_CHECK(!m_slice->isIntra(), "encodeIntraInInter does not expect to be used in I slices\n");

    uint32_t tuDepthRange[2];
    cu.getIntraTUQtDepthRange(tuDepthRange, 0);

    m_entropyCoder.load(m_rqt[cuGeom.depth].cur);

    /* luma residual quadtree, then pull its result into the mode's buffers */
    Cost lumaCosts;
    codeIntraLumaQT(intraMode, cuGeom, 0, 0, false, lumaCosts, tuDepthRange);
    extractIntraResultQT(cu, recon, 0, 0);

    intraMode.lumaDistortion = lumaCosts.distortion;
    if (m_csp == X265_CSP_I400)
    {
        intraMode.distortion = intraMode.lumaDistortion;
    }
    else
    {
        intraMode.chromaDistortion = estIntraPredChromaQT(intraMode, cuGeom);
        intraMode.distortion = intraMode.lumaDistortion + intraMode.chromaDistortion;
    }

    /* bit accounting for the whole CU */
    m_entropyCoder.resetBits();
    if (m_slice->m_pps->bTransquantBypassEnabled)
        m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);

    m_entropyCoder.codeSkipFlag(cu, 0);
    int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();

    m_entropyCoder.codePredMode(cu.m_predMode[0]);
    m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
    m_entropyCoder.codePredInfo(cu, 0);
    intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;

    bool bCodeDQP = m_slice->m_pps->bUseDQP;
    m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);

    intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
    intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits - skipFlagBits;

    /* luma-only energy terms for psy-rd / ssim-rd, and the prediction error energy */
    const Yuv* source = intraMode.fencYuv;
    const Yuv& prediction = intraMode.predYuv;
    if (m_rdCost.m_psyRd)
    {
        intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, source->m_buf[0], source->m_size, recon.m_buf[0], recon.m_size);
    }
    else if (m_rdCost.m_ssimRd)
    {
        intraMode.ssimEnergy = m_quant.ssimDistortion(cu, source->m_buf[0], source->m_size, recon.m_buf[0], recon.m_size, cuGeom.log2CUSize, TEXT_LUMA, 0);
    }
    intraMode.resEnergy = primitives.cu[cuGeom.log2CUSize - 2].sse_pp(source->m_buf[0], source->m_size, prediction.m_buf[0], prediction.m_size);

    m_entropyCoder.store(intraMode.contexts);
    updateModeCost(intraMode);
    checkDQP(intraMode, cuGeom);
}
/* Luma intra search for every PU of the CU (one PU for 2Nx2N, four otherwise).
 * For each PU: either reuse the direction already stored in the CU, or rank all 35
 * modes by SA8D cost, run the residual quadtree on a short candidate list and keep the
 * cheapest. The winner is then re-coded, its result extracted into the mode's recon
 * buffer, and its distortion accumulated.
 *
 *   depthRange - TU depth range, passed through to codeIntraLumaQT
 *
 * Returns the summed luma distortion of the chosen modes. The entropy coder is
 * reloaded from m_rqt[depth].cur before returning. */
sse_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2])
{
CUData& cu = intraMode.cu;
Yuv* reconYuv = &intraMode.reconYuv;
Yuv* predYuv = &intraMode.predYuv;
const Yuv* fencYuv = intraMode.fencYuv;
uint32_t depth = cuGeom.depth;
// non-2Nx2N partitions start one TU level down and have four PUs
uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N;
uint32_t numPU = 1 << (2 * initTuDepth);
uint32_t log2TrSize = cuGeom.log2CUSize - initTuDepth;
uint32_t tuSize = 1 << log2TrSize;
uint32_t qNumParts = cuGeom.numPartitions >> 2;
uint32_t sizeIdx = log2TrSize - 2;
uint32_t absPartIdx = 0;
sse_t totalDistortion = 0;
// transform-skip is only tried for non-2Nx2N partitions, when the PPS enables it
// and the CU is not transquant-bypassed
int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0] && cu.m_partSize[0] != SIZE_2Nx2N;
for (uint32_t puIdx = 0; puIdx < numPU; puIdx++, absPartIdx += qNumParts)
{
uint32_t bmode = 0;
// a direction other than ALL_IDX means the mode is already decided: skip the search
// NOTE(review): this lookup is indexed by puIdx while everything else in the loop
// uses absPartIdx; they coincide only when qNumParts == 1 -- confirm callers
if (intraMode.cu.m_lumaIntraDir[puIdx] != (uint8_t)ALL_IDX)
bmode = intraMode.cu.m_lumaIntraDir[puIdx];
else
{
uint64_t candCostList[MAX_RD_INTRA_MODES];
uint32_t rdModeList[MAX_RD_INTRA_MODES];
uint64_t bcost;
// candidate list length grows with the RD level and with depth
// NOTE(review): assumes maxCandCount <= MAX_RD_INTRA_MODES -- confirm
int maxCandCount = 2 + m_param->rdLevel + ((depth + initTuDepth) >> 1);
{
ProfileCUScope(intraMode.cu, intraAnalysisElapsedTime, countIntraAnalysis);
// build the neighbouring reference sample buffers for this PU
IntraNeighbors intraNeighbors;
initIntraNeighbors(cu, absPartIdx, initTuDepth, true, &intraNeighbors);
initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, ALL_IDX);
const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
uint32_t stride = predYuv->m_size;
// no downscaling in this function: these keep their initial values throughout
int scaleTuSize = tuSize;
int scaleStride = stride;
int costShift = 0;
m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);
// mpms is used below as a bitmask (bit N set => mode N is costed via
// bitsIntraModeMPM); every other mode costs rbits
uint64_t mpms;
uint32_t mpmModes[3];
uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, mpmModes, mpms);
pixelcmp_t sa8d = primitives.cu[sizeIdx].sa8d;
uint64_t modeCosts[35];
// DC
primitives.cu[sizeIdx].intra_pred[DC_IDX](m_intraPred, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16));
uint32_t bits = (mpms & ((uint64_t)1 << DC_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, DC_IDX) : rbits;
uint32_t sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift;
modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits);
// PLANAR reads neighbour buffer [1] for sizes 8..32, otherwise buffer [0]
pixel* planar = intraNeighbourBuf[0];
if (tuSize >= 8 && tuSize <= 32)
planar = intraNeighbourBuf[1];
primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](m_intraPred, scaleStride, planar, 0, 0);
bits = (mpms & ((uint64_t)1 << PLANAR_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, PLANAR_IDX) : rbits;
sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift;
modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits);
COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]);
// angular modes 2..34
if (primitives.cu[sizeIdx].intra_pred_allangs)
{
// all predictions are generated at once; modes below 18 are compared
// against a transposed copy of the source
primitives.cu[sizeIdx].transpose(m_fencTransposed, fenc, scaleStride);
primitives.cu[sizeIdx].intra_pred_allangs(m_intraPredAngs, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16));
for (int mode = 2; mode < 35; mode++)
{
bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
if (mode < 18)
sad = sa8d(m_fencTransposed, scaleTuSize, &m_intraPredAngs[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
else
sad = sa8d(fenc, scaleStride, &m_intraPredAngs[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
COPY1_IF_LT(bcost, modeCosts[mode]);
}
}
else
{
// no all-angles primitive: predict each angle separately, choosing the
// neighbour buffer via g_intraFilterFlags
for (int mode = 2; mode < 35; mode++)
{
bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
int filter = !!(g_intraFilterFlags[mode] & scaleTuSize);
primitives.cu[sizeIdx].intra_pred[mode](m_intraPred, scaleTuSize, intraNeighbourBuf[filter], mode, scaleTuSize <= 16);
sad = sa8d(fenc, scaleStride, m_intraPred, scaleTuSize) << costShift;
modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
COPY1_IF_LT(bcost, modeCosts[mode]);
}
}
// build the RD candidate list: modes within 25% of the best SA8D cost, plus
// the first most-probable mode regardless of its cost
for (int i = 0; i < maxCandCount; i++)
candCostList[i] = MAX_INT64;
uint64_t paddedBcost = bcost + (bcost >> 2);
for (int mode = 0; mode < 35; mode++)
if ((modeCosts[mode] < paddedBcost) || ((uint32_t)mode == mpmModes[0]))
updateCandList(mode, modeCosts[mode], maxCandCount, rdModeList, candCostList);
}
// trial-code each candidate and keep the one with the lowest rdcost
bcost = MAX_INT64;
for (int i = 0; i < maxCandCount; i++)
{
// unused list slots still hold MAX_INT64; stop at the first one
if (candCostList[i] == MAX_INT64)
break;
ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]);
// every candidate is measured from the same entropy state
m_entropyCoder.load(m_rqt[depth].cur);
cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTuDepth);
Cost icosts;
if (checkTransformSkip)
codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
else
codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, false, icosts, depthRange);
COPY2_IF_LT(bcost, icosts.rdcost, bmode, rdModeList[i]);
}
}
// code the PU again with the chosen mode; this pass passes true where the trial
// passes above pass false to codeIntraLumaQT
ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]);
cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTuDepth);
m_entropyCoder.load(m_rqt[depth].cur);
Cost icosts;
if (checkTransformSkip)
codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
else
codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, true, icosts, depthRange);
totalDistortion += icosts.distortion;
extractIntraResultQT(cu, *reconYuv, initTuDepth, absPartIdx);
// copy this PU's reconstruction into the frame recon picture for every PU except
// the last (presumably so following PUs can predict from it)
if (puIdx != numPU - 1)
{
PicYuv* reconPic = m_frame->m_reconPic;
pixel* dst = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
uint32_t dststride = reconPic->m_stride;
const pixel* src = reconYuv->getLumaAddr(absPartIdx);
uint32_t srcstride = reconYuv->m_size;
primitives.cu[log2TrSize - 2].copy_pp(dst, dststride, src, srcstride);
}
}
// with four PUs, OR the depth-1 luma CBF of each quadrant into the first partition
if (numPU > 1)
{
uint32_t combCbfY = 0;
for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
combCbfY |= cu.getCbf(qPartIdx, TEXT_LUMA, 1);
cu.m_cbf[0][0] |= combCbfY;
}
// leave the entropy coder in the state stored for this depth
m_entropyCoder.load(m_rqt[depth].cur);
return totalDistortion;
}
/* Pick the chroma intra direction for the whole CU by SA8D alone (no residual coding):
 * each allowed chroma mode is predicted for U and V and the summed SA8D decides.
 * The winning entry of the allowed-mode list is written into the CU.
 *
 * When the chroma block would exceed 32x32 it is evaluated as a 32x32 block at TU
 * depth 1, with the cost shifted left by 2. */
void Search::getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom)
{
    CUData& cu = intraMode.cu;
    const Yuv* fencYuv = intraMode.fencYuv;
    Yuv* predYuv = &intraMode.predYuv;

    uint32_t bestMode = 0;
    uint64_t bestCost = MAX_INT64;
    uint32_t modeList[NUM_CHROMA_MODE];

    uint32_t log2TrSizeC = cu.m_log2CUSize[0] - m_hChromaShift;
    uint32_t tuSize = 1 << log2TrSizeC;
    uint32_t tuDepth = 0;
    int32_t costShift = 0;

    if (tuSize > 32)
    {
        tuDepth = 1;
        costShift = 2;
        log2TrSizeC = 5;
    }

    /* The prediction is written with the source plane's pitch and both SA8D operands
     * are read with it. The source block must never be walked with the prediction
     * buffer's pitch, which would read the wrong rows whenever the two differ. */
    const uint32_t stride = fencYuv->m_csize;

    IntraNeighbors intraNeighbors;
    initIntraNeighbors(cu, 0, tuDepth, false, &intraNeighbors);
    cu.getAllowedChromaDir(0, modeList);

    for (uint32_t mode = 0; mode < NUM_CHROMA_MODE; mode++)
    {
        /* resolve the prediction mode (DM maps to the luma direction, 4:2:2 is remapped) */
        uint32_t chromaPredMode = modeList[mode];
        if (chromaPredMode == DM_CHROMA_IDX)
            chromaPredMode = cu.m_lumaIntraDir[0];
        if (m_csp == X265_CSP_I422)
            chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];

        uint64_t cost = 0;
        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
        {
            const pixel* fenc = fencYuv->m_buf[chromaId];
            pixel* pred = predYuv->m_buf[chromaId];

            Predict::initAdiPatternChroma(cu, cuGeom, 0, intraNeighbors, chromaId);
            predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC);
            cost += primitives.cu[log2TrSizeC - 2].sa8d(fenc, stride, pred, stride) << costShift;
        }

        /* remember the list entry, not the resolved mode, so DM stays DM */
        if (cost < bestCost)
        {
            bestCost = cost;
            bestMode = modeList[mode];
        }
    }

    cu.setChromIntraDirSubParts(bestMode, 0, cuGeom.depth);
}
/* Chroma intra search with residual coding. For each chroma section (one for the
 * whole CU, or four when the partition is not 2Nx2N in 4:4:4), every candidate chroma
 * mode is coded with codeIntraChromaQt, its bits are counted, and the cheapest mode's
 * reconstruction, CBFs and transform-skip flags are kept.
 *
 * Returns the summed chroma distortion of the chosen modes. The entropy coder is
 * reloaded from m_rqt[depth].cur before returning. */
sse_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom)
{
CUData& cu = intraMode.cu;
Yuv& reconYuv = intraMode.reconYuv;
uint32_t depth = cuGeom.depth;
// chroma is only split per PU for non-2Nx2N partitions in 4:4:4
uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N && m_csp == X265_CSP_I444;
uint32_t log2TrSize = cuGeom.log2CUSize - initTuDepth;
uint32_t absPartStep = cuGeom.numPartitions;
sse_t totalDistortion = 0;
int size = partitionFromLog2Size(log2TrSize);
TURecurse tuIterator((initTuDepth == 0) ? DONT_SPLIT : QUAD_SPLIT, absPartStep, 0);
do
{
uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
uint32_t bestMode = 0;
sse_t bestDist = 0;
uint64_t bestCost = MAX_INT64;
uint32_t minMode = 0;
uint32_t maxMode = NUM_CHROMA_MODE;
uint32_t modeList[NUM_CHROMA_MODE];
// a chroma direction other than ALL_IDX (unsplit case only) means the mode is
// already decided: the list is filled with it and a single candidate is tried
if (intraMode.cu.m_chromaIntraDir[0] != (uint8_t)ALL_IDX && !initTuDepth)
{
for (uint32_t l = 0; l < NUM_CHROMA_MODE; l++)
modeList[l] = intraMode.cu.m_chromaIntraDir[0];
maxMode = 1;
}
else
cu.getAllowedChromaDir(absPartIdxC, modeList);
// source picture is 4:0:0 but the encode is not: only the first mode is tried
if (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)
{
for (uint32_t l = 1; l < NUM_CHROMA_MODE; l++)
modeList[l] = modeList[0];
maxMode = 1;
}
for (uint32_t mode = minMode; mode < maxMode; mode++)
{
// every candidate starts from the same entropy state
m_entropyCoder.load(m_rqt[depth].cur);
cu.setChromIntraDirSubParts(modeList[mode], absPartIdxC, depth + initTuDepth);
Cost outCost;
codeIntraChromaQt(intraMode, cuGeom, initTuDepth, absPartIdxC, outCost);
// with transform-skip enabled the state is reloaded again before counting bits
if (m_slice->m_pps->bTransformSkipEnabled)
m_entropyCoder.load(m_rqt[depth].cur);
m_entropyCoder.resetBits();
// the chroma direction is coded at partition 0 (2Nx2N or non-4:4:4), or at
// the first partition of each quadrant otherwise
if (cu.m_partSize[0] == SIZE_2Nx2N || m_csp != X265_CSP_I444)
{
if (!absPartIdxC)
m_entropyCoder.codeIntraDirChroma(cu, absPartIdxC, modeList);
}
else
{
uint32_t qNumParts = cuGeom.numPartitions >> 2;
if (!(absPartIdxC & (qNumParts - 1)))
m_entropyCoder.codeIntraDirChroma(cu, absPartIdxC, modeList);
}
codeSubdivCbfQTChroma(cu, initTuDepth, absPartIdxC);
codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_U);
codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_V);
uint32_t bits = m_entropyCoder.getNumberOfWrittenBits();
// cost model follows the active RD mode: psy-rd, else ssim-rd, else plain RD
uint64_t cost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(outCost.distortion, bits, outCost.energy) : m_rdCost.m_ssimRd ? m_rdCost.calcSsimRdCost(outCost.distortion, bits, outCost.energy)
: m_rdCost.calcRdCost(outCost.distortion, bits);
if (cost < bestCost)
{
bestCost = cost;
bestDist = outCost.distortion;
bestMode = modeList[mode];
// later candidates overwrite the CU's flags and the RQT buffers, so the
// winner's reconstruction, CBFs and transform-skip flags are saved now
extractIntraResultChromaQT(cu, reconYuv, absPartIdxC, initTuDepth);
memcpy(m_qtTempCbf[1], cu.m_cbf[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
memcpy(m_qtTempCbf[2], cu.m_cbf[2] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
memcpy(m_qtTempTransformSkipFlag[1], cu.m_transformSkip[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
memcpy(m_qtTempTransformSkipFlag[2], cu.m_transformSkip[2] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
}
}
// copy the winner's Cb and Cr reconstruction into the frame recon picture for
// every section except the last
if (!tuIterator.isLastSection())
{
uint32_t zorder = cuGeom.absPartIdx + absPartIdxC;
PicYuv* reconPic = m_frame->m_reconPic;
uint32_t dststride = reconPic->m_strideC;
const pixel* src;
pixel* dst;
dst = reconPic->getCbAddr(cu.m_cuAddr, zorder);
src = reconYuv.getCbAddr(absPartIdxC);
primitives.chroma[m_csp].cu[size].copy_pp(dst, dststride, src, reconYuv.m_csize);
dst = reconPic->getCrAddr(cu.m_cuAddr, zorder);
src = reconYuv.getCrAddr(absPartIdxC);
primitives.chroma[m_csp].cu[size].copy_pp(dst, dststride, src, reconYuv.m_csize);
}
// restore the winner's flags and direction into the CU
memcpy(cu.m_cbf[1] + absPartIdxC, m_qtTempCbf[1], tuIterator.absPartIdxStep * sizeof(uint8_t));
memcpy(cu.m_cbf[2] + absPartIdxC, m_qtTempCbf[2], tuIterator.absPartIdxStep * sizeof(uint8_t));
memcpy(cu.m_transformSkip[1] + absPartIdxC, m_qtTempTransformSkipFlag[1], tuIterator.absPartIdxStep * sizeof(uint8_t));
memcpy(cu.m_transformSkip[2] + absPartIdxC, m_qtTempTransformSkipFlag[2], tuIterator.absPartIdxStep * sizeof(uint8_t));
cu.setChromIntraDirSubParts(bestMode, absPartIdxC, depth + initTuDepth);
totalDistortion += bestDist;
}
while (tuIterator.isNextSection());
// split case: OR the depth-1 chroma CBFs of the four quadrants into the first partition
if (initTuDepth != 0)
{
uint32_t combCbfU = 0;
uint32_t combCbfV = 0;
uint32_t qNumParts = tuIterator.absPartIdxStep;
for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
combCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, 1);
combCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, 1);
}
cu.m_cbf[1][0] |= combCbfU;
cu.m_cbf[2][0] |= combCbfV;
}
// leave the entropy coder in the state stored for this depth
m_entropyCoder.load(m_rqt[depth].cur);
return totalDistortion;
}
/* Find the cheapest merge candidate for one PU of a non-2Nx2N inter CU. Each usable
 * candidate is motion compensated into a scratch buffer and costed as SATD (plus
 * chroma SATD when enabled) plus the cost of its index bits. The winner's index, bits,
 * motion fields and direction are returned through m.
 *
 * Returns the winning cost, or MAX_UINT when no candidate was usable.
 * NOTE(review): when every candidate is skipped, m.index is not assigned here, so the
 * final copies read whatever index the caller left in m -- confirm callers initialize it. */
uint32_t Search::mergeEstimation(CUData& cu, const CUGeom& cuGeom, const PredictionUnit& pu, int puIdx, MergeData& m)
{
    X265_CHECK(cu.m_partSize[0] != SIZE_2Nx2N, "mergeEstimation() called for 2Nx2N\n");

    MVField candMvField[MRG_MAX_NUM_CANDS][2];
    uint8_t candDir[MRG_MAX_NUM_CANDS];
    const uint32_t candCount = cu.getInterMergeCandidates(pu.puAbsPartIdx, puIdx, candMvField, candDir);

    /* under the bi-prediction restriction, bi-directional candidates become list-0 only */
    if (cu.isBipredRestriction())
    {
        for (uint32_t c = 0; c < candCount; ++c)
        {
            if (candDir[c] != 3)
                continue;
            candDir[c] = 1;
            candMvField[c][1].refIdx = REF_NOT_VALID;
        }
    }

    Yuv& scratchPred = m_rqt[cuGeom.depth].tmpPredYuv;
    uint32_t bestCost = MAX_UINT;

    for (uint32_t c = 0; c < candCount; ++c)
    {
        const MVField& field0 = candMvField[c][0];
        const MVField& field1 = candMvField[c][1];

        if (m_bFrameParallel)
        {
            /* vertical MV limits that apply with frame threads (and slices) */
            if (m_param->maxSlices > 1)
            {
                if (cu.m_bFirstRowInSlice &
                    ((field0.mv.y < (2 * 4)) | (field1.mv.y < (2 * 4))))
                    continue;

                if (cu.m_bLastRowInSlice &&
                    ((field0.mv.y > -3 * 4) | (field1.mv.y > -3 * 4)))
                    continue;
            }

            if (field0.mv.y >= (m_param->searchRange + 1) * 4 ||
                field1.mv.y >= (m_param->searchRange + 1) * 4)
                continue;
        }

        /* install the candidate's motion in the CU so it can be motion compensated */
        for (int list = 0; list < 2; list++)
        {
            cu.m_mv[list][pu.puAbsPartIdx] = candMvField[c][list].mv;
            cu.m_refIdx[list][pu.puAbsPartIdx] = (int8_t)candMvField[c][list].refIdx;
        }

        motionCompensation(cu, pu, scratchPred, true, m_me.bChromaSATD);

        uint32_t candCost = m_me.bufSATD(scratchPred.getLumaAddr(pu.puAbsPartIdx), scratchPred.m_size);
        if (m_me.bChromaSATD)
            candCost += m_me.bufChromaSATD(scratchPred, pu.puAbsPartIdx);

        const uint32_t candBits = getTUBits(c, candCount);
        candCost = candCost + m_rdCost.getCost(candBits);

        if (candCost < bestCost)
        {
            bestCost = candCost;
            m.bits = candBits;
            m.index = c;
        }
    }

    for (int list = 0; list < 2; list++)
        m.mvField[list] = candMvField[m.index][list];
    m.dir = candDir[m.index];

    return bestCost;
}
/* Look up the motion vector stored in the frame's lowres data for the 16x16 block
 * containing the centre of the PU, for the given list and reference, and return it
 * shifted left by one. Returns a zero MV when the POC distance exceeds bframes + 1 or
 * when the first stored vector carries the 0x7FFF marker in x. */
MV Search::getLowresMV(const CUData& cu, const PredictionUnit& pu, int list, int ref)
{
    const int pocDistance = abs(m_slice->m_poc - m_slice->m_refPOCList[list][ref]);
    if (pocDistance > m_param->bframes + 1)
        return 0;

    MV* const storedMvs = m_frame->m_lowres.lowresMvs[list][pocDistance - 1];
    if (storedMvs[0].x == 0x7FFF)
        return 0;

    /* PU centre in pixels, converted to 16-pixel block units */
    const uint32_t blockX = (cu.m_cuPelX + g_zscanToPelX[pu.puAbsPartIdx] + pu.width / 2) >> 4;
    const uint32_t blockY = (cu.m_cuPelY + g_zscanToPelY[pu.puAbsPartIdx] + pu.height / 2) >> 4;

    X265_CHECK(blockX < m_frame->m_lowres.maxBlocksInRow, "block_x is too high\n");
    X265_CHECK(blockY < m_frame->m_lowres.maxBlocksInCol, "block_y is too high\n");

    const uint32_t blockIdx = blockY * m_frame->m_lowres.maxBlocksInRow + blockX;
    return storedMvs[blockIdx] << 1;
}
/* Choose between the two AMVP candidates by the SAD of the luma prediction each one
 * produces at the (clipped) candidate position. Returns 0 or 1; ties and identical
 * candidates resolve to 0. With frame parallelism, a candidate whose vertical
 * component is outside the permitted range keeps the maximum cost. */
int Search::selectMVP(const CUData& cu, const PredictionUnit& pu, const MV amvp[AMVP_NUM_CANDS], int list, int ref)
{
    if (amvp[0] == amvp[1])
        return 0;

    Yuv& scratchPred = m_rqt[cu.m_cuDepth[0]].tmpPredYuv;
    uint32_t candCost[AMVP_NUM_CANDS];

    for (int cand = 0; cand < AMVP_NUM_CANDS; cand++)
    {
        MV mv = amvp[cand];

        if (m_bFrameParallel)
        {
            candCost[cand] = m_me.COST_MAX;

            const bool beyondRange = mv.y >= (m_param->searchRange + 1) * 4;
            const bool outsideSlice = (m_param->maxSlices > 1) &
                                      ((mv.y < m_sliceMinY) | (mv.y > m_sliceMaxY));
            if (beyondRange || outsideSlice)
                continue;
        }

        cu.clipMv(mv);
        predInterLumaPixel(pu, scratchPred, *m_slice->m_refReconPicList[list][ref], mv);
        candCost[cand] = m_me.bufSAD(scratchPred.getLumaAddr(pu.puAbsPartIdx), scratchPred.m_size);
    }

    return (candCost[1] < candCost[0]) ? 1 : 0;
}
void Search::PME::processTasks(int workerThreadId)
{
#if DETAILED_CU_STATS
int fe = mode.cu.m_encData->m_frameEncoderID;
master.m_stats[fe].countPMETasks++;
ScopedElapsedTime pmeTime(master.m_stats[fe].pmeTime);
#endif
ProfileScopeEvent(pme);
master.processPME(*this, master.m_tld[workerThreadId].analysis);
}
/* Pull motion estimation jobs from 'pme' and run them on 'slave' until the
 * job counter is exhausted. Job ids below m_jobs.refCnt[0] address list 0
 * references, the remaining ids address list 1. When 'slave' is a different
 * Search instance it is first pointed at this instance's slice, frame and
 * params, and its lambda and source PU are configured. */
void Search::processPME(PME& pme, Search& slave)
{
    /* try to claim a first job; leave immediately if none is left */
    pme.m_lock.acquire();
    const bool bGotJob = pme.m_jobTotal > pme.m_jobAcquired;
    int jobId = -1;
    if (bGotJob)
        jobId = pme.m_jobAcquired++;
    pme.m_lock.release();

    if (!bGotJob)
        return;

    if (&slave != this)
    {
        slave.m_slice = m_slice;
        slave.m_frame = m_frame;
        slave.m_param = m_param;
        slave.setLambdaFromQP(pme.mode.cu, m_rdCost.m_qp);

        const bool bChroma = slave.m_frame->m_fencPic->m_picCsp != X265_CSP_I400;
        slave.m_me.setSourcePU(*pme.mode.fencYuv, pme.pu.ctuAddr, pme.pu.cuAbsPartIdx, pme.pu.puAbsPartIdx,
                               pme.pu.width, pme.pu.height, m_param->searchMethod, m_param->subpelRefine, bChroma);
    }

    do
    {
        const bool bList0 = jobId < pme.m_jobs.refCnt[0];
        if (bList0)
        {
            int refIdx = pme.m_jobs.ref[0][jobId];
            slave.singleMotionEstimation(*this, pme.mode, pme.pu, pme.puIdx, 0, refIdx);
        }
        else
        {
            int refIdx = pme.m_jobs.ref[1][jobId - pme.m_jobs.refCnt[0]];
            slave.singleMotionEstimation(*this, pme.mode, pme.pu, pme.puIdx, 1, refIdx);
        }

        /* claim the next job under the lock; -1 ends the loop */
        pme.m_lock.acquire();
        jobId = (pme.m_jobTotal > pme.m_jobAcquired) ? pme.m_jobAcquired++ : -1;
        pme.m_lock.release();
    }
    while (jobId >= 0);
}
/* Run one uni-directional motion search for PU 'part' against reference
 * 'ref' of 'list' and, under master.m_meLock, record the result in
 * interMode.bestME[part][list] when it is cheaper than the stored one.
 * Equal cost is resolved in favour of the smaller reference index. */
void Search::singleMotionEstimation(Search& master, Mode& interMode, const PredictionUnit& pu, int part, int list, int ref)
{
    /* signalling bits: list selection, MVP index and reference index */
    uint32_t bits = master.m_listSelBits[list] + MVP_IDX_BITS;
    bits += getTUBits(ref, m_slice->m_numRefIdx[list]);

    MotionData* bestME = interMode.bestME[part];

    /* candidate vectors from getPMV(), optionally followed by a lowres MV */
    MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
    int numMvc = interMode.cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);

    const MV* amvp = interMode.amvpCand[list][ref];
    int mvpIdx = selectMVP(interMode.cu, pu, amvp, list, ref);
    MV mvp = amvp[mvpIdx];

    if (!m_param->analysisReuseMode)
    {
        MV lowresMv = getLowresMV(interMode.cu, pu, list, ref);
        if (lowresMv.notZero())
        {
            mvc[numMvc] = lowresMv;
            numMvc++;
        }
    }

    MV mvmin, mvmax;
    setSearchRange(interMode.cu, mvp, m_param->searchRange, mvmin, mvmax);

    MV outmv;
    int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices,
                                       m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);

    /* replace the MV cost contained in satdCost by the cost of all bits */
    bits += m_me.bitcost(outmv);
    uint32_t mvCost = m_me.mvcost(outmv);
    uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);

    /* may switch to the other MVP; updates mvpIdx, bits and cost */
    mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);

    ScopedLock _lock(master.m_meLock);
    MotionData& best = bestME[list];
    const bool bCheaper = cost < best.cost;
    const bool bTieSmallerRef = cost == best.cost && ref < best.ref;
    if (bCheaper || bTieSmallerRef)
    {
        best.mv = outmv;
        best.mvp = mvp;
        best.mvpIdx = mvpIdx;
        best.ref = ref;
        best.cost = cost;
        best.bits = bits;
        best.mvCost = mvCost;
    }
}
/* Refine the motion vector already stored in the CU for this PU and list:
 * the stored MV is clipped, a search window is built around it and
 * m_me.refineMV() writes its result to outmv. */
void Search::searchMV(Mode& interMode, const PredictionUnit& pu, int list, int ref, MV& outmv)
{
    CUData& cu = interMode.cu;

    MV startMv = cu.m_mv[list][pu.puAbsPartIdx];
    cu.clipMv(startMv);

    MV rangeMin, rangeMax;
    setSearchRange(cu, startMv, m_param->searchRange, rangeMin, rangeMax);

    m_me.refineMV(&m_slice->m_mref[list][ref], rangeMin, rangeMax, startMv, outmv);
}
/* Motion search for every PU of an inter CU.
 *
 * For each PU this evaluates, in order: the best merge candidate (only for
 * CUs with more than one PU), uni-directional motion for each list, and - in
 * B slices - a bi-directional combination of the two uni-directional results.
 * The cheapest option is written into interMode.cu and the PU is motion
 * compensated into interMode.predYuv.
 *
 * refMasks[puIdx] restricts the references that are searched: bit N selects
 * reference N of list 0 and bit 16 + N reference N of list 1; a zero mask is
 * treated as "all references". bChromaMC is forwarded to setSourcePU() and to
 * the final motionCompensation() call. The motion bits accumulated over all
 * PUs are added to interMode.sa8dBits. */
void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t refMasks[2])
{
ProfileCUScope(interMode.cu, motionEstimationElapsedTime, countMotionEstimate);
CUData& cu = interMode.cu;
Yuv* predYuv = &interMode.predYuv;
/* MV candidate buffer filled by getPMV(); a lowres MV may be appended */
MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
const Slice *slice = m_slice;
int numPart = cu.getNumPartInter(0);
/* P slices search list 0 only, otherwise both lists are searched */
int numPredDir = slice->isInterP() ? 1 : 2;
const int* numRefIdx = slice->m_numRefIdx;
/* mode chosen for the previous PU (0 = L0, 1 = L1, 2 = bidir); passed to getBlkBits() */
uint32_t lastMode = 0;
int totalmebits = 0;
MV mvzero(0, 0);
Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
MergeData merge;
memset(&merge, 0, sizeof(merge));
for (int puIdx = 0; puIdx < numPart; puIdx++)
{
MotionData* bestME = interMode.bestME[puIdx];
PredictionUnit pu(cu, cuGeom, puIdx);
m_me.setSourcePU(*interMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, bChromaMC);
/* merge is only estimated here for CUs with more than one PU */
uint32_t mrgCost = numPart == 1 ? MAX_UINT : mergeEstimation(cu, cuGeom, pu, puIdx, merge);
bestME[0].cost = MAX_UINT;
bestME[1].cost = MAX_UINT;
getBlkBits((PartSize)cu.m_partSize[0], slice->isInterP(), puIdx, lastMode, m_listSelBits);
bool bDoUnidir = true;
cu.getNeighbourMV(puIdx, pu.puAbsPartIdx, interMode.interNeighbours);
/* Path 1: the reference index of each list is taken from bestME[list].ref as
 * it was set before this call; only the motion search itself is redone */
if ((m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_param->analysisReuseLevel > 1 && m_param->analysisReuseLevel != 10)
|| (m_param->analysisMultiPassRefine && m_param->rc.bStatRead))
{
for (int list = 0; list < numPredDir; list++)
{
int ref = bestME[list].ref;
/* a negative reference index means there is nothing to search in this list */
if (ref < 0)
continue;
uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;
bits += getTUBits(ref, numRefIdx[list]);
int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);
const MV* amvp = interMode.amvpCand[list][ref];
int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx];
if (m_param->searchMethod == X265_SEA)
{
/* point the integral planes at this PU: bit 0 of puIdx is used as a
 * column offset, bit 1 as a row offset */
int puX = puIdx & 1;
int puY = puIdx >> 1;
for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
m_me.integral[planes] = interMode.fencYuv->m_integral[list][ref][planes] + puX * pu.width + puY * pu.height * m_slice->m_refFrameList[list][ref]->m_reconPic->m_stride;
}
setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
MV mvpIn = mvp;
/* multi-pass refine: start from the stored MV when the MVP index matches */
if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && mvpIdx == bestME[list].mvpIdx)
mvpIn = bestME[list].mv;
int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvpIn, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices,
m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
/* replace the MV cost contained in satdCost by the cost of all bits */
bits += m_me.bitcost(outmv);
uint32_t mvCost = m_me.mvcost(outmv);
uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
if (!m_param->analysisMultiPassRefine)
mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
else
{
/* same MVP refinement as checkBestMVP(), but measured against mvpIn, the
 * predictor that was actually handed to motionEstimate() */
int diffBits = m_me.bitcost(outmv, amvp[!mvpIdx]) - m_me.bitcost(outmv, mvpIn);
if (diffBits < 0)
{
mvpIdx = !mvpIdx;
uint32_t origOutBits = bits;
bits = origOutBits + diffBits;
cost = (cost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(bits);
}
mvp = amvp[mvpIdx];
}
if (cost < bestME[list].cost)
{
bestME[list].mv = outmv;
bestME[list].mvp = mvp;
bestME[list].mvpIdx = mvpIdx;
bestME[list].cost = cost;
bestME[list].bits = bits;
bestME[list].mvCost = mvCost;
}
bDoUnidir = false;
}
}
/* Path 2: distribute the per-reference searches over bonded peers */
else if (m_param->bDistributeMotionEstimation)
{
PME pme(*this, interMode, cuGeom, pu, puIdx);
pme.m_jobTotal = 0;
/* job 0 is reserved: it is searched by this thread further below */
pme.m_jobAcquired = 1;
uint32_t refMask = refMasks[puIdx] ? refMasks[puIdx] : (uint32_t)-1;
for (int list = 0; list < numPredDir; list++)
{
int idx = 0;
for (int ref = 0; ref < numRefIdx[list]; ref++)
{
if (!(refMask & (1 << ref)))
continue;
pme.m_jobs.ref[list][idx++] = ref;
pme.m_jobTotal++;
}
pme.m_jobs.refCnt[list] = idx;
/* the list 1 reference bits start at bit 16 */
refMask >>= 16;
}
/* with two jobs or fewer, fall through to the plain unidir search below */
if (pme.m_jobTotal > 2)
{
pme.tryBondPeers(*m_frame->m_encData->m_jobProvider, pme.m_jobTotal - 1);
processPME(pme, *this);
/* NOTE(review): when refCnt[0] is zero the reserved reference comes from
 * list 1, yet the list argument below is hard-coded to 0 - confirm that
 * this is intended */
int ref = pme.m_jobs.refCnt[0] ? pme.m_jobs.ref[0][0] : pme.m_jobs.ref[1][0];
singleMotionEstimation(*this, interMode, pu, puIdx, 0, ref);
bDoUnidir = false;
ProfileCUScopeNamed(pmeWaitScope, interMode.cu, pmeBlockTime, countPMEMasters);
pme.waitForExit();
}
}
/* Path 3: search every unmasked reference of every list on this thread */
if (bDoUnidir)
{
interMode.bestME[puIdx][0].ref = interMode.bestME[puIdx][1].ref = -1;
uint32_t refMask = refMasks[puIdx] ? refMasks[puIdx] : (uint32_t)-1;
for (int list = 0; list < numPredDir; list++)
{
for (int ref = 0; ref < numRefIdx[list]; ref++)
{
ProfileCounter(interMode.cu, totalMotionReferences[cuGeom.depth]);
if (!(refMask & (1 << ref)))
{
ProfileCounter(interMode.cu, skippedMotionReferences[cuGeom.depth]);
continue;
}
uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;
bits += getTUBits(ref, numRefIdx[list]);
int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);
const MV* amvp = interMode.amvpCand[list][ref];
int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx];
/* the lowres MV is only added as a candidate when analysis reuse is off */
if (!m_param->analysisReuseMode)
{
MV lmv = getLowresMV(cu, pu, list, ref);
if (lmv.notZero())
mvc[numMvc++] = lmv;
}
if (m_param->searchMethod == X265_SEA)
{
int puX = puIdx & 1;
int puY = puIdx >> 1;
for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
m_me.integral[planes] = interMode.fencYuv->m_integral[list][ref][planes] + puX * pu.width + puY * pu.height * m_slice->m_refFrameList[list][ref]->m_reconPic->m_stride;
}
setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices,
m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
/* replace the MV cost contained in satdCost by the cost of all bits */
bits += m_me.bitcost(outmv);
uint32_t mvCost = m_me.mvcost(outmv);
uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
/* may switch to the other MVP; updates mvpIdx, bits and cost */
mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
if (cost < bestME[list].cost)
{
bestME[list].mv = outmv;
bestME[list].mvp = mvp;
bestME[list].mvpIdx = mvpIdx;
bestME[list].ref = ref;
bestME[list].cost = cost;
bestME[list].bits = bits;
bestME[list].mvCost = mvCost;
}
}
/* the list 1 reference bits start at bit 16 */
refMask >>= 16;
}
}
/* Bi-directional prediction built from the two uni-directional results */
MotionData bidir[2];
uint32_t bidirCost = MAX_UINT;
int bidirBits = 0;
/* requires a B slice, no bi-pred restriction, a non-2Nx2N partition and a
 * valid uni-directional result in both lists */
if (slice->isInterB() && !cu.isBipredRestriction() &&
cu.m_partSize[pu.puAbsPartIdx] != SIZE_2Nx2N &&
bestME[0].cost != MAX_UINT && bestME[1].cost != MAX_UINT)
{
bidir[0] = bestME[0];
bidir[1] = bestME[1];
int satdCost;
if (m_me.bChromaSATD)
{
/* store both motions in the CU so motionCompensation() builds the
 * bi-predicted block, then measure luma plus chroma SATD */
cu.m_mv[0][pu.puAbsPartIdx] = bidir[0].mv;
cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;
cu.m_mv[1][pu.puAbsPartIdx] = bidir[1].mv;
cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;
motionCompensation(cu, pu, tmpPredYuv, true, true);
satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +
m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);
}
else
{
/* luma only: average the two uni-directional predictions */
PicYuv* refPic0 = slice->m_refReconPicList[0][bestME[0].ref];
PicYuv* refPic1 = slice->m_refReconPicList[1][bestME[1].ref];
Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv;
predInterLumaPixel(pu, bidirYuv[0], *refPic0, bestME[0].mv);
predInterLumaPixel(pu, bidirYuv[1], *refPic1, bestME[1].mv);
primitives.pu[m_me.partEnum].pixelavg_pp(tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(pu.puAbsPartIdx), bidirYuv[0].m_size,
bidirYuv[1].getLumaAddr(pu.puAbsPartIdx), bidirYuv[1].m_size, 32);
satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
}
/* swap the two single-list selection costs for the bidir selection cost */
bidirBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
bidirCost = satdCost + m_rdCost.getCost(bidirBits);
/* also try zero motion in both lists, unless both MVs are already zero */
bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
if (bTryZero)
{
/* skip the zero MV attempt when either predictor lies outside the range
 * computed around the zero MV */
MV mvmin, mvmax;
int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);
setSearchRange(cu, mvzero, merange, mvmin, mvmax);
mvmax.y += 2;
mvmin <<= 2;
mvmax <<= 2;
bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
}
if (bTryZero)
{
if (m_me.bChromaSATD)
{
cu.m_mv[0][pu.puAbsPartIdx] = mvzero;
cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;
cu.m_mv[1][pu.puAbsPartIdx] = mvzero;
cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;
motionCompensation(cu, pu, tmpPredYuv, true, true);
satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +
m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);
}
else
{
/* zero motion: average the reference pixels at the PU's own position */
const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
intptr_t refStride = slice->m_mref[0][0].lumaStride;
primitives.pu[m_me.partEnum].pixelavg_pp(tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
}
/* re-price each list: remove the bits of the found MV, add those of the zero MV */
MV mvp0 = bestME[0].mvp;
int mvpIdx0 = bestME[0].mvpIdx;
uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
MV mvp1 = bestME[1].mvp;
int mvpIdx1 = bestME[1].mvpIdx;
uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
/* refine the MVP choice for the zero MV; updates mvpIdx, bits and cost */
mvp0 = checkBestMVP(interMode.amvpCand[0][bestME[0].ref], mvzero, mvpIdx0, bits0, cost);
mvp1 = checkBestMVP(interMode.amvpCand[1][bestME[1].ref], mvzero, mvpIdx1, bits1, cost);
if (cost < bidirCost)
{
bidir[0].mv = mvzero;
bidir[1].mv = mvzero;
bidir[0].mvp = mvp0;
bidir[1].mvp = mvp1;
bidir[0].mvpIdx = mvpIdx0;
bidir[1].mvpIdx = mvpIdx1;
bidirCost = cost;
bidirBits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
}
}
}
/* Pick the cheapest of merge, bidir, L0 and L1 and store it in the CU */
if (mrgCost < bidirCost && mrgCost < bestME[0].cost && mrgCost < bestME[1].cost)
{
cu.m_mergeFlag[pu.puAbsPartIdx] = true;
/* the merge candidate index is kept in the list 0 MVP index field */
cu.m_mvpIdx[0][pu.puAbsPartIdx] = merge.index;
cu.setPUInterDir(merge.dir, pu.puAbsPartIdx, puIdx);
cu.setPUMv(0, merge.mvField[0].mv, pu.puAbsPartIdx, puIdx);
cu.setPURefIdx(0, merge.mvField[0].refIdx, pu.puAbsPartIdx, puIdx);
cu.setPUMv(1, merge.mvField[1].mv, pu.puAbsPartIdx, puIdx);
cu.setPURefIdx(1, merge.mvField[1].refIdx, pu.puAbsPartIdx, puIdx);
totalmebits += merge.bits;
}
else if (bidirCost < bestME[0].cost && bidirCost < bestME[1].cost)
{
/* bi-directional: inter direction 3, MV and MVD stored for both lists */
lastMode = 2;
cu.m_mergeFlag[pu.puAbsPartIdx] = false;
cu.setPUInterDir(3, pu.puAbsPartIdx, puIdx);
cu.setPUMv(0, bidir[0].mv, pu.puAbsPartIdx, puIdx);
cu.setPURefIdx(0, bestME[0].ref, pu.puAbsPartIdx, puIdx);
cu.m_mvd[0][pu.puAbsPartIdx] = bidir[0].mv - bidir[0].mvp;
cu.m_mvpIdx[0][pu.puAbsPartIdx] = bidir[0].mvpIdx;
cu.setPUMv(1, bidir[1].mv, pu.puAbsPartIdx, puIdx);
cu.setPURefIdx(1, bestME[1].ref, pu.puAbsPartIdx, puIdx);
cu.m_mvd[1][pu.puAbsPartIdx] = bidir[1].mv - bidir[1].mvp;
cu.m_mvpIdx[1][pu.puAbsPartIdx] = bidir[1].mvpIdx;
totalmebits += bidirBits;
}
else if (bestME[0].cost <= bestME[1].cost)
{
/* list 0 only: inter direction 1, list 1 is marked as unused */
lastMode = 0;
cu.m_mergeFlag[pu.puAbsPartIdx] = false;
cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx);
cu.setPUMv(0, bestME[0].mv, pu.puAbsPartIdx, puIdx);
cu.setPURefIdx(0, bestME[0].ref, pu.puAbsPartIdx, puIdx);
cu.m_mvd[0][pu.puAbsPartIdx] = bestME[0].mv - bestME[0].mvp;
cu.m_mvpIdx[0][pu.puAbsPartIdx] = bestME[0].mvpIdx;
cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
cu.setPUMv(1, mvzero, pu.puAbsPartIdx, puIdx);
totalmebits += bestME[0].bits;
}
else
{
/* list 1 only: inter direction 2, list 0 is marked as unused */
lastMode = 1;
cu.m_mergeFlag[pu.puAbsPartIdx] = false;
cu.setPUInterDir(2, pu.puAbsPartIdx, puIdx);
cu.setPUMv(1, bestME[1].mv, pu.puAbsPartIdx, puIdx);
cu.setPURefIdx(1, bestME[1].ref, pu.puAbsPartIdx, puIdx);
cu.m_mvd[1][pu.puAbsPartIdx] = bestME[1].mv - bestME[1].mvp;
cu.m_mvpIdx[1][pu.puAbsPartIdx] = bestME[1].mvpIdx;
cu.setPURefIdx(0, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
cu.setPUMv(0, mvzero, pu.puAbsPartIdx, puIdx);
totalmebits += bestME[1].bits;
}
/* build the prediction of this PU from the motion just stored in the CU */
motionCompensation(cu, pu, *predYuv, true, bChromaMC);
}
interMode.sa8dBits += totalmebits;
}
/* Fill blockBit[0..2] with the bit estimates used for selecting list 0,
 * list 1 and bi-prediction, depending on the partition mode, the slice type,
 * the PU index and (for two-PU partitions in non-P slices) the mode chosen
 * for the previous PU. An unknown cuMode leaves blockBit untouched. */
void Search::getBlkBits(PartSize cuMode, bool bPSlice, int partIdx, uint32_t lastMode, uint32_t blockBit[3])
{
    /* tables are indexed as [partIdx][lastMode][list selection] */
    static const uint32_t bits2NxN[2][3][3] =
    {
        { { 0, 0, 3 }, { 0, 0, 0 }, { 0, 0, 0 } },
        { { 5, 7, 7 }, { 7, 5, 7 }, { 9 - 3, 9 - 3, 9 - 3 } }
    };
    static const uint32_t bitsNx2N[2][3][3] =
    {
        { { 0, 2, 3 }, { 0, 0, 0 }, { 0, 0, 0 } },
        { { 5, 7, 7 }, { 7 - 2, 7 - 2, 9 - 2 }, { 9 - 3, 9 - 3, 9 - 3 } }
    };

    const uint32_t (*table)[3][3] = NULL;

    switch (cuMode)
    {
    case SIZE_2Nx2N:
    case SIZE_NxN:
        /* single-PU and NxN partitions use fixed estimates */
        blockBit[0] = bPSlice ? 1 : 3;
        blockBit[1] = 3;
        blockBit[2] = 5;
        return;

    case SIZE_2NxN:
    case SIZE_2NxnU:
    case SIZE_2NxnD:
        table = bits2NxN;
        break;

    case SIZE_Nx2N:
    case SIZE_nLx2N:
    case SIZE_nRx2N:
        table = bitsNx2N;
        break;

    default:
        X265_CHECK(0, "getBlkBits: unknown cuMode\n");
        return;
    }

    if (bPSlice)
    {
        blockBit[0] = 3;
        blockBit[1] = 0;
        blockBit[2] = 0;
        return;
    }

    const uint32_t* row = table[partIdx][lastMode];
    for (int i = 0; i < 3; i++)
        blockBit[i] = row[i];
}
/* Check whether coding 'mv' against the other AMVP candidate needs fewer
 * bits than against the currently selected one. If so, mvpIdx is flipped and
 * outBits / outCost are adjusted by the difference. Returns the candidate
 * that mvpIdx refers to on exit. */
const MV& Search::checkBestMVP(const MV* amvpCand, const MV& mv, int& mvpIdx, uint32_t& outBits, uint32_t& outCost) const
{
    const int otherIdx = !mvpIdx;
    int bitDelta = m_me.bitcost(mv, amvpCand[otherIdx]) - m_me.bitcost(mv, amvpCand[mvpIdx]);

    /* the current predictor is at least as cheap: keep everything as it is */
    if (bitDelta >= 0)
        return amvpCand[mvpIdx];

    const uint32_t prevBits = outBits;
    mvpIdx = otherIdx;
    outBits = prevBits + bitDelta;
    /* swap the cost of the old bit count for the cost of the new one */
    outCost = (outCost - m_rdCost.getCost(prevBits)) + m_rdCost.getCost(outBits);

    return amvpCand[mvpIdx];
}
void Search::setSearchRange(const CUData& cu, const MV& mvp, int merange, MV& mvmin, MV& mvmax) const
{
MV dist((int16_t)merange << 2, (int16_t)merange << 2);
mvmin = mvp - dist;
mvmax = mvp + dist;
cu.clipMv(mvmin);
cu.clipMv(mvmax);
if (cu.m_encData->m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&
cu.m_cuPelX / m_param->maxCUSize < m_frame->m_encData->m_pir.pirStartCol &&
m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol < m_slice->m_sps->numCuInWidth)
{
int safeX, maxSafeMv;
safeX = m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol * m_param->maxCUSize - 3;
maxSafeMv = (safeX - cu.m_cuPelX) * 4;
mvmax.x = X265_MIN(mvmax.x, maxSafeMv);
mvmin.x = X265_MIN(mvmin.x, maxSafeMv);
}
if ((m_param->maxSlices > 1) & m_bFrameParallel)
{
mvmin.y = X265_MAX(mvmin.y, m_sliceMinY);
mvmax.y = X265_MIN(mvmax.y, m_sliceMaxY);
}
const int maxMvLen = (1 << 15) - 1;
mvmin.x = X265_MAX(mvmin.x, -maxMvLen);
mvmin.y = X265_MAX(mvmin.y, -maxMvLen);
mvmax.x = X265_MIN(mvmax.x, maxMvLen);
mvmax.y = X265_MIN(mvmax.y, maxMvLen);
mvmin >>= 2;
mvmax >>= 2;
mvmin.y = X265_MIN(mvmin.y, (int16_t)m_refLagPixels);
mvmax.y = X265_MIN(mvmax.y, (int16_t)m_refLagPixels);
mvmax.y = X265_MAX(mvmax.y, mvmin.y);
}
/* Evaluate an inter CU as SKIP: the CU is marked MODE_SKIP with all CBFs
 * cleared, the prediction is copied into the reconstruction, and distortion,
 * skip flag / merge index bits, the optional psy or ssim energy and the mode
 * cost are stored in interMode. The entropy coder state after coding is
 * saved to interMode.contexts. */
void Search::encodeResAndCalcRdSkipCU(Mode& interMode)
{
    CUData& cu = interMode.cu;
    Yuv& recon = interMode.reconYuv;
    Yuv& pred = interMode.predYuv;
    const Yuv* fenc = interMode.fencYuv;
    X265_CHECK(!cu.isIntra(0), "intra CU not expected\n");

    const uint32_t depth = cu.m_cuDepth[0];

    /* no residual is coded: the reconstruction is the prediction */
    cu.setPredModeSubParts(MODE_SKIP);
    cu.clearCbf();
    cu.setTUDepthSubParts(0, 0, depth);
    recon.copyFromYuv(interMode.predYuv);

    /* luma distortion */
    const int part = partitionFromLog2Size(cu.m_log2CUSize[0]);
    interMode.lumaDistortion = primitives.cu[part].sse_pp(fenc->m_buf[0], fenc->m_size, recon.m_buf[0], recon.m_size);
    interMode.distortion = interMode.lumaDistortion;

    /* chroma distortion, unless encoder or source picture is 4:0:0 */
    const bool bHasChroma = m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400;
    if (bHasChroma)
    {
        interMode.chromaDistortion = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[part].sse_pp(fenc->m_buf[1], fenc->m_csize, recon.m_buf[1], recon.m_csize));
        interMode.chromaDistortion += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[part].sse_pp(fenc->m_buf[2], fenc->m_csize, recon.m_buf[2], recon.m_csize));
        interMode.distortion += interMode.chromaDistortion;
    }
    cu.m_distortion[0] = interMode.distortion;

    /* count the bits of the skip signalling */
    m_entropyCoder.load(m_rqt[depth].cur);
    m_entropyCoder.resetBits();
    if (m_slice->m_pps->bTransquantBypassEnabled)
        m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
    m_entropyCoder.codeSkipFlag(cu, 0);
    const int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
    m_entropyCoder.codeMergeIndex(cu, 0);

    interMode.mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;
    interMode.coeffBits = 0;
    interMode.totalBits = interMode.mvBits + skipFlagBits;

    if (m_rdCost.m_psyRd)
    {
        interMode.psyEnergy = m_rdCost.psyCost(part, fenc->m_buf[0], fenc->m_size, recon.m_buf[0], recon.m_size);
    }
    else if (m_rdCost.m_ssimRd)
    {
        interMode.ssimEnergy = m_quant.ssimDistortion(cu, fenc->m_buf[0], fenc->m_size, recon.m_buf[0], recon.m_size, cu.m_log2CUSize[0], TEXT_LUMA, 0);
    }

    /* luma SSE between source and prediction */
    interMode.resEnergy = primitives.cu[part].sse_pp(fenc->m_buf[0], fenc->m_size, pred.m_buf[0], pred.m_size);

    updateModeCost(interMode);
    m_entropyCoder.store(interMode.contexts);
}
/* Encode the residual of an inter CU and compute its RD cost.
 *
 * The residual (source minus interMode.predYuv) is put through the residual
 * quad-tree estimation. Unless the CU is transquant-bypassed, the result is
 * compared with the cost of signalling no residual at all, and the CBFs are
 * cleared when that is cheaper. The CU is then signalled either as
 * merge/skip (merge flag set, 2Nx2N, no residual) or as a regular inter CU,
 * and bits, distortion, energies and the mode cost are stored in interMode.
 * The entropy coder state after coding is saved to interMode.contexts. */
void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom)
{
    ProfileCUScope(interMode.cu, interRDOElapsedTime[cuGeom.depth], countInterRDO[cuGeom.depth]);

    CUData& cu = interMode.cu;
    Yuv* reconYuv = &interMode.reconYuv;
    Yuv* predYuv = &interMode.predYuv;
    uint32_t depth = cuGeom.depth;
    ShortYuv* resiYuv = &m_rqt[depth].tmpResiYuv;
    const Yuv* fencYuv = interMode.fencYuv;

    X265_CHECK(!cu.isIntra(0), "intra CU not expected\n");

    uint32_t log2CUSize = cuGeom.log2CUSize;
    int sizeIdx = log2CUSize - 2;

    resiYuv->subtract(*fencYuv, *predYuv, log2CUSize, m_frame->m_fencPic->m_picCsp);

    uint32_t tuDepthRange[2];
    cu.getInterTUQtDepthRange(tuDepthRange, 0);

    m_entropyCoder.load(m_rqt[depth].cur);

    /* reset the per-CU state of the TU limiting modes */
    if ((m_limitTU & X265_TU_LIMIT_DFS) && !(m_limitTU & X265_TU_LIMIT_NEIGH))
        m_maxTUDepth = -1;
    else if (m_limitTU & X265_TU_LIMIT_BFS)
        memset(&m_cacheTU, 0, sizeof(TUInfoCache));

    Cost costs;
    if (m_limitTU & X265_TU_LIMIT_NEIGH)
    {
        /* m_maxTUDepth is saved and restored around the estimation so the
         * clipped value used for this CU does not persist afterwards */
        int32_t tempDepth = m_maxTUDepth;
        if (m_maxTUDepth != -1)
        {
            uint32_t splitFlag = interMode.cu.m_partSize[0] != SIZE_2Nx2N;
            uint32_t minSize = tuDepthRange[0];
            uint32_t maxSize = tuDepthRange[1];
            maxSize = X265_MIN(maxSize, cuGeom.log2CUSize - splitFlag);
            m_maxTUDepth = x265_clip3(cuGeom.log2CUSize - maxSize, cuGeom.log2CUSize - minSize, (uint32_t)m_maxTUDepth);
        }
        estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange);
        m_maxTUDepth = tempDepth;
    }
    else
        estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange);

    uint32_t tqBypass = cu.m_tqBypass[0];
    if (!tqBypass)
    {
        /* distortion of coding no residual: source against prediction; each
         * buffer is read with the chroma stride of the Yuv it belongs to */
        sse_t cbf0Dist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
        {
            cbf0Dist += m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize));
            cbf0Dist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize));
        }

        /* bits needed to signal a zero root CBF */
        m_entropyCoder.load(m_rqt[depth].cur);
        m_entropyCoder.resetBits();
        m_entropyCoder.codeQtRootCbfZero();
        uint32_t cbf0Bits = m_entropyCoder.getNumberOfWrittenBits();

        uint32_t cbf0Energy; uint64_t cbf0Cost;
        if (m_rdCost.m_psyRd)
        {
            cbf0Energy = m_rdCost.psyCost(log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
            cbf0Cost = m_rdCost.calcPsyRdCost(cbf0Dist, cbf0Bits, cbf0Energy);
        }
        else if(m_rdCost.m_ssimRd)
        {
            cbf0Energy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size, log2CUSize, TEXT_LUMA, 0);
            cbf0Cost = m_rdCost.calcSsimRdCost(cbf0Dist, cbf0Bits, cbf0Energy);
        }
        else
            cbf0Cost = m_rdCost.calcRdCost(cbf0Dist, cbf0Bits);

        /* dropping the residual is cheaper: clear all CBFs */
        if (cbf0Cost < costs.rdcost)
        {
            cu.clearCbf();
            cu.setTUDepthSubParts(0, 0, depth);
        }
    }

    if (cu.getQtRootCbf(0))
        saveResidualQTData(cu, *resiYuv, 0, 0);

    /* count the signalling bits of the final decision */
    m_entropyCoder.load(m_rqt[depth].cur);
    m_entropyCoder.resetBits();
    if (m_slice->m_pps->bTransquantBypassEnabled)
        m_entropyCoder.codeCUTransquantBypassFlag(tqBypass);

    uint32_t coeffBits, bits, mvBits;
    if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
    {
        /* 2Nx2N merge without residual is signalled as skip */
        cu.setPredModeSubParts(MODE_SKIP);
        coeffBits = mvBits = 0;
        m_entropyCoder.codeSkipFlag(cu, 0);
        int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
        m_entropyCoder.codeMergeIndex(cu, 0);
        mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;
        bits = mvBits + skipFlagBits;
    }
    else
    {
        m_entropyCoder.codeSkipFlag(cu, 0);
        int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
        m_entropyCoder.codePredMode(cu.m_predMode[0]);
        m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
        m_entropyCoder.codePredInfo(cu, 0);
        mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;

        bool bCodeDQP = m_slice->m_pps->bUseDQP;
        m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
        bits = m_entropyCoder.getNumberOfWrittenBits();
        coeffBits = bits - mvBits - skipFlagBits;
    }
    m_entropyCoder.store(interMode.contexts);

    /* reconstruction: prediction plus residual, or the prediction alone */
    if (cu.getQtRootCbf(0))
        reconYuv->addClip(*predYuv, *resiYuv, log2CUSize, m_frame->m_fencPic->m_picCsp);
    else
        reconYuv->copyFromYuv(*predYuv);

    sse_t bestLumaDist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
    interMode.distortion = bestLumaDist;
    if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
    {
        sse_t bestChromaDist = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
        bestChromaDist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
        interMode.chromaDistortion = bestChromaDist;
        interMode.distortion += bestChromaDist;
    }

    if (m_rdCost.m_psyRd)
        interMode.psyEnergy = m_rdCost.psyCost(sizeIdx, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
    else if(m_rdCost.m_ssimRd)
        interMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size, cu.m_log2CUSize[0], TEXT_LUMA, 0);

    /* luma SSE between source and prediction */
    interMode.resEnergy = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
    interMode.totalBits = bits;
    interMode.lumaDistortion = bestLumaDist;
    interMode.coeffBits = coeffBits;
    interMode.mvBits = mvBits;
    cu.m_distortion[0] = interMode.distortion;

    updateModeCost(interMode);
    checkDQP(interMode, cuGeom);
}
/* Transform and quantise the inter residual held in
 * m_rqt[cuGeom.depth].tmpResiYuv for the TU at absPartIdx / tuDepth, without
 * any RD decision: a TU is coded as one block as soon as its size is allowed
 * by depthRange[1] (except for the root TU of a non-2Nx2N CU that can still
 * be split), otherwise the function recurses into its four sub-TUs.
 *
 * For each coded TU the coefficients are written to cu.m_trCoeff, the TU
 * depth, transform-skip flags (always 0) and CBFs are set in the CU, and the
 * residual block is either passed to invtransformNxN() or filled with zeros
 * when no coefficient is significant. */
void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2])
{
uint32_t depth = cuGeom.depth + tuDepth;
CUData& cu = mode.cu;
uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
bool bCheckFull = log2TrSize <= depthRange[1];
/* the root TU of a non-2Nx2N CU is split whenever a smaller size is allowed */
if (cu.m_partSize[0] != SIZE_2Nx2N && !tuDepth && log2TrSize > depthRange[0])
bCheckFull = false;
if (bCheckFull)
{
uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
/* chroma is coded unless the encoder or the source picture is 4:0:0 */
uint32_t codeChroma = (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400) ? 1 : 0;
uint32_t tuDepthC = tuDepth;
if (log2TrSizeC < 2)
{
/* the chroma TU would drop below log2 size 2: use size 2 at the parent
 * depth instead, and code chroma only when absPartIdx is a multiple of 4 */
X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
log2TrSizeC = 2;
tuDepthC--;
codeChroma &= !(absPartIdx & 3);
}
/* number of partitions covered by the chroma TU */
uint32_t absPartIdxStep = cuGeom.numPartitions >> tuDepthC * 2;
/* CBF bit that corresponds to this TU depth */
uint32_t setCbf = 1 << tuDepth;
uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
coeff_t* coeffCurY = cu.m_trCoeff[0] + coeffOffsetY;
uint32_t sizeIdx = log2TrSize - 2;
cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
const Yuv* fencYuv = mode.fencYuv;
int16_t* curResiY = resiYuv.getLumaAddr(absPartIdx);
uint32_t strideResiY = resiYuv.m_size;
const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
/* luma: forward transform + quantisation, returns the number of significant coefficients */
uint32_t numSigY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);
if (numSigY)
{
/* the inverse transform is given the same residual buffer
 * (presumably overwriting it with the dequantised residual) */
m_quant.invtransformNxN(cu, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSigY);
cu.setCbfSubParts(setCbf, TEXT_LUMA, absPartIdx, depth);
}
else
{
/* nothing significant: zero the residual block and clear the CBF */
primitives.cu[sizeIdx].blockfill_s(curResiY, strideResiY, 0);
cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, depth);
}
if (codeChroma)
{
uint32_t sizeIdxC = log2TrSizeC - 2;
uint32_t strideResiC = resiYuv.m_csize;
uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
coeff_t* coeffCurU = cu.m_trCoeff[1] + coeffOffsetC;
coeff_t* coeffCurV = cu.m_trCoeff[2] + coeffOffsetC;
/* 4:2:2 chroma is processed as vertically split sub-TUs */
bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
do
{
uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
/* coefficient offset of the current section */
uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
cu.setTransformSkipPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
cu.setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
/* Cb: same sequence as for luma */
int16_t* curResiU = resiYuv.getCbAddr(absPartIdxC);
const pixel* fencCb = fencYuv->getCbAddr(absPartIdxC);
uint32_t numSigU = m_quant.transformNxN(cu, fencCb, fencYuv->m_csize, curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, absPartIdxC, false);
if (numSigU)
{
m_quant.invtransformNxN(cu, curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, false, false, numSigU);
cu.setCbfPartRange(setCbf, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
}
else
{
primitives.cu[sizeIdxC].blockfill_s(curResiU, strideResiC, 0);
cu.setCbfPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
}
/* Cr: same sequence as for luma */
int16_t* curResiV = resiYuv.getCrAddr(absPartIdxC);
const pixel* fencCr = fencYuv->getCrAddr(absPartIdxC);
uint32_t numSigV = m_quant.transformNxN(cu, fencCr, fencYuv->m_csize, curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, absPartIdxC, false);
if (numSigV)
{
m_quant.invtransformNxN(cu, curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, false, false, numSigV);
cu.setCbfPartRange(setCbf, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
}
else
{
primitives.cu[sizeIdxC].blockfill_s(curResiV, strideResiC, 0);
cu.setCbfPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
}
}
while (tuIterator.isNextSection());
if (splitIntoSubTUs)
{
/* 4:2:2 only: post-process the CBFs of the sub-TUs via offsetSubTUCBFs() */
offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
}
}
}
else
{
/* split: process the four sub-TUs and merge their CBFs into this depth */
X265_CHECK(log2TrSize > depthRange[0], "residualTransformQuantInter recursion check failure\n");
/* number of partitions covered by each sub-TU */
uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
residualTransformQuantInter(mode, cuGeom, qPartIdx, tuDepth + 1, depthRange);
ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
{
ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
}
}
/* set this depth's CBF bit on the first partition if any sub-TU has one set */
cu.m_cbf[0][absPartIdx] |= ycbf << tuDepth;
if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
{
cu.m_cbf[1][absPartIdx] |= ucbf << tuDepth;
cu.m_cbf[2][absPartIdx] |= vcbf << tuDepth;
}
}
}
/* RD cost of signalling a zero CBF for component compId at tuDepth, given
 * the distortion (and, for psy-rd / ssim-rd, the energy) that results from
 * coding no coefficients. */
uint64_t Search::estimateNullCbfCost(sse_t dist, uint32_t energy, uint32_t tuDepth, TextType compId)
{
    const uint32_t zeroCbfBits = m_entropyCoder.estimateCbfBits(0, compId, tuDepth);

    if (m_rdCost.m_psyRd)
        return m_rdCost.calcPsyRdCost(dist, zeroCbfBits, energy);

    if (m_rdCost.m_ssimRd)
        return m_rdCost.calcSsimRdCost(dist, zeroCbfBits, energy);

    return m_rdCost.calcRdCost(dist, zeroCbfBits);
}
/* Estimate the cost of splitting the TU at absPartIdx / tuDepth into four
 * sub-TUs. Each sub-TU is evaluated with estimateResidualQT(), which
 * accumulates into splitCost; the sub-TU CBFs are merged into this depth,
 * the bits for the split flag / CBF signalling are added and
 * splitCost.rdcost is recomputed. Returns true when any sub-TU has a
 * non-zero luma or chroma CBF. */
bool Search::splitTU(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& splitCost, const uint32_t depthRange[2], int32_t splitMore)
{
    CUData& cu = mode.cu;
    const uint32_t depth = cuGeom.depth + tuDepth;
    const uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
    const uint32_t childDepth = tuDepth + 1;

    /* number of partitions covered by each sub-TU */
    const uint32_t partsPerChild = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;

    uint32_t lumaCbf = 0, cbCbf = 0, crCbf = 0;
    uint32_t childPartIdx = absPartIdx;
    for (uint32_t child = 0; child < 4; ++child, childPartIdx += partsPerChild)
    {
        if ((m_limitTU & X265_TU_LIMIT_DFS) && tuDepth == 0 && child == 1)
        {
            /* after the first quadrant of the root TU: take the deepest TU
             * depth found in the first quarter of the CU as m_maxTUDepth */
            m_maxTUDepth = cu.m_tuDepth[0];
            for (uint32_t part = 1; part < cuGeom.numPartitions / 4; part++)
                m_maxTUDepth = X265_MAX(m_maxTUDepth, cu.m_tuDepth[part]);
        }

        estimateResidualQT(mode, cuGeom, childPartIdx, childDepth, resiYuv, splitCost, depthRange, splitMore);

        lumaCbf |= cu.getCbf(childPartIdx, TEXT_LUMA, childDepth);
        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
        {
            cbCbf |= cu.getCbf(childPartIdx, TEXT_CHROMA_U, childDepth);
            crCbf |= cu.getCbf(childPartIdx, TEXT_CHROMA_V, childDepth);
        }
    }

    /* set this depth's CBF bit on the first partition if any sub-TU has one set */
    cu.m_cbf[0][absPartIdx] |= lumaCbf << tuDepth;
    if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
    {
        cu.m_cbf[1][absPartIdx] |= cbCbf << tuDepth;
        cu.m_cbf[2][absPartIdx] |= crCbf << tuDepth;
    }

    /* bits for signalling the subdivision and the CBFs at this level */
    m_entropyCoder.load(m_rqt[depth].rqtRoot);
    m_entropyCoder.resetBits();
    codeInterSubdivCbfQT(cu, absPartIdx, tuDepth, depthRange);
    const uint32_t splitCbfBits = m_entropyCoder.getNumberOfWrittenBits();
    splitCost.bits += splitCbfBits;

    if (m_rdCost.m_psyRd)
    {
        splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
    }
    else if (m_rdCost.m_ssimRd)
    {
        splitCost.rdcost = m_rdCost.calcSsimRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
    }
    else
    {
        splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);
    }

    return lumaCbf || cbCbf || crCbf;
}
/* Recursively estimate the rate-distortion cost of coding the inter residual of
 * the TU at (absPartIdx, tuDepth) and add the cost of the chosen alternative to
 * outCosts.
 *
 * Two alternatives may be evaluated:
 *   - "full":  the TU is coded with a single transform per component
 *              (bCheckFull), optionally also trying transform-skip and the
 *              zero-CBF alternative for each component;
 *   - "split": the TU is divided into four sub-TUs via splitTU() (bCheckSplit).
 * If the split alternative is cheaper its cost is added to outCosts and the
 * function returns; otherwise the full TU's depth, CBF and transform-skip flags
 * are written back into the CU and fullCost is added to outCosts.
 *
 * mode        mode being analysed; supplies the CU, source (fencYuv) and prediction (predYuv)
 * cuGeom      geometry of the CU (depth, log2CUSize, numPartitions are read)
 * absPartIdx  partition index of the TU within the CU
 * tuDepth     depth of this TU relative to the CU
 * resiYuv     residual to be transformed
 * outCosts    accumulator; distortion, rdcost, bits and energy are added to it
 * depthRange  [0] = log2 size above which a split is checked, [1] = largest log2 size coded as a full TU
 * splitMore   only consulted when the BFS TU limit is active and splitMore >= 0:
 *             0 caches the full-TU result and suppresses the split check,
 *             non-zero reloads the cached full-TU result instead of recomputing it */
void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& outCosts, const uint32_t depthRange[2], int32_t splitMore)
{
CUData& cu = mode.cu;
uint32_t depth = cuGeom.depth + tuDepth;
uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
bool bEnableRDOQ = !!m_param->rdoqLevel;
bool bCheckSplit = log2TrSize > depthRange[0];
bool bCheckFull = log2TrSize <= depthRange[1];
bool bSaveTUData = false, bLoadTUData = false;
uint32_t idx = 0;
// BFS limit: for non-root TUs that could be coded both ways, either cache the
// full-TU analysis (splitMore == 0) or reuse the cached one (splitMore != 0).
// idx is the m_cacheTU slot, derived from the depth and the quadrant index.
if ((m_limitTU & X265_TU_LIMIT_BFS) && splitMore >= 0)
{
if (bCheckSplit && bCheckFull && tuDepth)
{
uint32_t qNumParts = 1 << (log2TrSize - LOG2_UNIT_SIZE) * 2;
uint32_t qIdx = (absPartIdx / qNumParts) % 4;
idx = (depth - 1) * 4 + qIdx;
if (splitMore)
{
bLoadTUData = true;
bCheckFull = false;
}
else
{
bSaveTUData = true;
bCheckSplit = false;
}
}
}
// DFS / neighbour limit: once m_maxTUDepth is known (>= 0), only split while
// this TU is still larger than the TU size corresponding to that depth.
else if (m_limitTU & X265_TU_LIMIT_DFS || m_limitTU & X265_TU_LIMIT_NEIGH)
{
if (bCheckSplit && m_maxTUDepth >= 0)
{
uint32_t log2MaxTrSize = cuGeom.log2CUSize - m_maxTUDepth;
bCheckSplit = log2TrSize > log2MaxTrSize;
}
}
// The subdivision flag is only costed when both alternatives are possible
bool bSplitPresentFlag = bCheckSplit && bCheckFull;
// For a non-2Nx2N partitioned CU, the root TU is not evaluated as a full TU
// when a split is available
if (cu.m_partSize[0] != SIZE_2Nx2N && !tuDepth && bCheckSplit)
bCheckFull = false;
X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n");
uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
uint32_t codeChroma = (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400) ? 1 : 0;
uint32_t tuDepthC = tuDepth;
// The chroma block would be smaller than 4x4: clamp it to 4x4, use the parent
// depth for chroma, and handle chroma only when absPartIdx is a multiple of 4
if (log2TrSizeC < 2)
{
X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
log2TrSizeC = 2;
tuDepthC--;
codeChroma &= !(absPartIdx & 3);
}
Cost fullCost;
fullCost.rdcost = MAX_INT64;
// Per-component results; the second index is the chroma sub-TU section
// (index 1 is only used when chroma is iterated as two sections, i.e. 4:2:2)
uint8_t cbfFlag[MAX_NUM_COMPONENT][2 ] = { { 0, 0 }, {0, 0}, {0, 0} };
uint32_t numSig[MAX_NUM_COMPONENT][2 ] = { { 0, 0 }, {0, 0}, {0, 0} };
uint32_t singleBits[MAX_NUM_COMPONENT][2 ] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
sse_t singleDist[MAX_NUM_COMPONENT][2 ] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
uint32_t singleEnergy[MAX_NUM_COMPONENT][2 ] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
uint32_t bestTransformMode[MAX_NUM_COMPONENT][2 ] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
uint64_t minCost[MAX_NUM_COMPONENT][2 ] = { { MAX_INT64, MAX_INT64 }, {MAX_INT64, MAX_INT64}, {MAX_INT64, MAX_INT64} };
// Remember the entropy coder state on entry; every alternative is measured from it
m_entropyCoder.store(m_rqt[depth].rqtRoot);
uint32_t trSize = 1 << log2TrSize;
const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
uint32_t absPartIdxStep = cuGeom.numPartitions >> tuDepthC * 2;
const Yuv* fencYuv = mode.fencYuv;
// ---- Alternative 1: code this TU without further splitting ----
if (bCheckFull)
{
uint32_t trSizeC = 1 << log2TrSizeC;
int partSize = partitionFromLog2Size(log2TrSize);
int partSizeC = partitionFromLog2Size(log2TrSizeC);
const uint32_t qtLayer = log2TrSize - 2;
uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
coeff_t* coeffCurY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
// Transform-skip is tried only when enabled in the PPS, the CU is not
// transquant-bypassed, and the block is no larger than MAX_LOG2_TS_SIZE
bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0];
bool checkTransformSkipY = checkTransformSkip && log2TrSize <= MAX_LOG2_TS_SIZE;
bool checkTransformSkipC = checkTransformSkip && log2TrSizeC <= MAX_LOG2_TS_SIZE;
cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
if (bEnableRDOQ)
m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
// Luma: forward transform + quantisation; numSig is the returned coefficient count
const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
int16_t* resi = resiYuv.getLumaAddr(absPartIdx);
numSig[TEXT_LUMA][0] = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);
cbfFlag[TEXT_LUMA][0] = !!numSig[TEXT_LUMA][0];
// Luma bits = optional "not subdivided" flag + coefficient bits
m_entropyCoder.resetBits();
if (bSplitPresentFlag && log2TrSize > depthRange[0])
m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
if (cbfFlag[TEXT_LUMA][0])
m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
singleBits[TEXT_LUMA][0] = m_entropyCoder.getNumberOfWrittenBits();
X265_CHECK(log2TrSize <= 5, "log2TrSize is too large\n");
// Distortion/energy of the zero-residual case (prediction compared with source)
sse_t zeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, mode.predYuv.getLumaAddr(absPartIdx), mode.predYuv.m_size);
uint32_t zeroEnergyY = 0;
if (m_rdCost.m_psyRd)
zeroEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, mode.predYuv.getLumaAddr(absPartIdx), mode.predYuv.m_size);
else if(m_rdCost.m_ssimRd)
zeroEnergyY = m_quant.ssimDistortion(cu, fenc, fencYuv->m_size, mode.predYuv.getLumaAddr(absPartIdx), mode.predYuv.m_size, log2TrSize, TEXT_LUMA, absPartIdx);
int16_t* curResiY = m_rqt[qtLayer].resiQtYuv.getLumaAddr(absPartIdx);
uint32_t strideResiY = m_rqt[qtLayer].resiQtYuv.m_size;
if (cbfFlag[TEXT_LUMA][0])
{
// Reconstruct (inverse transform + prediction) to measure the coded distortion
m_quant.invtransformNxN(cu, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSig[TEXT_LUMA][0]);
pixel* curReconY = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
uint32_t strideReconY = m_rqt[qtLayer].reconQtYuv.m_size;
primitives.cu[partSize].add_ps(curReconY, strideReconY, mode.predYuv.getLumaAddr(absPartIdx), curResiY, mode.predYuv.m_size, strideResiY);
const sse_t nonZeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, curReconY, strideReconY);
uint32_t nzCbfBitsY = m_entropyCoder.estimateCbfBits(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
uint32_t nonZeroEnergyY = 0; uint64_t singleCostY = 0;
if (m_rdCost.m_psyRd)
{
nonZeroEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, curReconY, strideReconY);
singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0], nonZeroEnergyY);
}
else if(m_rdCost.m_ssimRd)
{
nonZeroEnergyY = m_quant.ssimDistortion(cu, fenc, fencYuv->m_size, curReconY, strideReconY, log2TrSize, TEXT_LUMA, absPartIdx);
singleCostY = m_rdCost.calcSsimRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0], nonZeroEnergyY);
}
else
singleCostY = m_rdCost.calcRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0]);
// With transquant bypass the coded residual is always kept
if (cu.m_tqBypass[0])
{
singleDist[TEXT_LUMA][0] = nonZeroDistY;
singleEnergy[TEXT_LUMA][0] = nonZeroEnergyY;
}
else
{
// Compare against dropping the residual entirely (cbf = 0)
uint64_t nullCostY = estimateNullCbfCost(zeroDistY, zeroEnergyY, tuDepth, TEXT_LUMA);
if (nullCostY < singleCostY)
{
cbfFlag[TEXT_LUMA][0] = 0;
singleBits[TEXT_LUMA][0] = 0;
primitives.cu[partSize].blockfill_s(curResiY, strideResiY, 0);
#if CHECKED_BUILD || _DEBUG
uint32_t numCoeffY = 1 << (log2TrSize << 1);
memset(coeffCurY, 0, sizeof(coeff_t)* numCoeffY);
#endif
if (checkTransformSkipY)
minCost[TEXT_LUMA][0] = nullCostY;
singleDist[TEXT_LUMA][0] = zeroDistY;
singleEnergy[TEXT_LUMA][0] = zeroEnergyY;
}
else
{
if (checkTransformSkipY)
minCost[TEXT_LUMA][0] = singleCostY;
singleDist[TEXT_LUMA][0] = nonZeroDistY;
singleEnergy[TEXT_LUMA][0] = nonZeroEnergyY;
}
}
}
else
{
// Quantisation produced no coefficients: the zero-residual figures apply
if (checkTransformSkipY)
minCost[TEXT_LUMA][0] = estimateNullCbfCost(zeroDistY, zeroEnergyY, tuDepth, TEXT_LUMA);
primitives.cu[partSize].blockfill_s(curResiY, strideResiY, 0);
singleDist[TEXT_LUMA][0] = zeroDistY;
singleBits[TEXT_LUMA][0] = 0;
singleEnergy[TEXT_LUMA][0] = zeroEnergyY;
}
cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
// Chroma: same procedure as luma, repeated for U and V and for each
// section produced by tuIterator (two sections when splitIntoSubTUs is set)
if (codeChroma)
{
uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize;
for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
sse_t zeroDistC = 0;
uint32_t zeroEnergyC = 0;
coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
do
{
uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
// Coefficients of the second section follow those of the first
uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
// RDOQ bit estimates are refreshed for U only (skipped when chromaId is V)
if (bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false);
cbfFlag[chromaId][tuIterator.section] = !!numSig[chromaId][tuIterator.section];
// The bit counter is not reset here; the coefficient bits are taken as a delta
uint32_t latestBitCount = m_entropyCoder.getNumberOfWrittenBits();
if (cbfFlag[chromaId][tuIterator.section])
m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTUOffset, absPartIdxC, log2TrSizeC, (TextType)chromaId);
singleBits[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits() - latestBitCount;
int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
// Zero-residual distortion/energy for this chroma block
zeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[log2TrSizeC - 2].sse_pp(fenc, fencYuv->m_csize, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), mode.predYuv.m_csize));
if (m_rdCost.m_psyRd)
zeroEnergyC = m_rdCost.psyCost(partSizeC, fenc, fencYuv->m_csize, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), mode.predYuv.m_csize);
else if(m_rdCost.m_ssimRd)
zeroEnergyC = m_quant.ssimDistortion(cu, fenc, fencYuv->m_csize, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), mode.predYuv.m_csize, log2TrSizeC, (TextType)chromaId, absPartIdxC);
if (cbfFlag[chromaId][tuIterator.section])
{
m_quant.invtransformNxN(cu, curResiC, strideResiC, coeffCurC + subTUOffset,
log2TrSizeC, (TextType)chromaId, false, false, numSig[chromaId][tuIterator.section]);
pixel* curReconC = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
uint32_t strideReconC = m_rqt[qtLayer].reconQtYuv.m_csize;
primitives.cu[partSizeC].add_ps(curReconC, strideReconC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), curResiC, mode.predYuv.m_csize, strideResiC);
sse_t nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[partSizeC].sse_pp(fenc, fencYuv->m_csize, curReconC, strideReconC));
uint32_t nzCbfBitsC = m_entropyCoder.estimateCbfBits(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth);
uint32_t nonZeroEnergyC = 0; uint64_t singleCostC = 0;
if (m_rdCost.m_psyRd)
{
nonZeroEnergyC = m_rdCost.psyCost(partSizeC, fenc, fencYuv->m_csize, curReconC, strideReconC);
singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section], nonZeroEnergyC);
}
else if(m_rdCost.m_ssimRd)
{
nonZeroEnergyC = m_quant.ssimDistortion(cu, fenc, fencYuv->m_csize, curReconC, strideReconC, log2TrSizeC, (TextType)chromaId, absPartIdxC);
singleCostC = m_rdCost.calcSsimRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section], nonZeroEnergyC);
}
else
singleCostC = m_rdCost.calcRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section]);
if (cu.m_tqBypass[0])
{
singleDist[chromaId][tuIterator.section] = nonZeroDistC;
singleEnergy[chromaId][tuIterator.section] = nonZeroEnergyC;
}
else
{
// Keep the coded residual only if it beats the zero-CBF alternative
uint64_t nullCostC = estimateNullCbfCost(zeroDistC, zeroEnergyC, tuDepth, (TextType)chromaId);
if (nullCostC < singleCostC)
{
cbfFlag[chromaId][tuIterator.section] = 0;
singleBits[chromaId][tuIterator.section] = 0;
primitives.cu[partSizeC].blockfill_s(curResiC, strideResiC, 0);
#if CHECKED_BUILD || _DEBUG
uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
memset(coeffCurC + subTUOffset, 0, sizeof(coeff_t) * numCoeffC);
#endif
if (checkTransformSkipC)
minCost[chromaId][tuIterator.section] = nullCostC;
singleDist[chromaId][tuIterator.section] = zeroDistC;
singleEnergy[chromaId][tuIterator.section] = zeroEnergyC;
}
else
{
if (checkTransformSkipC)
minCost[chromaId][tuIterator.section] = singleCostC;
singleDist[chromaId][tuIterator.section] = nonZeroDistC;
singleEnergy[chromaId][tuIterator.section] = nonZeroEnergyC;
}
}
}
else
{
// NOTE(review): this call passes tuDepthC whereas the null-cost call in the
// non-zero branch above passes tuDepth - confirm the difference is intended
if (checkTransformSkipC)
minCost[chromaId][tuIterator.section] = estimateNullCbfCost(zeroDistC, zeroEnergyC, tuDepthC, (TextType)chromaId);
primitives.cu[partSizeC].blockfill_s(curResiC, strideResiC, 0);
singleBits[chromaId][tuIterator.section] = 0;
singleDist[chromaId][tuIterator.section] = zeroDistC;
singleEnergy[chromaId][tuIterator.section] = zeroEnergyC;
}
cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
}
while (tuIterator.isNextSection());
}
}
// Source picture is 4:0:0 but the encoder's internal format is not:
// chroma is not analysed, so its CBFs are cleared
if (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)
{
for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
do
{
uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
cu.setCbfPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
}
while(tuIterator.isNextSection());
}
}
// Luma transform-skip trial: quantise into the scratch buffers (m_tsCoeff,
// m_tsResidual, m_tsRecon) and adopt the result only if it is at least as
// cheap as the best cost found so far (minCost)
if (checkTransformSkipY)
{
sse_t nonZeroDistY = 0;
uint32_t nonZeroEnergyY = 0;
uint64_t singleCostY = MAX_INT64;
m_entropyCoder.load(m_rqt[depth].rqtRoot);
cu.setTransformSkipSubParts(1, TEXT_LUMA, absPartIdx, depth);
if (bEnableRDOQ)
m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
fenc = fencYuv->getLumaAddr(absPartIdx);
resi = resiYuv.getLumaAddr(absPartIdx);
uint32_t numSigTSkipY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, m_tsCoeff, log2TrSize, TEXT_LUMA, absPartIdx, true);
if (numSigTSkipY)
{
m_entropyCoder.resetBits();
m_entropyCoder.codeQtCbfLuma(!!numSigTSkipY, tuDepth);
m_entropyCoder.codeCoeffNxN(cu, m_tsCoeff, absPartIdx, log2TrSize, TEXT_LUMA);
const uint32_t skipSingleBitsY = m_entropyCoder.getNumberOfWrittenBits();
m_quant.invtransformNxN(cu, m_tsResidual, trSize, m_tsCoeff, log2TrSize, TEXT_LUMA, false, true, numSigTSkipY);
primitives.cu[partSize].add_ps(m_tsRecon, trSize, mode.predYuv.getLumaAddr(absPartIdx), m_tsResidual, mode.predYuv.m_size, trSize);
nonZeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, m_tsRecon, trSize);
if (m_rdCost.m_psyRd)
{
nonZeroEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, m_tsRecon, trSize);
singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, skipSingleBitsY, nonZeroEnergyY);
}
else if(m_rdCost.m_ssimRd)
{
nonZeroEnergyY = m_quant.ssimDistortion(cu, fenc, fencYuv->m_size, m_tsRecon, trSize, log2TrSize, TEXT_LUMA, absPartIdx);
singleCostY = m_rdCost.calcSsimRdCost(nonZeroDistY, skipSingleBitsY, nonZeroEnergyY);
}
else
singleCostY = m_rdCost.calcRdCost(nonZeroDistY, skipSingleBitsY);
}
// Reject transform-skip when it yields no coefficients or costs more
if (!numSigTSkipY || minCost[TEXT_LUMA][0] < singleCostY)
cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
else
{
// Transform-skip wins: replace the stored coefficients/residual with its output.
// singleBits[TEXT_LUMA][0] is left unchanged in this branch.
singleDist[TEXT_LUMA][0] = nonZeroDistY;
singleEnergy[TEXT_LUMA][0] = nonZeroEnergyY;
cbfFlag[TEXT_LUMA][0] = !!numSigTSkipY;
bestTransformMode[TEXT_LUMA][0] = 1;
if (m_param->limitTU)
numSig[TEXT_LUMA][0] = numSigTSkipY;
uint32_t numCoeffY = 1 << (log2TrSize << 1);
memcpy(coeffCurY, m_tsCoeff, sizeof(coeff_t) * numCoeffY);
primitives.cu[partSize].copy_ss(curResiY, strideResiY, m_tsResidual, trSize);
}
cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
}
// Chroma transform-skip trial, per plane and per section
if (codeChroma && checkTransformSkipC)
{
// These three are declared outside the loops, so values from an earlier
// plane/section persist when a later one produces no coefficients
sse_t nonZeroDistC = 0;
uint32_t nonZeroEnergyC = 0;
uint64_t singleCostC = MAX_INT64;
uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize;
uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
m_entropyCoder.load(m_rqt[depth].rqtRoot);
for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
do
{
uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
cu.setTransformSkipPartRange(1, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
if (bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
uint32_t numSigTSkipC = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, m_tsCoeff, log2TrSizeC, (TextType)chromaId, absPartIdxC, true);
m_entropyCoder.resetBits();
// The bit count from the regular-transform pass is discarded here, even
// if transform-skip is subsequently rejected for this block
singleBits[chromaId][tuIterator.section] = 0;
if (numSigTSkipC)
{
m_entropyCoder.codeQtCbfChroma(!!numSigTSkipC, tuDepth);
m_entropyCoder.codeCoeffNxN(cu, m_tsCoeff, absPartIdxC, log2TrSizeC, (TextType)chromaId);
singleBits[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits();
m_quant.invtransformNxN(cu, m_tsResidual, trSizeC, m_tsCoeff,
log2TrSizeC, (TextType)chromaId, false, true, numSigTSkipC);
primitives.cu[partSizeC].add_ps(m_tsRecon, trSizeC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), m_tsResidual, mode.predYuv.m_csize, trSizeC);
nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[partSizeC].sse_pp(fenc, fencYuv->m_csize, m_tsRecon, trSizeC));
if (m_rdCost.m_psyRd)
{
nonZeroEnergyC = m_rdCost.psyCost(partSizeC, fenc, fencYuv->m_csize, m_tsRecon, trSizeC);
singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section], nonZeroEnergyC);
}
else if(m_rdCost.m_ssimRd)
{
nonZeroEnergyC = m_quant.ssimDistortion(cu, fenc, mode.fencYuv->m_csize, m_tsRecon, trSizeC, log2TrSizeC, (TextType)chromaId, absPartIdxC);
singleCostC = m_rdCost.calcSsimRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section], nonZeroEnergyC);
}
else
singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section]);
}
if (!numSigTSkipC || minCost[chromaId][tuIterator.section] < singleCostC)
cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
else
{
singleDist[chromaId][tuIterator.section] = nonZeroDistC;
singleEnergy[chromaId][tuIterator.section] = nonZeroEnergyC;
cbfFlag[chromaId][tuIterator.section] = !!numSigTSkipC;
bestTransformMode[chromaId][tuIterator.section] = 1;
uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
memcpy(coeffCurC + subTUOffset, m_tsCoeff, sizeof(coeff_t) * numCoeffC);
primitives.cu[partSizeC].copy_ss(curResiC, strideResiC, m_tsResidual, trSizeC);
}
cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
}
while (tuIterator.isNextSection());
}
}
// Coefficient bits were gathered above; now measure only the CBF flags,
// starting again from the root context
m_entropyCoder.load(m_rqt[depth].rqtRoot);
m_entropyCoder.resetBits();
if (codeChroma)
{
if (!splitIntoSubTUs)
{
m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][0], tuDepth);
m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][0], tuDepth);
}
else
{
offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][0], tuDepth);
m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][1], tuDepth);
m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][0], tuDepth);
m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][1], tuDepth);
}
}
m_entropyCoder.codeQtCbfLuma(cbfFlag[TEXT_LUMA][0], tuDepth);
uint32_t cbfBits = m_entropyCoder.getNumberOfWrittenBits();
uint32_t coeffBits = 0;
coeffBits = singleBits[TEXT_LUMA][0];
for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
{
coeffBits += singleBits[TEXT_CHROMA_U][subTUIndex];
coeffBits += singleBits[TEXT_CHROMA_V][subTUIndex];
}
// CBF bits are charged to the full TU only when the subdivision flag is
// present; otherwise just the coefficient bits are counted
fullCost.bits = bSplitPresentFlag ? cbfBits + coeffBits : coeffBits;
fullCost.distortion += singleDist[TEXT_LUMA][0];
// Only the luma energy is accumulated; chroma energy is not added here
fullCost.energy += singleEnergy[TEXT_LUMA][0];
for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
{
fullCost.distortion += singleDist[TEXT_CHROMA_U][subTUIndex];
fullCost.distortion += singleDist[TEXT_CHROMA_V][subTUIndex];
}
if (m_rdCost.m_psyRd)
fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
else if(m_rdCost.m_ssimRd)
fullCost.rdcost = m_rdCost.calcSsimRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
else
fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits);
// limit-tu early termination: do not try the split when luma has no
// coefficients, or when it has very few (< numCoeff / 64) and the sum of their
// absolute values equals their count (i.e. every non-zero level is +/-1)
if (m_param->limitTU && bCheckSplit)
{
uint32_t numCoeff = trSize * trSize;
if (cbfFlag[TEXT_LUMA][0] == 0)
bCheckSplit = false;
else if (numSig[TEXT_LUMA][0] < (numCoeff / 64))
{
uint32_t energy = 0;
for (uint32_t i = 0; i < numCoeff; i++)
energy += abs(coeffCurY[i]);
if (energy == numSig[TEXT_LUMA][0])
bCheckSplit = false;
}
}
// BFS first pass: stash the full-TU decision, cost and entropy state
if (bSaveTUData)
{
for (int plane = 0; plane < MAX_NUM_COMPONENT; plane++)
{
for(int part = 0; part < (m_csp == X265_CSP_I422) + 1; part++)
{
m_cacheTU.bestTransformMode[idx][plane][part] = bestTransformMode[plane][part];
m_cacheTU.cbfFlag[idx][plane][part] = cbfFlag[plane][part];
}
}
m_cacheTU.cost[idx] = fullCost;
m_entropyCoder.store(m_cacheTU.rqtStore[idx]);
}
}
// BFS second pass: restore the stashed full-TU result and treat it as evaluated
if (bLoadTUData)
{
for (int plane = 0; plane < MAX_NUM_COMPONENT; plane++)
{
for(int part = 0; part < (m_csp == X265_CSP_I422) + 1; part++)
{
bestTransformMode[plane][part] = m_cacheTU.bestTransformMode[idx][plane][part];
cbfFlag[plane][part] = m_cacheTU.cbfFlag[idx][plane][part];
}
}
fullCost = m_cacheTU.cost[idx];
m_entropyCoder.load(m_cacheTU.rqtStore[idx]);
bCheckFull = true;
}
// ---- Alternative 2: split into four sub-TUs ----
if (bCheckSplit)
{
if (bCheckFull)
{
// Keep the full-TU entropy state in rqtTest and restart from the root state
m_entropyCoder.store(m_rqt[depth].rqtTest);
m_entropyCoder.load(m_rqt[depth].rqtRoot);
}
Cost splitCost;
if (bSplitPresentFlag && (log2TrSize <= depthRange[1] && log2TrSize > depthRange[0]))
{
// Cost of signalling "subdivided"
m_entropyCoder.resetBits();
m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
splitCost.bits = m_entropyCoder.getNumberOfWrittenBits();
}
bool yCbCrCbf = splitTU(mode, cuGeom, absPartIdx, tuDepth, resiYuv, splitCost, depthRange, 0);
// The split is only a candidate if it has some non-zero CBF, or if there
// is no full-TU result to fall back on
if (yCbCrCbf || !bCheckFull)
{
if (splitCost.rdcost < fullCost.rdcost)
{
if (m_limitTU & X265_TU_LIMIT_BFS)
{
uint32_t nextlog2TrSize = cuGeom.log2CUSize - (tuDepth + 1);
bool nextSplit = nextlog2TrSize > depthRange[0];
if (nextSplit)
{
// Split won and the children can split further: redo the split
// analysis with splitMore = 1 so the children reuse their cached
// full-TU results and evaluate their own splits
m_entropyCoder.load(m_rqt[depth].rqtRoot);
splitCost.bits = splitCost.distortion = splitCost.rdcost = splitCost.energy = 0;
if (bSplitPresentFlag && (log2TrSize <= depthRange[1] && log2TrSize > depthRange[0]))
{
m_entropyCoder.resetBits();
m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
splitCost.bits = m_entropyCoder.getNumberOfWrittenBits();
}
splitTU(mode, cuGeom, absPartIdx, tuDepth, resiYuv, splitCost, depthRange, 1);
}
}
outCosts.distortion += splitCost.distortion;
outCosts.rdcost += splitCost.rdcost;
outCosts.bits += splitCost.bits;
outCosts.energy += splitCost.energy;
return;
}
else
// NOTE(review): the rejected split's energy is still added to outCosts,
// in addition to fullCost.energy below - confirm this is intended
outCosts.energy += splitCost.energy;
}
// Full TU wins: the split analysis overwrote the CU's transform-skip
// flags, so restore the ones chosen for the full TU
cu.setTransformSkipSubParts(bestTransformMode[TEXT_LUMA][0], TEXT_LUMA, absPartIdx, depth);
if (codeChroma)
{
if (!splitIntoSubTUs)
{
cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx, depth);
cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx, depth);
}
else
{
uint32_t tuNumParts = absPartIdxStep >> 1;
cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx , tuNumParts);
cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][1], TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts);
cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx , tuNumParts);
cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][1], TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts);
}
}
X265_CHECK(bCheckFull, "check-full must be set\n");
m_entropyCoder.load(m_rqt[depth].rqtTest);
}
// Commit the full-TU decision: TU depth and CBFs for every component
cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
if (codeChroma)
{
if (!splitIntoSubTUs)
{
cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx, depth);
cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx, depth);
}
else
{
uint32_t tuNumParts = absPartIdxStep >> 1;
offsetCBFs(cbfFlag[TEXT_CHROMA_U]);
offsetCBFs(cbfFlag[TEXT_CHROMA_V]);
cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx , tuNumParts);
cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][1] << tuDepth, TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts);
cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx , tuNumParts);
cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][1] << tuDepth, TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts);
}
}
outCosts.distortion += fullCost.distortion;
outCosts.rdcost += fullCost.rdcost;
outCosts.bits += fullCost.bits;
outCosts.energy += fullCost.energy;
}
/* Walk the already-decided transform tree of an inter CU, starting at
 * (absPartIdx, tuDepth), and feed its chroma and luma CBF flags to
 * m_entropyCoder. Chroma CBFs are coded at every level where the chroma block
 * is at least 4x4 and the parent's CBF (if any) is set; the luma CBF is coded
 * at leaf TUs only. depthRange is passed through to the recursive calls. */
void Search::codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t tuDepth, const uint32_t depthRange[2])
{
    X265_CHECK(cu.isInter(absPartIdx), "codeInterSubdivCbfQT() with intra block\n");

    const uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
    const bool bIsLeaf = !(tuDepth < cu.m_tuDepth[absPartIdx]);
    const bool bHasChroma = m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400;

    if (bHasChroma && !(log2TrSize - m_hChromaShift < 2))
    {
        /* First partition of the parent TU, used to look up the parent's CBF */
        const uint32_t parentIdx = absPartIdx & (0xFF << (log2TrSize + 1 - LOG2_UNIT_SIZE) * 2);

        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
        {
            /* A child CBF is only coded when there is no parent or the parent's CBF is set */
            if (!tuDepth || cu.getCbf(parentIdx, (TextType)chromaId, tuDepth - 1))
                m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, (TextType)chromaId, tuDepth, bIsLeaf);
        }
    }

    if (bIsLeaf)
    {
        m_entropyCoder.codeQtCbfLuma(cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth), tuDepth);
        return;
    }

    /* Not a leaf: descend into the four quadrants */
    const uint32_t partsPerQuadrant = 1 << (log2TrSize -1 - LOG2_UNIT_SIZE) * 2;
    uint32_t childIdx = absPartIdx;
    for (uint32_t quadrant = 0; quadrant < 4; quadrant++)
    {
        codeInterSubdivCbfQT(cu, childIdx, tuDepth + 1, depthRange);
        childIdx += partsPerQuadrant;
    }
}
/* Copy the residual quad-tree results chosen during analysis out of the
 * per-layer m_rqt scratch buffers: residual samples go through
 * copyPartToPartLuma/Chroma with resiYuv as the argument (presumably the
 * destination - confirm against ShortYuv), and quantised coefficients are
 * copied into cu.m_trCoeff.
 *
 * Recurses along cu.m_tuDepth until the leaf TU covering absPartIdx is reached.
 *
 * cu          CU whose TU depths are read and whose m_trCoeff is written
 * resiYuv     residual buffer handed to the copyPartToPart* calls
 * absPartIdx  partition index of the TU within the CU
 * tuDepth     depth of the TU relative to the CU */
void Search::saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t tuDepth)
{
    const uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;

    /* Not yet at the coded TU depth: visit the four quadrants */
    if (tuDepth < cu.m_tuDepth[absPartIdx])
    {
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
            saveResidualQTData(cu, resiYuv, absPartIdx, tuDepth + 1);
        return;
    }

    const uint32_t qtLayer = log2TrSize - 2;

    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
    uint32_t codeChroma = (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400) ? 1 : 0;
    if (log2TrSizeC < 2)
    {
        /* Chroma would be smaller than 4x4: it is stored as one 4x4 block and
         * handled only when absPartIdx is a multiple of 4 */
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
        log2TrSizeC = 2;
        codeChroma &= !(absPartIdx & 3);
    }

    /* Luma residual and coefficients */
    m_rqt[qtLayer].resiQtYuv.copyPartToPartLuma(resiYuv, absPartIdx, log2TrSize);

    uint32_t numCoeffY = 1 << (log2TrSize * 2);
    uint32_t coeffOffsetY = absPartIdx << LOG2_UNIT_SIZE * 2;
    coeff_t* coeffSrcY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
    coeff_t* coeffDstY = cu.m_trCoeff[0] + coeffOffsetY;
    memcpy(coeffDstY, coeffSrcY, sizeof(coeff_t) * numCoeffY);

    if (codeChroma)
    {
        /* Chroma residual and coefficients */
        m_rqt[qtLayer].resiQtYuv.copyPartToPartChroma(resiYuv, absPartIdx, log2TrSizeC + m_hChromaShift);

        /* For 4:2:2 the coefficient count is doubled */
        uint32_t numCoeffC = 1 << (log2TrSizeC * 2 + (m_csp == X265_CSP_I422));
        uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
        coeff_t* coeffSrcU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC;
        coeff_t* coeffSrcV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC;
        coeff_t* coeffDstU = cu.m_trCoeff[1] + coeffOffsetC;
        coeff_t* coeffDstV = cu.m_trCoeff[2] + coeffOffsetC;
        memcpy(coeffDstU, coeffSrcU, sizeof(coeff_t) * numCoeffC);
        memcpy(coeffDstV, coeffSrcV, sizeof(coeff_t) * numCoeffC);
    }
}
/* Fetch the three most-probable intra luma modes for absPartIdx into mpmModes,
 * write a 64-bit mask with one bit set per MPM into mpms, and return the
 * entropy coder's bit estimate for a non-MPM intra mode. */
uint32_t Search::getIntraRemModeBits(CUData& cu, uint32_t absPartIdx, uint32_t mpmModes[3], uint64_t& mpms) const
{
    cu.getIntraDirLumaPredictor(absPartIdx, mpmModes);

    const uint64_t one = 1;
    mpms = (one << mpmModes[0]) | (one << mpmModes[1]) | (one << mpmModes[2]);

    return m_entropyCoder.bitsIntraModeNonMPM();
}
/* Offer (mode, cost) to a fixed-size candidate list: the entry with the highest
 * cost (first one on ties) is replaced when the new cost is strictly lower.
 * Nothing changes if every stored cost is zero or maxCandCount is not positive. */
void Search::updateCandList(uint32_t mode, uint64_t cost, int maxCandCount, uint32_t* candModeList, uint64_t* candCostList)
{
    uint32_t worstSlot = 0;
    uint64_t worstCost = 0;

    /* Locate the most expensive candidate currently held */
    int slot = 0;
    while (slot < maxCandCount)
    {
        const uint64_t slotCost = candCostList[slot];
        if (slotCost > worstCost)
        {
            worstCost = slotCost;
            worstSlot = slot;
        }
        slot++;
    }

    if (!(cost < worstCost))
        return;

    candCostList[worstSlot] = cost;
    candModeList[worstSlot] = mode;
}
/* Account for delta-QP signalling on a CU that is at or above the PPS
 * maxCuDQPDepth when DQP is enabled. With residual present, the delta-QP bits
 * are added to the mode cost (measured at rdLevel >= 3, a one-bit estimate
 * otherwise); without residual, the CU's QP is reset to its reference QP. */
void Search::checkDQP(Mode& mode, const CUGeom& cuGeom)
{
    CUData& cu = mode.cu;

    if (!cu.m_slice->m_pps->bUseDQP || cuGeom.depth > cu.m_slice->m_pps->maxCuDQPDepth)
        return;

    if (!cu.getQtRootCbf(0))
    {
        /* Nothing coded for this CU: fall back to the reference QP */
        cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth);
        return;
    }

    if (m_param->rdLevel <= 1)
    {
        /* SA8D-based decisions: one-bit estimate */
        mode.sa8dBits++;
        mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
    }
    else if (m_param->rdLevel >= 3)
    {
        /* Measure the delta-QP bits with the mode's own entropy contexts */
        mode.contexts.resetBits();
        mode.contexts.codeDeltaQP(cu, 0);
        const uint32_t dqpBits = mode.contexts.getNumberOfWrittenBits();
        mode.totalBits += dqpBits;
        updateModeCost(mode);
    }
    else
    {
        /* rdLevel 2: one-bit estimate on the RD cost */
        mode.totalBits++;
        updateModeCost(mode);
    }
}
/* Delta-QP accounting for a split prediction whose depth equals the PPS
 * maxCuDQPDepth (and DQP is enabled). If any partition has a non-zero root
 * CBF, the delta-QP bits are added to the mode cost and setQPSubCUs() is
 * called with the reference QP; otherwise the whole CU's QP is reset to the
 * reference QP. */
void Search::checkDQPForSplitPred(Mode& mode, const CUGeom& cuGeom)
{
    CUData& cu = mode.cu;

    const bool bApplies = cu.m_slice->m_pps->bUseDQP && (cuGeom.depth == cu.m_slice->m_pps->maxCuDQPDepth);
    if (!bApplies)
        return;

    /* Scan the partitions until one with coded residual is found */
    uint32_t part = 0;
    while (part < cuGeom.numPartitions && !cu.getQtRootCbf(part))
        part++;
    const bool bAnyResidual = part < cuGeom.numPartitions;

    if (!bAnyResidual)
    {
        cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth);
        return;
    }

    if (m_param->rdLevel <= 1)
    {
        /* SA8D-based decisions: one-bit estimate */
        mode.sa8dBits++;
        mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
    }
    else if (m_param->rdLevel >= 3)
    {
        /* Measure the delta-QP bits with the mode's own entropy contexts */
        mode.contexts.resetBits();
        mode.contexts.codeDeltaQP(cu, 0);
        const uint32_t dqpBits = mode.contexts.getNumberOfWrittenBits();
        mode.totalBits += dqpBits;
        updateModeCost(mode);
    }
    else
    {
        /* rdLevel 2: one-bit estimate on the RD cost */
        mode.totalBits++;
        updateModeCost(mode);
    }

    cu.setQPSubCUs(cu.getRefQP(0), 0, cuGeom.depth);
}