//---------------------------------------------------------------------------------
//
// Little Color Management System, fast floating point extensions
// Copyright (c) 1998-2020 Marti Maria Saguer, all rights reserved
//
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
//
//---------------------------------------------------------------------------------
// Optimization for matrix-shaper in 8 bits using SSE2 intrinsics
#include "fast_float_internal.h"
#ifndef CMS_DONT_USE_SSE2
#ifdef _MSC_VER
#    include <intrin.h>
#else
#    include <x86intrin.h>
#    include <cpuid.h>
#endif

#include <emmintrin.h>
// This is the private data container used by this optimization
typedef struct {

    // Matrix plus offset, stored column-major (see SetMatShaper) so each
    // _mm_load_ps() in the kernel fetches one column. MUST be aligned on a
    // 16 BYTE boundary for the SSE aligned loads; malloc_aligned() guarantees
    // this by over-allocating and rounding the pointer up.
    cmsFloat32Number Mat[4][4];     // n.14 to n.14 (needs a saturation after that)

    void * real_ptr;                // Unaligned pointer returned by the allocator; what _cmsFree() gets
    cmsContext ContextID;

    // Input (pre-matrix) tone curves, sampled at each of the 256 8-bit codes
    cmsFloat32Number Shaper1R[256]; // from 0..255 to 1.14 (0.0...1.0)
    cmsFloat32Number Shaper1G[256];
    cmsFloat32Number Shaper1B[256];

    // Output (post-matrix) tone curves, indexed by a 1.14 fixed point value
    // (0..0x4000 inclusive, hence 0x4001 entries)
    cmsUInt8Number Shaper2R[0x4001];    // 1.14 to 0..255
    cmsUInt8Number Shaper2G[0x4001];
    cmsUInt8Number Shaper2B[0x4001];

} XMatShaper8Data;
// Allocates a zero-filled XMatShaper8Data whose first member (Mat) sits on a
// 16 byte boundary, as required by the SSE aligned loads in MatShaperXform8SSE().
// The raw allocation pointer is stashed in real_ptr so free_aligned() can
// release it later. Returns NULL on allocation failure.
static
XMatShaper8Data* malloc_aligned(cmsContext ContextID)
{
    cmsUInt8Number* real_ptr = (cmsUInt8Number*) _cmsMallocZero(ContextID, sizeof(XMatShaper8Data) + 32);
    cmsUInt8Number* aligned;
    XMatShaper8Data* p;

    // Fix: the original stored through the result without checking, so an
    // out-of-memory condition dereferenced a near-NULL pointer.
    if (real_ptr == NULL) return NULL;

    // Round up to the next 16 byte boundary; the extra 32 bytes allocated
    // above guarantee the full struct still fits after the shift.
    aligned = (cmsUInt8Number*) (((uintptr_t)real_ptr + 16) & ~0xf);
    p = (XMatShaper8Data*) aligned;
    p->real_ptr = real_ptr;

    return p;
}
// Releases a container obtained from malloc_aligned(). The aligned pointer
// itself must not be freed; only the original allocation recorded in
// real_ptr is handed back to the allocator.
static
void free_aligned(cmsContext ContextID, XMatShaper8Data* a)
{
    void* original_block = a->real_ptr;

    _cmsFree(ContextID, original_block);
}
// Free the private data container. Matches the _cmsFreeUserDataFn signature;
// safe to call with Data == NULL.
static
void FreeMatShaper(cmsContext ContextID, void* Data)
{
    // Fix: ContextID was marked with UNUSED_PARAMETER() even though the very
    // next line passes it to free_aligned(); the contradictory marker is gone.
    if (Data != NULL) free_aligned(ContextID, (XMatShaper8Data*) Data);
}
// Builds the input shaper: for every possible 8 bit code, normalize to the
// 0.0 .. 1.0 range and run it through the tone curve, keeping the result as
// a float so the SSE kernel can consume it directly.
static
void FillFirstShaper(cmsContext ContextID, cmsFloat32Number* Table, cmsToneCurve* Curve)
{
    cmsInt32Number code;

    for (code = 0; code < 256; code++) {

        cmsFloat32Number normalized = (cmsFloat32Number) (code / 255.0);

        Table[code] = cmsEvalToneCurveFloat(ContextID, Curve, normalized);
    }
}
// This table converts from 1.14 fixed point (being 0x4000 the last entry) to
// 8 bits after applying the curve
static
void FillSecondShaper(cmsContext ContextID, cmsUInt8Number* Table, cmsToneCurve* Curve)
{
    int i;
    cmsFloat32Number R, Val;
    cmsInt32Number w;

    for (i = 0; i < 0x4001; i++) {

        R = (cmsFloat32Number) (i / 16384.0f);
        Val = cmsEvalToneCurveFloat(ContextID, Curve, R);

        // Round to nearest and clamp to the valid 8 bit range
        w = (cmsInt32Number) (Val * 255.0f + 0.5f);
        if (w < 0) w = 0;
        if (w > 255) w = 255;

        // Fix: was cast to (cmsInt8Number). The table is unsigned, and pushing
        // 128..255 through a signed 8 bit type is implementation-defined;
        // cast to the matching unsigned type instead.
        Table[i] = (cmsUInt8Number) w;
    }
}
// Compute the matrix-shaper structure: precomputes both shaper tables and
// stores the combined matrix (plus optional offset) in the layout the SSE
// kernel expects. Off may be NULL (no offset). Returns NULL on failure;
// the result must be released with FreeMatShaper().
static
XMatShaper8Data* SetMatShaper(cmsContext ContextID, cmsToneCurve* Curve1[3], cmsMAT3* Mat, cmsVEC3* Off, cmsToneCurve* Curve2[3])
{
    XMatShaper8Data* p;
    int i, j;

    // Allocate a big chunk of memory to store precomputed tables
    p = malloc_aligned(ContextID);
    if (p == NULL) return NULL;  // Fix: was "return FALSE" from a pointer-returning function

    // Precompute tables
    FillFirstShaper(ContextID, p->Shaper1R, Curve1[0]);
    FillFirstShaper(ContextID, p->Shaper1G, Curve1[1]);
    FillFirstShaper(ContextID, p->Shaper1B, Curve1[2]);

    FillSecondShaper(ContextID, p->Shaper2R, Curve2[0]);
    FillSecondShaper(ContextID, p->Shaper2G, Curve2[1]);
    FillSecondShaper(ContextID, p->Shaper2B, Curve2[2]);

    // Convert matrix to float, transposed to column-major so the kernel can
    // load one column of the matrix with a single _mm_load_ps()
    for (i = 0; i < 3; i++) {
        for (j = 0; j < 3; j++) {
            p->Mat[j][i] = (cmsFloat32Number) Mat->v[i].n[j];
        }
    }

    // Offset vector, stored as the 4th column; zero when no offset is given
    for (i = 0; i < 3; i++) {

        if (Off == NULL) {
            p->Mat[3][i] = 0.0f;
        }
        else {
            p->Mat[3][i] = (cmsFloat32Number) Off->n[i];
        }
    }

    return p;
}
// A fast matrix-shaper evaluator for 8 bits. Each pixel is pushed through the
// input shaper (8 bit -> float), the 3x3 matrix + offset is applied with SSE,
// the result is clamped to 0..1, scaled to 1.14 fixed point, and used to index
// the output shaper tables (1.14 -> 8 bit). Alpha, when requested, is copied
// through untouched.
static
void MatShaperXform8SSE(cmsContext ContextID,
                        struct _cmstransform_struct *CMMcargo,
                        const void* Input,
                        void* Output,
                        cmsUInt32Number PixelsPerLine,
                        cmsUInt32Number LineCount,
                        const cmsStride* Stride)
{
    XMatShaper8Data* p = (XMatShaper8Data*) _cmsGetTransformUserData(CMMcargo);

    cmsUInt32Number i, ii;
    cmsUInt32Number SourceStartingOrder[cmsMAXCHANNELS];
    cmsUInt32Number SourceIncrements[cmsMAXCHANNELS];
    cmsUInt32Number DestStartingOrder[cmsMAXCHANNELS];
    cmsUInt32Number DestIncrements[cmsMAXCHANNELS];

    const cmsUInt8Number* rin;
    const cmsUInt8Number* gin;
    const cmsUInt8Number* bin;
    const cmsUInt8Number* ain = NULL;

    cmsUInt8Number* rout;
    cmsUInt8Number* gout;
    cmsUInt8Number* bout;
    cmsUInt8Number* aout = NULL;

    cmsUInt32Number nalpha, strideIn, strideOut;

    // One column of the (transposed) matrix per register; requires the
    // 16 byte alignment that malloc_aligned() established
    __m128 mat0 = _mm_load_ps(p->Mat[0]);
    __m128 mat1 = _mm_load_ps(p->Mat[1]);
    __m128 mat2 = _mm_load_ps(p->Mat[2]);
    __m128 mat3 = _mm_load_ps(p->Mat[3]);

    __m128 zero = _mm_setzero_ps();
    __m128 one = _mm_set1_ps(1.0f);
    __m128 scale = _mm_set1_ps((cmsFloat32Number)0x4000);

    // 16 byte aligned scratch (carved out of a stack buffer) for the
    // aligned _mm_store_si128 of the converted indexes
    cmsUInt8Number buffer[32];
    cmsUInt32Number* output_index = (cmsUInt32Number*)(((uintptr_t)buffer + 16) & ~0xf);

    // Fix: the per-line "prefetch" below reads the first pixel of every line
    // before the pixel loop runs; with zero pixels per line that read is out
    // of bounds. Nothing to do in that case anyway.
    if (PixelsPerLine == 0) return;

    _cmsComputeComponentIncrements(cmsGetTransformInputFormat(ContextID, (cmsHTRANSFORM)CMMcargo), Stride->BytesPerPlaneIn, NULL, &nalpha, SourceStartingOrder, SourceIncrements);
    _cmsComputeComponentIncrements(cmsGetTransformOutputFormat(ContextID, (cmsHTRANSFORM)CMMcargo), Stride->BytesPerPlaneOut, NULL, &nalpha, DestStartingOrder, DestIncrements);

    if (!(_cmsGetTransformFlags((cmsHTRANSFORM)CMMcargo) & cmsFLAGS_COPY_ALPHA))
        nalpha = 0;

    strideIn = strideOut = 0;
    for (i = 0; i < LineCount; i++) {

        rin = (const cmsUInt8Number*)Input + SourceStartingOrder[0] + strideIn;
        gin = (const cmsUInt8Number*)Input + SourceStartingOrder[1] + strideIn;
        bin = (const cmsUInt8Number*)Input + SourceStartingOrder[2] + strideIn;
        if (nalpha)
            ain = (const cmsUInt8Number*)Input + SourceStartingOrder[3] + strideIn;

        rout = (cmsUInt8Number*)Output + DestStartingOrder[0] + strideOut;
        gout = (cmsUInt8Number*)Output + DestStartingOrder[1] + strideOut;
        bout = (cmsUInt8Number*)Output + DestStartingOrder[2] + strideOut;
        if (nalpha)
            aout = (cmsUInt8Number*)Output + DestStartingOrder[3] + strideOut;

        /**
        * Prefetch: shape the first pixel of the line before entering the loop
        * so the table lookups overlap with the SSE math of the previous pixel
        */
        __m128 rvector = _mm_set1_ps(p->Shaper1R[*rin]);
        __m128 gvector = _mm_set1_ps(p->Shaper1G[*gin]);
        __m128 bvector = _mm_set1_ps(p->Shaper1B[*bin]);

        for (ii = 0; ii < PixelsPerLine; ii++) {

            // sum = r*col0 + g*col1 + b*col2 + offset, i.e. the full 3x3
            // matrix + offset applied to all three output channels at once
            __m128 el1 = _mm_mul_ps(rvector, mat0);
            __m128 el2 = _mm_mul_ps(gvector, mat1);
            __m128 el3 = _mm_mul_ps(bvector, mat2);

            __m128 sum = _mm_add_ps(el1, _mm_add_ps(el2, _mm_add_ps(el3, mat3)));

            // Clamp to 0..1, then scale to the 1.14 index range
            __m128 out = _mm_min_ps(_mm_max_ps(sum, zero), one);

            out = _mm_mul_ps(out, scale);

            /**
            * Rounding and converting to index.
            * Actually this is a costly instruction that may be blocking performance
            */
            _mm_store_si128((__m128i*)output_index, _mm_cvtps_epi32(out));

            // Handle alpha
            if (ain) {
                *aout = *ain;
            }

            rin += SourceIncrements[0];
            gin += SourceIncrements[1];
            bin += SourceIncrements[2];
            if (ain) ain += SourceIncrements[3];

            /**
            * Take next value whilst store is being performed
            */
            if (ii < PixelsPerLine - 1)
            {
                rvector = _mm_set1_ps(p->Shaper1R[*rin]);
                gvector = _mm_set1_ps(p->Shaper1G[*gin]);
                bvector = _mm_set1_ps(p->Shaper1B[*bin]);
            }

            *rout = p->Shaper2R[output_index[0]];
            *gout = p->Shaper2G[output_index[1]];
            *bout = p->Shaper2B[output_index[2]];

            rout += DestIncrements[0];
            gout += DestIncrements[1];
            bout += DestIncrements[2];
            if (aout) aout += DestIncrements[3];
        }

        strideIn += Stride->BytesPerLineIn;
        strideOut += Stride->BytesPerLineOut;
    }
}
// Runtime check for SSE2 support: CPUID leaf 1, EDX bit 26.
static
cmsBool IsSSE2Available(void)
{
#ifdef _MSC_VER
    int regs[4];

    __cpuid(regs, 1);
    return (regs[3] & (1 << 26)) ? TRUE : FALSE;
#else
    unsigned int a, b, c, d;
    const unsigned int sse2_mask = 1u << 26;

    // Leaf 1 must be supported before it can be queried
    if (__get_cpuid_max(0, NULL) < 1u)
        return FALSE;

    __cpuid_count(1, 0, a, b, c, d);
    return (d & sse2_mask) == sse2_mask;
#endif
}
// 8 bits on input allows matrix-shaper boost up a little bit.
// Collapses a shaper-matrix-matrix-shaper pipeline (3 channels in/out, 8 bit)
// into a single precomputed table + SSE matrix transform. On success takes
// ownership of *Lut (frees the old pipeline and replaces it) and returns TRUE;
// on any failure returns FALSE and leaves *Lut untouched.
cmsBool Optimize8MatrixShaperSSE(cmsContext ContextID,
                                 _cmsTransformFn* TransformFn,
                                 void** UserData,
                                 _cmsFreeUserDataFn* FreeUserData,
                                 cmsPipeline** Lut,
                                 cmsUInt32Number* InputFormat,
                                 cmsUInt32Number* OutputFormat,
                                 cmsUInt32Number* dwFlags)
{
    cmsStage* Curve1, *Curve2;
    cmsStage* Matrix1, *Matrix2;
    _cmsStageMatrixData* Data1;
    _cmsStageMatrixData* Data2;
    cmsMAT3 res;
    cmsBool IdentityMat = FALSE;
    cmsPipeline* Dest, *Src;
    cmsUInt32Number nChans;

    // Check for SSE2 support
    if (!(IsSSE2Available())) return FALSE;

    // Only works on 3 to 3, probably RGB
    if (!((T_CHANNELS(*InputFormat) == 3 && T_CHANNELS(*OutputFormat) == 3))) return FALSE;

    // Only works on 8 bit input
    if (T_BYTES(*InputFormat) != 1 || T_BYTES(*OutputFormat) != 1) return FALSE;

    // Seems suitable, proceed
    Src = *Lut;

    // Check for shaper-matrix-matrix-shaper structure, that is what this optimizer stands for
    if (!cmsPipelineCheckAndRetreiveStages(ContextID, Src, 4,
        cmsSigCurveSetElemType, cmsSigMatrixElemType, cmsSigMatrixElemType, cmsSigCurveSetElemType,
        &Curve1, &Matrix1, &Matrix2, &Curve2)) return FALSE;

    nChans = T_CHANNELS(*InputFormat);

    // Get both matrices, which are 3x3
    Data1 = (_cmsStageMatrixData*) cmsStageData(ContextID, Matrix1);
    Data2 = (_cmsStageMatrixData*) cmsStageData(ContextID, Matrix2);

    // Input offset should be zero
    if (Data1->Offset != NULL) return FALSE;

    // Multiply both matrices to get the result
    _cmsMAT3per(ContextID, &res, (cmsMAT3*) Data2->Double, (cmsMAT3*) Data1->Double);

    // Now the result is in res + Data2->Offset. Maybe is a plain identity?
    if (_cmsMAT3isIdentity(ContextID, &res) && Data2->Offset == NULL) {

        // We can get rid of full matrix
        IdentityMat = TRUE;
    }

    // Allocate an empty LUT
    Dest = cmsPipelineAlloc(ContextID, nChans, nChans);
    if (!Dest) return FALSE;

    // Assemble the new LUT. Fix: insertion results were previously ignored,
    // so a failed cmsStageDup/cmsStageAllocMatrix silently produced a
    // truncated pipeline. Bail out and free Dest instead.
    if (!cmsPipelineInsertStage(ContextID, Dest, cmsAT_BEGIN, cmsStageDup(ContextID, Curve1)))
        goto Error;

    if (!IdentityMat) {

        if (!cmsPipelineInsertStage(ContextID, Dest, cmsAT_END,
                cmsStageAllocMatrix(ContextID, 3, 3, (const cmsFloat64Number*) &res, Data2->Offset)))
            goto Error;
    }

    if (!cmsPipelineInsertStage(ContextID, Dest, cmsAT_END, cmsStageDup(ContextID, Curve2)))
        goto Error;

    // If identity on matrix, we can further optimize the curves, so call the join curves routine
    if (IdentityMat) {

        Optimize8ByJoiningCurves(ContextID, TransformFn, UserData, FreeUserData, &Dest, InputFormat, OutputFormat, dwFlags);
    }
    else {
        _cmsStageToneCurvesData* mpeC1 = (_cmsStageToneCurvesData*) cmsStageData(ContextID, Curve1);
        _cmsStageToneCurvesData* mpeC2 = (_cmsStageToneCurvesData*) cmsStageData(ContextID, Curve2);

        // In this particular optimization, cache does not help as it takes more time to deal with
        // the cache that with the pixel handling
        *dwFlags |= cmsFLAGS_NOCACHE;

        // Setup the optimization routines
        *UserData = SetMatShaper(ContextID, mpeC1->TheCurves, &res, (cmsVEC3*) Data2->Offset, mpeC2->TheCurves);
        if (*UserData == NULL)
            goto Error;  // Fix: OOM building the tables previously installed a NULL user data

        *FreeUserData = FreeMatShaper;
        *TransformFn = (_cmsTransformFn) MatShaperXform8SSE;
    }

    *dwFlags &= ~cmsFLAGS_CAN_CHANGE_FORMATTER;
    cmsPipelineFree(ContextID, Src);
    *Lut = Dest;
    return TRUE;

Error:
    cmsPipelineFree(ContextID, Dest);
    return FALSE;
}
#endif