Optimizaciones del código C - filtro bilateral -- performance campo con c campo con memory-optimization campo con vectorization campo con simd camp codereview Relacionados El problema

Optimizations of C Code - Bilateral Filter


3
vote

problema

Español

Estoy trabajando en una aproximación agradable para el filtro bilateral.
Tengo un código de trabajo que se ejecuta bastante rápido pero aún así, creo que se puede mejorar mucho.

El código ( C , compila con 9988777665544331 ) se le da (consulte el código en Compiler Explorer ):

  #define _USE_MATH_DEFINES  #include <stdio.h> #include <stdlib.h> #include <math.h> #include <memory.h> #include <omp.h>  #define OFF 0 #define ON  1  #include <immintrin.h> // AVX  #define SSE_STRIDE 4 #define SSE_ALIGNMENT 16  #define AVX_STRIDE 8 #define AVX_ALIGNMENT 32   #define M_PIf (float)(M_PI)  void ImageConvolutionGaussianKernel(float* mO, float* mI, float* mTmp, int numRows, int numCols, float gaussianStd, int stdToRadiusFactor); void InitOmegaArrays(float* mCOmega, float* mSOmega, float* mI, int numRows, int numCols, float paramOmega); void UpdateArrays(float* mO, float* mZ, float* mC, float* mS, float* mCFiltered, float* mSFiltered, int numRows, int numCols, int iterationIdx, float paramD); void InitArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols); void UpdateArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols); void UpdateOutput(float* mO, float* mZ, float* mI, int numRows, int numCols, float rangeStd, float paramL);   void BilateralFilterFastCompressive(float* mO, float* mI, int numRows, int numCols, float spatialStd, float rangeStd, int paramK) {     int ii, paramN;     float paramL, paramTau, *vParamD, *mZ, *mT, paramOmega, *mCOmega, *mSOmega, *mC, *mS, *mCFiltered, *mSFiltered;      mZ = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Should be initialized to Zero     mT = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Buffer     mC = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);     mS = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);     mCOmega = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);     mSOmega = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);     mCFiltered = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);     mSFiltered = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);      memset(mZ, 0.0f, numRows * numCols * sizeof(float));      // Init Parameters      paramL      = paramK * rangeStd;     paramTau    = paramK / M_PIf;     paramN      = ceilf((paramK * paramK) / M_PIf);     paramOmega  = M_PIf / paramL;      vParamD = (float*)_mm_malloc(paramN * sizeof(float), AVX_ALIGNMENT);      for (ii = 1; ii <= paramN; ii++)     {         vParamD[ii - 1] = 2 * expf(-(ii * ii) / (2 * paramTau * paramTau));     }      InitOmegaArrays(mCOmega, mSOmega, mI, numRows, numCols, paramOmega);      // Iteration Number 1     ii = 1;      ImageConvolutionGaussianKernel(mCFiltered, mCOmega, mT, numRows, numCols, spatialStd, paramK);     ImageConvolutionGaussianKernel(mSFiltered, mSOmega, mT, numRows, numCols, spatialStd, paramK);      UpdateArrays(mO, mZ, mCOmega, mSOmega, mCFiltered, mSFiltered, numRows, numCols, ii, vParamD[ii - 1]);      // Iteration Number 2     ii = 2;     InitArraysSC(mC, mS, mCOmega, mSOmega, numRows, numCols);      ImageConvolutionGaussianKernel(mCFiltered, mC, mT, numRows, numCols, spatialStd, paramK);     ImageConvolutionGaussianKernel(mSFiltered, mS, mT, numRows, numCols, spatialStd, paramK);      UpdateArrays(mO, mZ, mC, mS, mCFiltered, mSFiltered, numRows, numCols, ii, vParamD[ii - 1]);      for (ii = 3; ii <= paramN; ii++)     {         UpdateArraysSC(mC, mS, mCOmega, mSOmega, numRows, numCols);          ImageConvolutionGaussianKernel(mCFiltered, mC, mT, numRows, numCols, spatialStd, paramK);         ImageConvolutionGaussianKernel(mSFiltered, mS, mT, numRows, numCols, spatialStd, paramK);          UpdateArrays(mO, mZ, mC, mS, mCFiltered, mSFiltered, numRows, numCols, ii, vParamD[ii - 1]);     }      UpdateOutput(mO, mZ, mI, numRows, numCols, rangeStd, paramL);      _mm_free(mZ);     _mm_free(mT);     _mm_free(mC);     _mm_free(mS);     _mm_free(mCOmega);     _mm_free(mSOmega);     _mm_free(mCFiltered);     _mm_free(mSFiltered);     _mm_free(vParamD);  }  // Auxiliary Functions void InitOmegaArrays(float* mCOmega, float* mSOmega, float* mI, int numRows, int numCols, float paramOmega) {      int ii;       for (ii = 0; ii < numRows * numCols; ii++)     {         mCOmega[ii] = cosf(paramOmega * mI[ii]);         mSOmega[ii] = sinf(paramOmega * mI[ii]);     }  }   void UpdateArrays(float* mO, float* mZ, float* mC, float* mS, float* mCFiltered, float* mSFiltered, int numRows, int numCols, int iterationIdx, float paramD) {      int ii;      for (ii = 0; ii < numRows * numCols; ii++)     {         mO[ii] += (iterationIdx * paramD) * (mC[ii] * mSFiltered[ii] - mS[ii] * mCFiltered[ii]);         mZ[ii] += paramD * (mC[ii] * mCFiltered[ii] + mS[ii] * mSFiltered[ii]);     }   }   void InitArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols) {      int ii;      for (ii = 0; ii < numRows * numCols; ii++)     {         mS[ii] = 2.0f * mCOmega[ii] * mSOmega[ii];         mC[ii] = 2.0f * mCOmega[ii] * mCOmega[ii] - 1.0f;     }   }   void UpdateArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols) {      int ii;     float varTmp;      for (ii = 0; ii < numRows * numCols; ii++)     {         varTmp = mC[ii] * mSOmega[ii] + mS[ii] * mCOmega[ii];         mC[ii] = mC[ii] * mCOmega[ii] - mS[ii] * mSOmega[ii];         mS[ii] = varTmp;     }   }   void UpdateOutput(float* mO, float* mZ, float* mI, int numRows, int numCols, float rangeStd, float paramL) {      int ii;     float outFactor;      outFactor = (M_PIf * rangeStd * rangeStd) / paramL;      for (ii = 0; ii < numRows * numCols; ii++)     {         mO[ii] = mI[ii] + (outFactor * (mO[ii] / (1.0f + mZ[ii])));     }   }   

Básicamente, pocos iteraciones de la falta de definición gaussiana y las operaciones sabias.

Estoy probando el código en la imagen del tamaño 8000 x 8000 CON spatialStd = 5 , rangeStd = 10 / 2555 9988776655544336 . < br /> Este conjunto de parámetros significa el número de iteraciones ( 9988776655544337 en el código) es 8 (Blur gaussiano se realiza dos veces por iteración).
Mi Implementación de desenfoque gaussiana toma ~ 0.18 [seg] por iteración en la configuración anterior cuando se prueba de forma independiente.

Los problemas que estoy teniendo con el código:

  1. por iteración, parece que todo el tiempo las operaciones sabias del elemento llevan más que la falta de definición gaussiana. Me parece que algo no es eficiente con el código.
  2. Overhead: si ejecuto el código saltando la falta de definición gaussiana, se tarda ~ 1.8 [SEC]. Pensando que 16 x 0.18 = 2.88 [SEC] (16 iteraciones del desenfoque gaussiano) significa que debería esperar tiempo de ejecución de ~ 5 [SEC]. En la práctica obtengo ~ 7 [SEC]. Significa que hay enorme espalda en algún lugar allí.

lo que he intentado:

  1. escribiendo toda la función pequeña usando AVX2 SIMD Intrinsics . Sin embargo, parece que tengo solo un 3-5% sobre el compilador (uso el compilador Intel).
  2. Uso del #pragma vector aligned para forzar el compilador para vectorizar el código y asumir ningún alias y sin problemas de alineación. Produce resultados que son 3-5% más lentos que la mano afinada que hice (ver 1).
  3. Deshabilitar OpenMP (no mejoró).
  4. Varios compiladores (MSVC, GCC 7.3 / 8.1, Compilador Intel 2018 [fue mejor]). Los resultados anteriores son los mejores logrados (compilador Intel). Intenté -Ofast y C990 en GCC. Usando C991 en Intel. FP Precision se establece en Rápido.

Me alegraría de obtener algunos comentarios sobre eso.

comentario
El código es parte de un trabajo en la universidad y se publicará en GitHub una vez que esté listo.

Original en ingles

I'm working on a nice approximation for the Bilateral Filter.
I have a working code which runs pretty fast yet still I think much can be improved.

The code (C Code, compiles with C99) is given by (See code on Compiler Explorer):

#define _USE_MATH_DEFINES  #include <stdio.h> #include <stdlib.h> #include <math.h> #include <memory.h> #include <omp.h>  #define OFF 0 #define ON  1  #include <immintrin.h> // AVX  #define SSE_STRIDE 4 #define SSE_ALIGNMENT 16  #define AVX_STRIDE 8 #define AVX_ALIGNMENT 32   #define M_PIf (float)(M_PI)  void ImageConvolutionGaussianKernel(float* mO, float* mI, float* mTmp, int numRows, int numCols, float gaussianStd, int stdToRadiusFactor); void InitOmegaArrays(float* mCOmega, float* mSOmega, float* mI, int numRows, int numCols, float paramOmega); void UpdateArrays(float* mO, float* mZ, float* mC, float* mS, float* mCFiltered, float* mSFiltered, int numRows, int numCols, int iterationIdx, float paramD); void InitArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols); void UpdateArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols); void UpdateOutput(float* mO, float* mZ, float* mI, int numRows, int numCols, float rangeStd, float paramL);   void BilateralFilterFastCompressive(float* mO, float* mI, int numRows, int numCols, float spatialStd, float rangeStd, int paramK) {     int ii, paramN;     float paramL, paramTau, *vParamD, *mZ, *mT, paramOmega, *mCOmega, *mSOmega, *mC, *mS, *mCFiltered, *mSFiltered;      mZ = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Should be initialized to Zero     mT = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Buffer     mC = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);     mS = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);     mCOmega = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);     mSOmega = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);     mCFiltered = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);     mSFiltered = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);      memset(mZ, 0.0f, numRows * numCols * sizeof(float));      // Init Parameters      paramL      = paramK * rangeStd;     paramTau    = paramK / M_PIf;     paramN      = ceilf((paramK * paramK) / M_PIf);     paramOmega  = M_PIf / paramL;      vParamD = (float*)_mm_malloc(paramN * sizeof(float), AVX_ALIGNMENT);      for (ii = 1; ii <= paramN; ii++)     {         vParamD[ii - 1] = 2 * expf(-(ii * ii) / (2 * paramTau * paramTau));     }      InitOmegaArrays(mCOmega, mSOmega, mI, numRows, numCols, paramOmega);      // Iteration Number 1     ii = 1;      ImageConvolutionGaussianKernel(mCFiltered, mCOmega, mT, numRows, numCols, spatialStd, paramK);     ImageConvolutionGaussianKernel(mSFiltered, mSOmega, mT, numRows, numCols, spatialStd, paramK);      UpdateArrays(mO, mZ, mCOmega, mSOmega, mCFiltered, mSFiltered, numRows, numCols, ii, vParamD[ii - 1]);      // Iteration Number 2     ii = 2;     InitArraysSC(mC, mS, mCOmega, mSOmega, numRows, numCols);      ImageConvolutionGaussianKernel(mCFiltered, mC, mT, numRows, numCols, spatialStd, paramK);     ImageConvolutionGaussianKernel(mSFiltered, mS, mT, numRows, numCols, spatialStd, paramK);      UpdateArrays(mO, mZ, mC, mS, mCFiltered, mSFiltered, numRows, numCols, ii, vParamD[ii - 1]);      for (ii = 3; ii <= paramN; ii++)     {         UpdateArraysSC(mC, mS, mCOmega, mSOmega, numRows, numCols);          ImageConvolutionGaussianKernel(mCFiltered, mC, mT, numRows, numCols, spatialStd, paramK);         ImageConvolutionGaussianKernel(mSFiltered, mS, mT, numRows, numCols, spatialStd, paramK);          UpdateArrays(mO, mZ, mC, mS, mCFiltered, mSFiltered, numRows, numCols, ii, vParamD[ii - 1]);     }      UpdateOutput(mO, mZ, mI, numRows, numCols, rangeStd, paramL);      _mm_free(mZ);     _mm_free(mT);     _mm_free(mC);     _mm_free(mS);     _mm_free(mCOmega);     _mm_free(mSOmega);     _mm_free(mCFiltered);     _mm_free(mSFiltered);     _mm_free(vParamD);  }  // Auxiliary Functions void InitOmegaArrays(float* mCOmega, float* mSOmega, float* mI, int numRows, int numCols, float paramOmega) {      int ii;       for (ii = 0; ii < numRows * numCols; ii++)     {         mCOmega[ii] = cosf(paramOmega * mI[ii]);         mSOmega[ii] = sinf(paramOmega * mI[ii]);     }  }   void UpdateArrays(float* mO, float* mZ, float* mC, float* mS, float* mCFiltered, float* mSFiltered, int numRows, int numCols, int iterationIdx, float paramD) {      int ii;      for (ii = 0; ii < numRows * numCols; ii++)     {         mO[ii] += (iterationIdx * paramD) * (mC[ii] * mSFiltered[ii] - mS[ii] * mCFiltered[ii]);         mZ[ii] += paramD * (mC[ii] * mCFiltered[ii] + mS[ii] * mSFiltered[ii]);     }   }   void InitArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols) {      int ii;      for (ii = 0; ii < numRows * numCols; ii++)     {         mS[ii] = 2.0f * mCOmega[ii] * mSOmega[ii];         mC[ii] = 2.0f * mCOmega[ii] * mCOmega[ii] - 1.0f;     }   }   void UpdateArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols) {      int ii;     float varTmp;      for (ii = 0; ii < numRows * numCols; ii++)     {         varTmp = mC[ii] * mSOmega[ii] + mS[ii] * mCOmega[ii];         mC[ii] = mC[ii] * mCOmega[ii] - mS[ii] * mSOmega[ii];         mS[ii] = varTmp;     }   }   void UpdateOutput(float* mO, float* mZ, float* mI, int numRows, int numCols, float rangeStd, float paramL) {      int ii;     float outFactor;      outFactor = (M_PIf * rangeStd * rangeStd) / paramL;      for (ii = 0; ii < numRows * numCols; ii++)     {         mO[ii] = mI[ii] + (outFactor * (mO[ii] / (1.0f + mZ[ii])));     }   } 

Basically few iterations of Gaussian Blur and Element Wise operations.

I'm testing the code on image of size 8000 x 8000 with spatialStd = 5, rangeStd = 10 / 255 and paramK = 5.
This set of parameters means the number of iterations (paramN in the code) is 8 (Gaussian Blur is done twice per iteration).
My Gaussian Blur implementation takes ~0.18 [Sec] per iteration in the settings above when tested independently.

The issues I'm having with the code:

  1. Per iteration, it seems the time all Element Wise operations takes more than the Gaussian Blur. It seems to me something isn't efficient with the code.
  2. Overhead - If I run the code skipping the Gaussian Blur it takes ~1.8 [Sec]. Thinking that 16 x 0.18 = 2.88 [Sec] (16 iterations of the Gaussian Blur) means I should expect run time of ~5 [Sec]. In practice I get ~ 7 [Sec]. It means there is huge overhead somewhere there.

What I've tried:

  1. Writing all small function using AVX2 SIMD intrinsics. Yet it seems I gain only 3-5% over the compiler (I use Intel Compiler).
  2. Using the #pragma vector aligned to force compiler to vectorize the code and assume no aliasing and no alignment issues. It yields results which are 3-5% slower than the hand tuned I did (See 1).
  3. Disabling OpenMP (Didn't improve).
  4. Various compilers (MSVC, GCC 7.3 / 8.1, Intel Compiler 2018 [Was best]). Results above are the best achieved (Intel Compiler). I tried -Ofast and -O3 on GCC. Using O3 on Intel. FP Precision is set to fast.

I'd be happy to get some feedback on that.

Remark
The code is part of a work at University and will be published on GitHub once it is ready.

              
       
       

Lista de respuestas

1
 
vote

¡El código publicado no compila! Por favor, poste el código que compila. Al compilar, siempre habilite las advertencias, luego fije esas advertencias. (para gcc , a un uso mínimo: -Wall -Wextra -Wconversion -pedantic -std=gnu177 ) NOTA, Otros compiladores tienen un conjunto diferente de opciones para lograr lo mismo

Aquí es lo que produce el compilador cuando se recibe el código publicado:

  gcc -ggdb -Wall -Wextra -Wconversion -pedantic -std=gnu11 -c "untitled.c"   untitled.c: In function ‘BilateralFilterFastCompressive’:  untitled.c:15:18: warning: implicit declaration of function ‘_mm_malloc’ [-Wimplicit-function-declaration]      mZ = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Should be initialized to Zero                   ^~~~~~~~~~  untitled.c:15:47: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]      mZ = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Should be initialized to Zero                                                 ^ untitled.c:15:64: error: ‘AVX_ALIGNMENT’ undeclared (first use in this function)      mZ = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Should be initialized to Zero                                                                 ^~~~~~~~~~~~~  untitled.c:15:64: note: each undeclared identifier is reported only once for each function it appears in  untitled.c:16:47: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]      mT = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Buffer                                                ^  untitled.c:17:47: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]      mC = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);                                                ^  untitled.c:18:47: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]      mS = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);                                                ^  untitled.c:19:52: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]      mCOmega = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);                                                     ^  untitled.c:20:52: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]      mSOmega = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);                                                     ^  untitled.c:21:55: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]      mCFiltered = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);                                                        ^  untitled.c:22:55: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]      mSFiltered = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);                                                        ^  untitled.c:24:5: warning: implicit declaration of function ‘memset’ [-Wimplicit-function-declaration]      memset(mZ, 0.0f, numRows * numCols * sizeof(float));      ^~~~~~  untitled.c:24:5: warning: incompatible implicit declaration of built-in function ‘memset’  untitled.c:24:5: note: include ‘<string.h>’ or provide a declaration of ‘memset’  untitled.c:24:40: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]      memset(mZ, 0.0f, numRows * numCols * sizeof(float));                                         ^  untitled.c:28:26: warning: conversion to ‘float’ from ‘int’ may alter its value [-Wconversion]      paramL      = paramK * rangeStd;                           ^  untitled.c:1:23: error: ‘M_PI’ undeclared (first use in this function); did you mean ‘M_PIf’?  #define M_PIf (float)(M_PI)                        ^  untitled.c:29:28: note: in expansion of macro ‘M_PIf’      paramTau    = paramK / M_PIf;                             ^~~~~  untitled.c:30:19: warning: implicit declaration of function ‘ceilf’ [-Wimplicit-function-declaration]      paramN      = ceilf((paramK * paramK) / M_PIf);                    ^~~~~  untitled.c:30:19: warning: incompatible implicit declaration of built-in function ‘ceilf’  untitled.c:30:19: note: include ‘<math.h>’ or provide a declaration of ‘ceilf’  untitled.c:33:41: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Wsign-conversion]      vParamD = (float*)_mm_malloc(paramN * sizeof(float), AVX_ALIGNMENT);                                          ^  untitled.c:37:31: warning: implicit declaration of function ‘expf’ [-Wimplicit-function-declaration]          vParamD[ii - 1] = 2 * expf(-(ii * ii) / (2 * paramTau * paramTau));                                ^~~~  untitled.c:37:31: warning: incompatible implicit declaration of built-in function ‘expf’  untitled.c:37:31: note: include ‘<math.h>’ or provide a declaration of ‘expf’  untitled.c:37:47: warning: conversion to ‘float’ from ‘int’ may alter its value [-Wconversion]          vParamD[ii - 1] = 2 * expf(-(ii * ii) / (2 * paramTau * paramTau));                                                ^  untitled.c:45:5: warning: implicit declaration of function ‘ImageConvolutionGaussianKernel’ [-Wimplicit-function-declaration]      ImageConvolutionGaussianKernel(mCFiltered, mCOmega, mT, numRows, numCols, spatialStd, paramK);      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  untitled.c:71:5: warning: implicit declaration of function ‘_mm_free’ [-Wimplicit-function-declaration]      _mm_free(mZ);      ^~~~~~~~  untitled.c: In function ‘InitOmegaArrays’:  untitled.c:91:23: warning: implicit declaration of function ‘cosf’ [-Wimplicit-function-declaration]          mCOmega[ii] = cosf(paramOmega * mI[ii]);                        ^~~~  untitled.c:91:23: warning: incompatible implicit declaration of built-in function ‘cosf’  untitled.c:91:23: note: include ‘<math.h>’ or provide a declaration of ‘cosf’  untitled.c:92:23: warning: implicit declaration of function ‘sinf’ [-Wimplicit-function-declaration]          mSOmega[ii] = sinf(paramOmega * mI[ii]);                        ^~~~  untitled.c:92:23: warning: incompatible implicit declaration of built-in function ‘sinf’  untitled.c:92:23: note: include ‘<math.h>’ or provide a declaration of ‘sinf’  untitled.c: In function ‘UpdateArrays’:  untitled.c:104:33: warning: conversion to ‘float’ from ‘int’ may alter its value [-Wconversion]          mO[ii] += (iterationIdx * paramD) * (mC[ii] * mSFiltered[ii] - mS[ii] * mCFiltered[ii]);                                  ^  untitled.c: In function ‘UpdtaeOutput’:  untitled.c:147:28: error: ‘outFactor’ undeclared (first use in this function)          mO[ii] = mI[ii] + (outFactor * (mO[ii] / (1.0f + mZ[ii])));                             ^~~~~~~~~  untitled.c:141:84: warning: unused parameter ‘rangeStd’ [-Wunused-parameter]  void UpdtaeOutput(float* mO, float* mZ, float* mI, int numRows, int numCols, float rangeStd, float paramL) {                                                                                     ^~~~~~~~  untitled.c:141:100: warning: unused parameter ‘paramL’ [-Wunused-parameter]  void UpdtaeOutput(float* mO, float* mZ, float* mI, int numRows, int numCols, float rangeStd, float paramL) {                                                                                                     ^~~~~~  Compilation failed.   

Por supuesto, si ha publicado las declaraciones #include para los archivos de encabezado necesarios que habrían ayudado mucho.

Sugerir fuertemente honrar el margen derecho (generalmente columna 72 u 80) al romper / sangrar las líneas, similar a:

  return0  

Entonces el código sería mucho más fácil de leer y entender.

Hay muchas líneas en blanco "aleatorias" en el código publicado. Para facilitar la legibilidad y la comprensión, 1) bloques de código separados ( return1 99887776655443313 return5 < / Código> return6 return7 a través de una sola línea en blanco. 2) Funciones separadas por 2 o 3 líneas en blanco (ser consistentes) 3) Siga El Axioma: solo una declaración por línea y (a lo sumo) una declaración variable por declaración.

Si, por return9 Se está realizando una llamada a void0 : 1) en C El tipo devuelto es 99887776655443321 que se puede asignar a cualquier puntero. El lanzamiento simplemente agota el código, lo que hace que sea más difícil de entender, depurar, etc. 2) ¡Siempre verifique (! = Nulo) el valor devuelto para asegurar que la operación fue exitosa. 3) void2 ya se asigna a la alineación máxima

Para obtener la velocidad máxima de ejecución, sugiera configurar el nivel de optimización al valor máximo. ES DECIR. void3 EN void4

con respecto a:

  void5  

1) VARTMP se puede eliminar asignando la primera declaración directamente a void6

2) Aunque el compilador debe optimizar la instrucción 99887766655443327 , sugiera esto:

  void8  

CAMBIAR A:

  void9  
 

the posted code does not compile! Please post code that compiles. When compiling, always enable the warnings, then fix those warnings. (for gcc, at a minimum use: -Wall -Wextra -Wconversion -pedantic -std=gnu17 ) Note, other compilers have a different set of options to accomplish the same thing

here is what the compiler outputs when given the posted code:

gcc -ggdb -Wall -Wextra -Wconversion -pedantic -std=gnu11 -c "untitled.c"   untitled.c: In function xe2x80x98BilateralFilterFastCompressivexe2x80x99:  untitled.c:15:18: warning: implicit declaration of function xe2x80x98_mm_mallocxe2x80x99 [-Wimplicit-function-declaration]      mZ = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Should be initialized to Zero                   ^~~~~~~~~~  untitled.c:15:47: warning: conversion to xe2x80x98long unsigned intxe2x80x99 from xe2x80x98intxe2x80x99 may change the sign of the result [-Wsign-conversion]      mZ = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Should be initialized to Zero                                                 ^ untitled.c:15:64: error: xe2x80x98AVX_ALIGNMENTxe2x80x99 undeclared (first use in this function)      mZ = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Should be initialized to Zero                                                                 ^~~~~~~~~~~~~  untitled.c:15:64: note: each undeclared identifier is reported only once for each function it appears in  untitled.c:16:47: warning: conversion to xe2x80x98long unsigned intxe2x80x99 from xe2x80x98intxe2x80x99 may change the sign of the result [-Wsign-conversion]      mT = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Buffer                                                ^  untitled.c:17:47: warning: conversion to xe2x80x98long unsigned intxe2x80x99 from xe2x80x98intxe2x80x99 may change the sign of the result [-Wsign-conversion]      mC = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);                                                ^  untitled.c:18:47: warning: conversion to xe2x80x98long unsigned intxe2x80x99 from xe2x80x98intxe2x80x99 may change the sign of the result [-Wsign-conversion]      mS = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);                                                ^  untitled.c:19:52: warning: conversion to xe2x80x98long unsigned intxe2x80x99 from xe2x80x98intxe2x80x99 may change the sign of the result [-Wsign-conversion]      mCOmega = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);                                                     ^  untitled.c:20:52: warning: conversion to xe2x80x98long unsigned intxe2x80x99 from xe2x80x98intxe2x80x99 may change the sign of the result [-Wsign-conversion]      mSOmega = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);                                                     ^  untitled.c:21:55: warning: conversion to xe2x80x98long unsigned intxe2x80x99 from xe2x80x98intxe2x80x99 may change the sign of the result [-Wsign-conversion]      mCFiltered = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);                                                        ^  untitled.c:22:55: warning: conversion to xe2x80x98long unsigned intxe2x80x99 from xe2x80x98intxe2x80x99 may change the sign of the result [-Wsign-conversion]      mSFiltered = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);                                                        ^  untitled.c:24:5: warning: implicit declaration of function xe2x80x98memsetxe2x80x99 [-Wimplicit-function-declaration]      memset(mZ, 0.0f, numRows * numCols * sizeof(float));      ^~~~~~  untitled.c:24:5: warning: incompatible implicit declaration of built-in function xe2x80x98memsetxe2x80x99  untitled.c:24:5: note: include xe2x80x98<string.h>xe2x80x99 or provide a declaration of xe2x80x98memsetxe2x80x99  untitled.c:24:40: warning: conversion to xe2x80x98long unsigned intxe2x80x99 from xe2x80x98intxe2x80x99 may change the sign of the result [-Wsign-conversion]      memset(mZ, 0.0f, numRows * numCols * sizeof(float));                                         ^  untitled.c:28:26: warning: conversion to xe2x80x98floatxe2x80x99 from xe2x80x98intxe2x80x99 may alter its value [-Wconversion]      paramL      = paramK * rangeStd;                           ^  untitled.c:1:23: error: xe2x80x98M_PIxe2x80x99 undeclared (first use in this function); did you mean xe2x80x98M_PIfxe2x80x99?  #define M_PIf (float)(M_PI)                        ^  untitled.c:29:28: note: in expansion of macro xe2x80x98M_PIfxe2x80x99      paramTau    = paramK / M_PIf;                             ^~~~~  untitled.c:30:19: warning: implicit declaration of function xe2x80x98ceilfxe2x80x99 [-Wimplicit-function-declaration]      paramN      = ceilf((paramK * paramK) / M_PIf);                    ^~~~~  untitled.c:30:19: warning: incompatible implicit declaration of built-in function xe2x80x98ceilfxe2x80x99  untitled.c:30:19: note: include xe2x80x98<math.h>xe2x80x99 or provide a declaration of xe2x80x98ceilfxe2x80x99  untitled.c:33:41: warning: conversion to xe2x80x98long unsigned intxe2x80x99 from xe2x80x98intxe2x80x99 may change the sign of the result [-Wsign-conversion]      vParamD = (float*)_mm_malloc(paramN * sizeof(float), AVX_ALIGNMENT);                                          ^  untitled.c:37:31: warning: implicit declaration of function xe2x80x98expfxe2x80x99 [-Wimplicit-function-declaration]          vParamD[ii - 1] = 2 * expf(-(ii * ii) / (2 * paramTau * paramTau));                                ^~~~  untitled.c:37:31: warning: incompatible implicit declaration of built-in function xe2x80x98expfxe2x80x99  untitled.c:37:31: note: include xe2x80x98<math.h>xe2x80x99 or provide a declaration of xe2x80x98expfxe2x80x99  untitled.c:37:47: warning: conversion to xe2x80x98floatxe2x80x99 from xe2x80x98intxe2x80x99 may alter its value [-Wconversion]          vParamD[ii - 1] = 2 * expf(-(ii * ii) / (2 * paramTau * paramTau));                                                ^  untitled.c:45:5: warning: implicit declaration of function xe2x80x98ImageConvolutionGaussianKernelxe2x80x99 [-Wimplicit-function-declaration]      ImageConvolutionGaussianKernel(mCFiltered, mCOmega, mT, numRows, numCols, spatialStd, paramK);      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  untitled.c:71:5: warning: implicit declaration of function xe2x80x98_mm_freexe2x80x99 [-Wimplicit-function-declaration]      _mm_free(mZ);      ^~~~~~~~  untitled.c: In function xe2x80x98InitOmegaArraysxe2x80x99:  untitled.c:91:23: warning: implicit declaration of function xe2x80x98cosfxe2x80x99 [-Wimplicit-function-declaration]          mCOmega[ii] = cosf(paramOmega * mI[ii]);                        ^~~~  untitled.c:91:23: warning: incompatible implicit declaration of built-in function xe2x80x98cosfxe2x80x99  untitled.c:91:23: note: include xe2x80x98<math.h>xe2x80x99 or provide a declaration of xe2x80x98cosfxe2x80x99  untitled.c:92:23: warning: implicit declaration of function xe2x80x98sinfxe2x80x99 [-Wimplicit-function-declaration]          mSOmega[ii] = sinf(paramOmega * mI[ii]);                        ^~~~  untitled.c:92:23: warning: incompatible implicit declaration of built-in function xe2x80x98sinfxe2x80x99  untitled.c:92:23: note: include xe2x80x98<math.h>xe2x80x99 or provide a declaration of xe2x80x98sinfxe2x80x99  untitled.c: In function xe2x80x98UpdateArraysxe2x80x99:  untitled.c:104:33: warning: conversion to xe2x80x98floatxe2x80x99 from xe2x80x98intxe2x80x99 may alter its value [-Wconversion]          mO[ii] += (iterationIdx * paramD) * (mC[ii] * mSFiltered[ii] - mS[ii] * mCFiltered[ii]);                                  ^  untitled.c: In function xe2x80x98UpdtaeOutputxe2x80x99:  untitled.c:147:28: error: xe2x80x98outFactorxe2x80x99 undeclared (first use in this function)          mO[ii] = mI[ii] + (outFactor * (mO[ii] / (1.0f + mZ[ii])));                             ^~~~~~~~~  untitled.c:141:84: warning: unused parameter xe2x80x98rangeStdxe2x80x99 [-Wunused-parameter]  void UpdtaeOutput(float* mO, float* mZ, float* mI, int numRows, int numCols, float rangeStd, float paramL) {                                                                                     ^~~~~~~~  untitled.c:141:100: warning: unused parameter xe2x80x98paramLxe2x80x99 [-Wunused-parameter]  void UpdtaeOutput(float* mO, float* mZ, float* mI, int numRows, int numCols, float rangeStd, float paramL) {                                                                                                     ^~~~~~  Compilation failed. 

Of course, if you had posted the #include statements for the needed header files that would have helped a lot.

Strongly suggest honoring the right margin (usually column 72 or 80) by breaking/indenting the lines, similar to:

mSFiltered = (float*)_mm_malloc(     numRows * numCols * sizeof(float), AVX_ALIGNMENT);   

then the code would be much easier to read and understand.

There are a lot of 'random' blank lines in the posted code. For ease of readability and understanding, 1) separate code blocks ( for if else while do...while switch case default via a single blank line. 2) separate functions by 2 or 3 blank lines (be consistent) 3) follow the axiom: only one statement per line and (at most) one variable declaration per statement.

if, by _mm_malloc a call to malloc() is being performed then: 1) in C the returned type is void* which can be assigned to any pointer. Casting just clutters the code, making it more difficult to understand, debug, etc. 2) always check (!=NULL) the returned value to assure the operation was successful. 3) malloc() already allocates at the maximum alignment

To obtain the maximum speed of execution, Suggest setting the optimization level to the max value. I.E. -o3 in gcc

regarding:

for (ii = 0; ii < numRows * numCols; ii++) {     varTmp = mC[ii] * mSOmega[ii] + mS[ii] * mCOmega[ii];     mC[ii] = mC[ii] * mCOmega[ii] - mS[ii] * mSOmega[ii];     mS[ii] = varTmp; } 

1) varTmp can be eliminated by assigning the first statement directly to mS[ii]

2) Although the compiler should optimize the for() statement, suggest this:

for (ii = 0; ii < numRows * numCols; ii++) 

be changed to:

int size = numRows * numCols; for (ii = 0; ii < size; ii++) 
 
 
     
     

Relacionados problema

-3  Convertir la matriz de `flotan32` (` flotando`) a la matriz de `uint8` (` char 'unesigned) usando avx2 [cerrado]  ( Converting array of float32 float to array of uint8 unsigned char us ) 
cerrado. Esta pregunta es off-topic . Actualmente no está aceptando respuestas. ¿Quieres ...

5  Encontrar el valor mínimo de una matriz usando SIMD  ( Finding min value of an array using simd ) 
Tengo el siguiente código para encontrar el valor mínimo 9988776665544339 en una matriz. Está utilizando AAPL0 para realizar un SIMD MIN en trozos de la m...

1  Computación de vectores de la base del espacio tangente para una malla arbitraria  ( Computing tangent space basis vectors for an arbitrary mesh ) 
Esto es más como una parte y una solicitud que una pregunta. Convidí el código de Eric Lengyel, que calcula las tangentes de una malla con el fin de la textur...

2  Escriba mapa de bits de 16x16 al búfer de marco  ( Write 16x16 bitmap to frame buffer ) 
El siguiente código escribe un mapa de bits de 16x16 a un frameBuffer usando hasta AVX2 . Estoy seguro de que se puede mejorar con AVX512 , pero solo estoy ...

4  Optimización de SSE para remuestreo de audio [cerrado]  ( Sse optimisation for audio resampling ) 
cerrado. Esta pregunta es off-topic . Actualmente no está aceptando respuestas. ¿Quieres ...

0  Convertir la matriz de `uint8` (` char 'unesigned) a la matriz de `flotan32` (` float`) usando avx2  ( Converting array of uint8 unsigned char to array of float32 float us ) 
Dada matriz de entrada de Sales4 ( Import25 ) ¿Cómo podría uno convertirlo de manera eficiente en la matriz de Sales7 ( Sales8 )? Por ejemplo, aquí hay ...

3  Sonido lineal 16 elementos HashMap en paralelo con ICTRINSICS SIMD  ( Linear probing 16 hashmap elements in parallel using simd instrinsics ) 
Para mi implementación de HashMap, estoy almacenando en caché los hashes para cada elemento de mapa en una matriz de longitud Dictionary3 , donde cada elemen...

1  Sin ganancia de rendimiento con vector SIMD 3D para Ray Tracer  ( No performance gain with simd 3d vector for ray tracer ) 
Escribí dos versiones de clase de vectores para mi trazador de rayos. Versión no SIMD y versión SIMD. Puedes encontrar el código a continuación. Quiero pregun...

8  Instrucción de SSE para verificar si la matriz de bytes es ceros C #  ( Sse instruction to check if byte array is zeroes c ) 
Mi problema fundamental es cómo verificar si byte[] está lleno de ceros. Publiqué una gama de implementaciones (con tiempos) y uno late claramente a los d...

4  Cálculo de SIMD Mandelbrot  ( Simd mandelbrot calculation ) 
Me estaba arruinando con los shaders de cómputo GPU el otro día y creó un sombreador de Mandelbrot. Desafortunadamente, el metal no admite la doble precisión ...




© 2022 respuesta.top Reservados todos los derechos. Centro de preguntas y respuestas reservados todos los derechos