积分图的SSE和AVX2优化

153次阅读

共计 7179 个字符，预计需要花费 18 分钟才能阅读完成。

在图像处理中，积分图的应用在某些场合可以带来极高的效率优化，但是积分图本身的计算比较耗时，需要优化。

void IntegralF64(Mat src, Mat &integal_out)   
{
            
  Mat tmp(src.size(), CV_64FC1, 0.0);

  tmp.ptr<double>(0)[0] = (double)src.ptr<uchar>(0)[0];
  for (int i = 1; i < src.cols; i++)   //第一行    
  {
            
    tmp.ptr<double>(0)[i] = tmp.ptr<double>(0)[i - 1] + src.ptr<uchar>(0)[i];
  }

  for (int i = 1; i < src.rows; i++)  //第一列    
  {
            
    tmp.ptr<double>(i)[0] = tmp.ptr<double>(i - 1)[0] + src.ptr<uchar>(i)[0];
  }

  for (int i = 1; i < src.rows; i++)   //第i行    
  {
            
    for (int j = 1; j < src.cols; j++)   //第j列       
    {
            
      tmp.ptr<double>(i)[j] = tmp.ptr<double>(i)[j - 1] + tmp.ptr<double>(i - 1)[j] - tmp.ptr<double>(i - 1)[j - 1] + src.ptr<uchar>(i)[j];
    }
  }
  tmp.copyTo(integal_out);
}

该方法，对4k*2k的图，耗时在70ms。

void IntegralF32(Mat src, Mat &integal_out)
{
            
  Mat tmp(src.size(), CV_32FC1, 0.0);

  tmp.ptr<float>(0)[0] = (float)src.ptr<uchar>(0)[0];
  for (int i = 1; i < src.cols; i++)   //第一行    
  {
            
    tmp.ptr<float>(0)[i] = tmp.ptr<float>(0)[i - 1] + src.ptr<uchar>(0)[i];
  }

  for (int i = 1; i < src.rows; i++)  //第一列    
  {
            
    tmp.ptr<float>(i)[0] = tmp.ptr<float>(i - 1)[0] + src.ptr<uchar>(i)[0];
  }

  for (int i = 1; i < src.rows; i++)   //第i行    
  {
            
    for (int j = 1; j < src.cols; j++)   //第j列       
    {
            
      tmp.ptr<float>(i)[j] = tmp.ptr<float>(i)[j - 1] + tmp.ptr<float>(i - 1)[j] - tmp.ptr<float>(i - 1)[j - 1] + src.ptr<uchar>(i)[j];
    }
  }
  tmp.copyTo(integal_out);
}

效率提高一点，可以到50ms。

void IntegralI32(Mat src, Mat &integal_out)
{
            
  Mat tmp(src.size(), CV_32SC1, 0.0);

  tmp.ptr<int>(0)[0] = (int)src.ptr<uchar>(0)[0];
  for (int i = 1; i < src.cols; i++)   //第一行    
  {
            
    tmp.ptr<int>(0)[i] = tmp.ptr<int>(0)[i - 1] + src.ptr<uchar>(0)[i];
  }

  for (int i = 1; i < src.rows; i++)  //第一列    
  {
            
    tmp.ptr<int>(i)[0] = tmp.ptr<int>(i - 1)[0] + src.ptr<uchar>(i)[0];
  }

  for (int i = 1; i < src.rows; i++)   //第i行    
  {
            
    for (int j = 1; j < src.cols; j++)   //第j列       
    {
            
      tmp.ptr<int>(i)[j] = tmp.ptr<int>(i)[j - 1] + tmp.ptr<int>(i - 1)[j] - tmp.ptr<int>(i - 1)[j - 1] + src.ptr<uchar>(i)[j];
    }
  }
  tmp.copyTo(integal_out);
}

效率再提高一点，可以到35ms。

void IntegralI32_Ex(unsigned char *Src, int *Integral, int Width, int Height, int Stride)
{
            
  int *ColSum = (int *)calloc(Width, sizeof(int)); 
  memset(Integral, 0, (Width + 1) * sizeof(int));
  for (int Y = 0; Y < Height; Y++)
  {
            
    unsigned char *LinePS = Src + Y * Stride;
    int *LinePL = Integral + Y * (Width + 1) + 1;
    int *LinePD = Integral + (Y + 1) * (Width + 1) + 1;
    LinePD[-1] = 0;
    for (int X = 0; X < Width; X++)
    {
            
      ColSum[X] += LinePS[X];
      //这句话可以优化，一次处理4个像素
      LinePD[X] = LinePD[X - 1] + ColSum[X];
    }
  }
  free(ColSum);
}

效率再提高一点，可以到13ms。

void GetGrayIntegralImageSSE(unsigned char *Src, int *Integral, int Width, int Height, int Stride)
{
            
  memset(Integral, 0, (Width + 1) * sizeof(int));            //    第一行都为0
  int BlockSize = 8, Block = Width / BlockSize;
  for (int Y = 0; Y < Height; Y++)
  {
            
    unsigned char *LinePS = Src + Y * Stride;
    int *LinePL = Integral + Y * (Width + 1) + 1;                //    上一行位置            
    int *LinePD = Integral + (Y + 1) * (Width + 1) + 1;          //    当前位置，注意每行的第一列的值都为0
    LinePD[-1] = 0;
    __m128i PreV = _mm_setzero_si128();
    __m128i Zero = _mm_setzero_si128();
    for (int X = 0; X < Block * BlockSize; X += BlockSize)
    {
            
      __m128i Src_Shift0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(LinePS + X)), Zero);        //    A7 A6 A5 A4 A3 A2 A1 A0
      __m128i Src_Shift1 = _mm_slli_si128(Src_Shift0, 2);                                            //    A6 A5 A4 A3 A2 A1 A0 0     
      __m128i Src_Shift2 = _mm_slli_si128(Src_Shift1, 2);                                            //    A5 A4 A3 A2 A1 A0 0  0
      __m128i Src_Shift3 = _mm_slli_si128(Src_Shift2, 2);                                            //    A4 A3 A2 A1 A0 0  0  0
      __m128i Shift_Add12 = _mm_add_epi16(Src_Shift1, Src_Shift2);                                   //    A6+A5 A5+A4 A4+A3 A3+A2 A2+A1 A1+A0 A0+0  0+0
      __m128i Shift_Add03 = _mm_add_epi16(Src_Shift0, Src_Shift3);                                   //    A7+A4 A6+A3 A5+A2 A4+A1 A3+A0 A2+0  A1+0  A0+0    
      __m128i Low = _mm_add_epi16(Shift_Add12, Shift_Add03);                                         //    A7+A6+A5+A4 A6+A5+A4+A3 A5+A4+A3+A2 A4+A3+A2+A1 A3+A2+A1+A0 A2+A1+A0+0 A1+A0+0+0 A0+0+0+0
      __m128i High = _mm_add_epi32(_mm_unpackhi_epi16(Low, Zero), _mm_unpacklo_epi16(Low, Zero));    //    A7+A6+A5+A4+A3+A2+A1+A0  A6+A5+A4+A3+A2+A1+A0  A5+A4+A3+A2+A1+A0  A4+A3+A2+A1+A0
      __m128i SumL = _mm_loadu_si128((__m128i *)(LinePL + X + 0));
      __m128i SumH = _mm_loadu_si128((__m128i *)(LinePL + X + 4));
      SumL = _mm_add_epi32(SumL, PreV);
      SumL = _mm_add_epi32(SumL, _mm_unpacklo_epi16(Low, Zero));
      SumH = _mm_add_epi32(SumH, PreV);
      SumH = _mm_add_epi32(SumH, High);
      PreV = _mm_add_epi32(PreV, _mm_shuffle_epi32(High, _MM_SHUFFLE(3, 3, 3, 3)));
      _mm_storeu_si128((__m128i *)(LinePD + X + 0), SumL);
      _mm_storeu_si128((__m128i *)(LinePD + X + 4), SumH);
    }
    for (int X = Block * BlockSize, V = LinePD[X - 1] - LinePL[X - 1]; X < Width; X++)
    {
            
      V += LinePS[X];
      LinePD[X] = V + LinePL[X];
    }
  }
}

效率再提高一点，可以到8ms。

void GrayImageIntegral_AVX2(uint8_t* pSrc, uint32_t* pDst, uint32_t nWidth, uint32_t nHeight)
{
            
    //第一行赋值为0
    memset(pDst, 0, (nWidth + 1) * sizeof(int));

    int nStep = 16;
    int nBlock = nWidth / nStep;
    uint32_t x, y, t;

    for (y = 0; y < nHeight; y++)
    {
            
        //源图每行的首地址
        uint8_t* lineSrc = pSrc + y * nWidth;

        //积分图当前行的首地址+1
        uint32_t* lineDst = pDst + y * (nWidth + 1) + 1;

        //积分图下一行的首地址加1
        uint32_t* lineNewDst = pDst + (y + 1) * (nWidth + 1) + 1;

        //第一列赋值为0
        lineNewDst[-1] = 0;

        __m256i r1, r2, a, b, c, d, e, f1, f3, g1, g2, G1bc, h;
        __m256i m2, m3;
        __m256i m6 = _mm256_setzero_si256();
        __m256i m00 = _mm256_setzero_si256();
        for (x = 0; x < nBlock * nStep; x += nStep)
        {
            
            r1 = _mm256_loadu_si256((__m256i*)(lineSrc + x));
            r2 = _mm256_permute4x64_epi64(r1, 0xD8);
            a = _mm256_unpacklo_epi8(r2, m00);
            b = _mm256_slli_si256(a, 2);
            c = _mm256_slli_si256(b, 2);
            d = _mm256_slli_si256(c, 2);

            e = _mm256_add_epi16(_mm256_add_epi16(a, b), _mm256_add_epi16(c, d));

            f1 = _mm256_unpacklo_epi16(e, m00);
            f3 = _mm256_add_epi32(_mm256_unpackhi_epi16(e, m00), f1);

            g1 = _mm256_permute2x128_si256(f1, f3, 0x20);
            g2 = _mm256_permute2x128_si256(f1, f3, 0x31);

            m2 = _mm256_loadu_si256((__m256i*)(lineDst + x + 0));
            m3 = _mm256_loadu_si256((__m256i*)(lineDst + x + 8));

            //先写g1+m2+m6
            g1 = _mm256_add_epi32(_mm256_add_epi32(m2, m6), g1);
            _mm256_storeu_si256((__m256i*)(lineNewDst + x + 0), g1);

            //再写g2+m3+m6+g1广播
            G1bc = _mm256_set1_epi32(g1.m256i_u32[7]);
            h = _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(m3, m6), g2), G1bc);
            _mm256_storeu_si256((__m256i*)(lineNewDst + x + 8), h);   //A8 到 A15的累加值

            //更新最终结果
            m6 = _mm256_set1_epi32(h.m256i_u32[7]);
        }
        for (x = nBlock * nStep, t = lineNewDst[x - 1] - lineDst[x - 1]; x < nWidth; x++)
        {
            
            t += lineSrc[x];
            lineNewDst[x] = t + lineDst[x];
        }
    }
}

发现耗时和SSE接近，可能后面还要继续优化。

以上所有耗时，都是在笔记本上的耗时。换为台式机后，使用i7-10700处理器，对于4k*2k的图，SSE版本的耗时到2.5ms。

积分图的优化，SSE可能看起来还比较好懂，但AVX2就比较复杂了，这里附上之前写这个函数时做的一些笔记。

读取32个像素从中取到前16个像素,得到A
需要用到如下的函数
_mm256_loadu_si256 _mm256_permute4x64_epi64 _mm256_unpacklo_epi8

	0	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15
A(2)	A0	A1	A2	A3	A4	A5	A6	A7	A8	A9	A10	A11	A12	A13	A14	A15

一直向左移位，用到如下函数。
_mm256_slli_si256

	1	2	3	4	5	6	7	9	10	11	12	13	14	15
B(2)	A0	A1	A2	A3	A4	A5	A6	A8	A9	A10	A11	A12	A13	A14
C(2)	0	A0	A1	A2	A3	A4	A5	0	A8	A9	A10	A11	A12	A13
D(2)	0	0	A0	A1	A2	A3	A4	0	0	A8	A9	A10	A11	A12

移位结果相加,得到E
E(2)=A+B+C+D

E(2)
A0
A0+A1
A0+A1+A2
A0+A1+A2+A3
A1+A2+A3+A4
A2+A3+A4+A5
A3+A4+A5+A6
A4+A5+A6+A7
A8
A8+A9
A8+A9+A10
A8+A9+A10+A11
A9+A10+A11+A12
A10+A11+A12+A13
A11+A12+A13+A14
A12+A13+A14+A15

unpack后得到F1,F2,相加，得到F3
_mm_add_epi32 _mm_unpackhi_epi16(E(2)) _mm256_unpacklo_epi8(E(2))

F1(4)	F2(4)	F3(4)
A0	A1+A2+A3+A4	A0+A1+A2+A3+A4
A0+A1	A2+A3+A4+A5	A0+A1+A2+A3+A4+A5
A0+A1+A2	A3+A4+A5+A6	A0+A1+A2+A3+A4+A5+A6
A0+A1+A2+A3	A4+A5+A6+A7	A0+A1+A2+A3+A4+A5+A6+A7
A8	A9+A10+A11+A12	A8+A9+A10+A11+A12
A8+A9	A10+A11+A12+A13	A8+A9+A10+A11+A12+A13
A8+A9+A10	A11+A12+A13+A14	A8+A9+A10+A11+A12+A13+A14
A8+A9+A10+A11	A12+A13+A14+A15	A8+A9+A10+A11+A12+A13+A14+A15

F1,F3交叉，得到G1,G2
Permute2x128 (F1,F3)

G1(4)	G2(4)
A0	A8
A0+A1	A8+A9
A0+A1+A2	A8+A9+A10
A0+A1+A2+A3	A8+A9+A10+A11
A0+A1+A2+A3+A4	A8+A9+A10+A11+A12
A0+A1+A2+A3+A4+A5	A8+A9+A10+A11+A12+A13
A0+A1+A2+A3+A4+A5+A6	A8+A9+A10+A11+A12+A13+A14
A0+A1+A2+A3+A4+A5+A6+A7	A8+A9+A10+A11+A12+A13+A14+A15

G1广播，得到G1bc(4)
G1bc(4) _mm256_set1_epi32

G1bc(4)
A0+A1+A2+A3+A4+A5+A6+A7
A0+A1+A2+A3+A4+A5+A6+A7
A0+A1+A2+A3+A4+A5+A6+A7
A0+A1+A2+A3+A4+A5+A6+A7
A0+A1+A2+A3+A4+A5+A6+A7
A0+A1+A2+A3+A4+A5+A6+A7
A0+A1+A2+A3+A4+A5+A6+A7
A0+A1+A2+A3+A4+A5+A6+A7