Category: C/C++

2010-08-23 11:10:57

Topics: MMX and SSE optimization, the SSE instruction set, and assembly instructions

Now that we are more familiar with the instruction palette, let's take a look at our first example. In this example we add two four-element vectors using a C++ function and inline assembly. I'll start by showing you the source and then explain each step in detail.

// A 16-byte (128-bit) vector struct
struct Vector4
{    
        float x, y, z, w;        
};

// Add two constant vectors and return the resulting vector
Vector4 SSE_Add ( const Vector4 &Op_A, const Vector4 &Op_B )
{
        Vector4 Ret_Vector;

        __asm 
        {        
                MOV EAX, Op_A                             // Load pointers into CPU regs
                MOV EBX, Op_B

                MOVUPS XMM0, [EAX]                 // Move unaligned vectors to SSE regs
                MOVUPS XMM1, [EBX]

                ADDPS XMM0, XMM1                   // Add vector elements
                MOVUPS [Ret_Vector], XMM0      // Save the return vector
        }

        return Ret_Vector;
}
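
For comparison, here is a minimal intrinsics sketch of the same addition (the function name SSE_Add_Intrin is illustrative, not from the original tutorial); _mm_loadu_ps, _mm_add_ps and _mm_storeu_ps mirror the MOVUPS/ADDPS sequence above.

#include <xmmintrin.h>

Vector4 SSE_Add_Intrin(const Vector4 &Op_A, const Vector4 &Op_B)
{
        Vector4 Ret_Vector;
        __m128 a = _mm_loadu_ps(&Op_A.x);                // unaligned load of A
        __m128 b = _mm_loadu_ps(&Op_B.x);                // unaligned load of B
        _mm_storeu_ps(&Ret_Vector.x, _mm_add_ps(a, b));  // add and store the result
        return Ret_Vector;
}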

Here is a common 3D calculation: the cross product of two vectors, which yields a vector normal to both. It also demonstrates a useful way of shuffling register elements to achieve good performance.

// R.x = A.y * B.z - A.z * B.y
// R.y = A.z * B.x - A.x * B.z
// R.z = A.x * B.y - A.y * B.x

// Find the cross product of two constant vectors and return it.
Vector4 SSE_CrossProduct(const Vector4 &Op_A, const Vector4 &Op_B)
{
        Vector4 Ret_Vector;
        __asm 
        {
                MOV EAX, Op_A                              // Load pointers into CPU regs
                MOV EBX, Op_B

                MOVUPS XMM0, [EAX]                 // Move unaligned vectors to SSE regs
                MOVUPS XMM1, [EBX]    
                MOVAPS XMM2, XMM0               // Make a copy of vector A
                MOVAPS XMM3, XMM1               // Make a copy of vector B

                SHUFPS XMM0, XMM0, 0xC9       // 11 00 10 01  A -> (A.y, A.z, A.x, A.w)
                SHUFPS XMM1, XMM1, 0xD2       // 11 01 00 10  B -> (B.z, B.x, B.y, B.w)
                MULPS  XMM0, XMM1             // (A.y*B.z, A.z*B.x, A.x*B.y, ...)

                SHUFPS XMM2, XMM2, 0xD2       // 11 01 00 10  A copy -> (A.z, A.x, A.y, A.w)
                SHUFPS XMM3, XMM3, 0xC9       // 11 00 10 01  B copy -> (B.y, B.z, B.x, B.w)
                MULPS XMM2, XMM3              // (A.z*B.y, A.x*B.z, A.y*B.x, ...)

                SUBPS  XMM0, XMM2             // Subtract to get the cross product in (x, y, z) order
               
                MOVUPS [Ret_Vector], XMM0      // Save the return vector
        }
        return Ret_Vector;
}
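
The same cross product can be sketched with intrinsics (SSE_CrossProduct_Intrin is an assumed helper, not part of the original article); _mm_shuffle_ps with _MM_SHUFFLE(3,0,2,1) rotates a vector to (y, z, x, w) and _MM_SHUFFLE(3,1,0,2) rotates it to (z, x, y, w), the same permutations as the SHUFPS masks above.

Vector4 SSE_CrossProduct_Intrin(const Vector4 &Op_A, const Vector4 &Op_B)
{
        Vector4 Ret_Vector;
        __m128 a = _mm_loadu_ps(&Op_A.x);
        __m128 b = _mm_loadu_ps(&Op_B.x);
        __m128 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));   // (A.y, A.z, A.x, A.w)
        __m128 b_zxy = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2));   // (B.z, B.x, B.y, B.w)
        __m128 a_zxy = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2));   // (A.z, A.x, A.y, A.w)
        __m128 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1));   // (B.y, B.z, B.x, B.w)
        __m128 r = _mm_sub_ps(_mm_mul_ps(a_yzx, b_zxy), _mm_mul_ps(a_zxy, b_yzx));
        _mm_storeu_ps(&Ret_Vector.x, r);
        return Ret_Vector;
}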

Here is an example of multiplying a vector by a floating-point scalar. It uses an intrinsic function to initialize a 128-bit value.

// Multiply a constant vector by a constant scalar and return the result
Vector4 SSE_Multiply(const Vector4 &Op_A, const float &Op_B)
{
        Vector4 Ret_Vector;

        __m128 F = _mm_set1_ps(Op_B);            // Create a 128-bit value with Op_B in all four elements
        
        __asm 
        {        
                MOV EAX, Op_A                               // Load pointer into CPU reg

                MOVUPS XMM0, [EAX]                   // Move the vector  to an SSE reg       
                MULPS XMM0, F                               // Multiply vectors
                MOVUPS [Ret_Vector], XMM0       // Save the return vector
        }
        return Ret_Vector;
}

Data Alignment
Each float array processed by SSE instructions should be 16-byte aligned. A static array is declared with the __declspec(align(16)) keyword:
__declspec(align(16)) float m_fArray[ARRAY_SIZE];
A dynamic array should be allocated with the _aligned_malloc function:
m_fArray = (float*) _aligned_malloc(ARRAY_SIZE * sizeof(float), 16);
An array allocated with _aligned_malloc is released with the _aligned_free function:
_aligned_free(m_fArray);
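
A minimal sketch that puts the three snippets together (AlignedExample and the array size 16 are illustrative; the aligned pointers allow _mm_load_ps/_mm_store_ps instead of their unaligned counterparts):

#include <xmmintrin.h>
#include <malloc.h>

void AlignedExample()
{
    __declspec(align(16)) float stackArray[16];                          // 16-byte aligned static/stack array
    float* heapArray = (float*) _aligned_malloc(16 * sizeof(float), 16); // 16-byte aligned heap array

    for (int i = 0; i < 16; i++) { stackArray[i] = (float)i; heapArray[i] = 1.0f; }

    for (int i = 0; i < 16; i += 4)
    {
        __m128 s = _mm_load_ps(stackArray + i);                          // aligned loads are safe here
        __m128 h = _mm_load_ps(heapArray + i);
        _mm_store_ps(heapArray + i, _mm_add_ps(s, h));                   // aligned store back
    }

    _aligned_free(heapArray);                                            // matching release
}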

void CSSETestDlg::ComputeArrayCPlusPlusSSE(
          float* pArray1,                   // [in] first source array
          float* pArray2,                   // [in] second source array
          float* pResult,                   // [out] result array
          int nSize)                        // [in] size of all arrays
{
    int nLoop = nSize / 4;

    __m128 m1, m2, m3, m4;

    __m128* pSrc1 = (__m128*) pArray1;
    __m128* pSrc2 = (__m128*) pArray2;
    __m128* pDest = (__m128*) pResult;

    __m128 m0_5 = _mm_set_ps1(0.5f);        // m0_5[0, 1, 2, 3] = 0.5

    for ( int i = 0; i < nLoop; i++ )
    {
        m1 = _mm_mul_ps(*pSrc1, *pSrc1);        // m1 = *pSrc1 * *pSrc1
        m2 = _mm_mul_ps(*pSrc2, *pSrc2);        // m2 = *pSrc2 * *pSrc2
        m3 = _mm_add_ps(m1, m2);                // m3 = m1 + m2
        m4 = _mm_sqrt_ps(m3);                   // m4 = sqrt(m3)
        *pDest = _mm_add_ps(m4, m0_5);          // *pDest = m4 + 0.5

        pSrc1++;
        pSrc2++;
        pDest++;
    }
}

void CSSESampleDlg::OnBnClickedButtonSseC()
{
    __m128 coeff = _mm_set_ps1(2.8f);      // coeff[0, 1, 2, 3] = 2.8
    __m128 tmp;

    __m128 min128 = _mm_set_ps1(FLT_MAX);  // min128[0, 1, 2, 3] = FLT_MAX
    __m128 max128 = _mm_set_ps1(FLT_MIN);  // max128[0, 1, 2, 3] = FLT_MIN

    __m128* pSource = (__m128*) m_fInitialArray;
    __m128* pDest = (__m128*) m_fResultArray;

    for ( int i = 0; i < ARRAY_SIZE/4; i++ )
    {
        tmp = _mm_mul_ps(*pSource, coeff);      // tmp = *pSource * coeff
        *pDest = _mm_sqrt_ps(tmp);              // *pDest = sqrt(tmp)

        min128 = _mm_min_ps(*pDest, min128);
        max128 = _mm_max_ps(*pDest, max128);

        pSource++;
        pDest++;
    }

    // extract minimum and maximum values from min128 and max128
    union u
    {
        __m128 m;
        float f[4];
    } x;

    x.m = min128;
    m_fMin = min(x.f[0], min(x.f[1], min(x.f[2], x.f[3])));

    x.m = max128;
    m_fMax = max(x.f[0], max(x.f[1], max(x.f[2], x.f[3])));
}

movdqa: moves 128-bit data between memory and an XMM register (or between two XMM registers); the memory operand must be 16-byte aligned.
pandn: bitwise "AND NOT": dest = (NOT dest) AND src.
loop: decrements ECX and jumps back to the label while ECX is not zero.

Traverse:
// Unroll the loop four times per pass (the fourth unroll is a partial one),
// as much as the XMM registers can hold.

// load 64 bytes of pixel data (four XMM registers)
movdqa xmm0,[esi]
movdqa xmm1,[esi+16]
movdqa xmm2,[esi+32]
movdqa xmm3,[esi+48]

// compute 255 - pixel for every byte (xmm7 is assumed to hold all 1s)
pandn  xmm0, xmm7
pandn  xmm1, xmm7
pandn  xmm2, xmm7
pandn  xmm3, xmm7

// output the computed content
movdqa [edi], xmm0
movdqa [edi+16], xmm1
movdqa [edi+32], xmm2
movdqa [edi+48], xmm3

// traverse array
add esi,64
add edi, 64

// move on
loop Traverse;
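
For comparison, the same 255 - pixel pass can be sketched with SSE2 intrinsics (InvertBytes and its parameters are illustrative; the buffers are assumed 16-byte aligned and byteCount a multiple of 16):

#include <emmintrin.h>

void InvertBytes(const unsigned char* src, unsigned char* dst, int byteCount)
{
    __m128i ones = _mm_set1_epi8((char)0xFF);                     // plays the role of xmm7
    for (int i = 0; i < byteCount; i += 16)
    {
        __m128i p = _mm_load_si128((const __m128i*)(src + i));    // aligned 128-bit load
        // andnot: (NOT p) AND 0xFF == 255 - p for each byte
        _mm_store_si128((__m128i*)(dst + i), _mm_andnot_si128(p, ones));
    }
}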

MMX intrinsics (aligned load/store):

char a[N], b[N], c[N];
...
__m64 *av, *bv, *cv;
av = (__m64*)a; // assume 8-byte aligned
bv = (__m64*)b; // assume 8-byte aligned
cv = (__m64*)c; // assume 8-byte aligned
for (i = 0; i < N/8; i++)
  av[i] = _mm_add_pi8(bv[i], cv[i]);

SSE/SSE2 technology with 128-bit XMM registers (aligned load/store):

Back:
  movdqa xmm0, _b[ecx]
  paddb  xmm0, _c[ecx]
  movdqa _a[ecx], xmm0
  add    ecx, 16
  cmp    ecx, edi
  jl     Back

SSE2 intrinsics (aligned load/store):

char a[N], b[N], c[N];
...
__m128i *av, *bv, *cv;
av = (__m128i*)a; // assume 16-byte aligned
bv = (__m128i*)b; // assume 16-byte aligned
cv = (__m128i*)c; // assume 16-byte aligned
for (i = 0; i < N/16; i++)
  av[i] = _mm_add_epi8(bv[i], cv[i]);

SSE/SSE2 technology with 128-bit XMM registers (unaligned load/store):

Back:
  movdqu xmm0, _b[ecx]
  movdqu xmm1, _c[ecx]
  paddb  xmm0, xmm1
  movdqu _a[ecx], xmm0
  add    ecx, 16
  cmp    ecx, edi
  jl     Back

SSE2 intrinsics (unaligned load/store)

char a[N], b[N], c[N];
...
__m128i *av, *bv, *cv;
av = (__m128i*)a;
bv = (__m128i*)b;
cv = (__m128i*)c;
for (i = 0; i < N/16; i++)
{ __m128i br = _mm_loadu_si128(&bv[i]);
  __m128i cr = _mm_loadu_si128(&cv[i]);
  __m128i ar = _mm_add_epi8(br, cr);
  _mm_storeu_si128(&av[i], ar);
}

================================================================================

Problem: vectorize the following code with SSE

double a[N], b[N], c[N];
...
for (i = 0; i < N; i++)
  a[i] = b[i] + c[i];

SSE/SSE2 technology with 128-bit XMM registers (aligned load/store):

Back:
  movapd xmm0, _b[ecx]		|b1   |b0   |
  addpd  xmm0, _c[ecx]		|b1+c1|b0+c0|
  movapd _a[ecx], xmm0
  add    ecx, 16
  cmp    ecx, edi
  jl     Back

SSE2 intrinsics (aligned load/store):

double a[N], b[N], c[N];
...
__m128d *av, *bv, *cv;
av = (__m128d*)a; // assume 16-byte aligned
bv = (__m128d*)b; // assume 16-byte aligned
cv = (__m128d*)c; // assume 16-byte aligned
for (i = 0; i < N/2; i++)
  av[i] = _mm_add_pd(bv[i], cv[i]);

================================================================================

Problem: vectorize the following code with SSE

float a[N], b[N], x;
...
for (i = 0; i < N; i++)
  a[i] = b[i] + x;

SSE/SSE2 technology with 128-bit XMM registers (aligned load/store):

  movss  xmm0, _x		|0|0|0|x|
  shufps xmm0, xmm0, 0		|x|x|x|x|
Back:
  movaps xmm1, _b[ecx]		|b3  |b2  |b1  |b0  |
  addps  xmm1, xmm0		|b3+x|b2+x|b1+x|b0+x|
  movaps _a[ecx], xmm1
  add    ecx, 16
  cmp    ecx, edi
  jl     Back

Note: instead of shufps we can unpack twice:

  movss    xmm0, _x		|0|0|0|x|
  unpcklps xmm0, xmm0		|0|0|x|x|
  unpcklps xmm0, xmm0		|x|x|x|x|
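
The same broadcast written with intrinsics (a small sketch): _mm_set_ss places x in the low element and _mm_shuffle_ps with mask 0 replicates it, exactly like the movss + shufps pair; _mm_set1_ps does the whole job in one call.

__m128 xr = _mm_set_ss(x);        // |0|0|0|x|
xr = _mm_shuffle_ps(xr, xr, 0);   // |x|x|x|x|
// or simply: __m128 xr = _mm_set1_ps(x);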

SSE2 intrinsics (aligned load/store):

float a[N], b[N], x;
...
__m128 *av, *bv, xr;
av = (__m128*)a; // assume 16-byte aligned
bv = (__m128*)b; // assume 16-byte aligned
xr = _mm_set1_ps(x);
for (i = 0; i < N/4; i++)
  av[i] = _mm_add_ps(bv[i], xr);

================================================================================

Problem: scalar-expand (broadcast) the following variables with SSE

double x;
int n;

SSE/SSE2 technology with 128-bit XMM registers (aligned load/store):

  movsd     xmm0, _x		|0|x|
  unpcklpd  xmm0, xmm0		|x|x|
  movd      xmm1, _n		|0|0|0|n|
  punpckldq xmm1, xmm1		|0|0|n|n|
  punpckldq xmm1, xmm1		|n|n|n|n|

SSE2 intrinsics (aligned load/store):

__m128d xr = _mm_set1_pd(x);
__m128i nr = _mm_set1_epi32(n);

================================================================================

Problem: vectorize the following code with SSE

char a[N];
...
for (i = 0; i < N; i++)
  a[i] = i;

SSE/SSE2 technology with 128-bit XMM registers (aligned load/store):

  movdqa xmm0, _cnst$1		|15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0|
  movdqa xmm1, _cnst$2		|16|16|16|16|16|16|16|16|16|16|16|16|16|16|16|16|
Back:
  movdqa _a[ecx], xmm0
  paddb  xmm0, xmm1
  add    ecx, 16
  cmp    ecx, edi
  jl     Back

SSE2 intrinsics (aligned load/store):

char a[N];
...
__m128i *av = (__m128i*)a; // assume 16-byte aligned
__m128i iv = _mm_set_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
__m128i ci = _mm_set1_epi8(16);
for (i = 0; i < N/16; i++)
{ av[i] = iv;
  iv = _mm_add_epi8(iv, ci);
}

================================================================================

Problem: vectorize the following code with SSE

double a[N], x = 0.0;
...
for (i = 0; i < N; i++)
{
  x = x + 4;
  a[i] = 6 * x + 1;
}

SSE/SSE2 technology with 128-bit XMM registers (aligned load/store):

  movapd xmm0, _cnst$1		|49.0|25.0|
  movapd xmm1, _cnst$2		|48.0|48.0|
Back:
  movapd _a[ecx], xmm0
  addpd  xmm0, xmm1
  add    ecx, 16
  cmp    ecx, edi
  jl     Back

SSE2 intrinsics (aligned load/store):

double a[N], x = 0.0;
...
__m128d *av = (__m128d*)a; // assume 16-byte aligned
__m128d iv = _mm_set_pd(49.0, 25.0);
__m128d ci = _mm_set1_pd(48.0);
for (i = 0; i < N/2; i++)
{ av[i] = iv;
  iv = _mm_add_pd(iv, ci);
}

================================================================================

Problem: vectorize the following code with SSE

int a[N], x = 0;
...
for (i = 0; i < N; i++)
  x = x + a[i];

SSE/SSE2 technology with 128-bit XMM registers (aligned load/store):

  pxor    xmm0, xmm0		|0|0|0|0|
Back:
  movdqa  xmm1, _a[ecx]
  paddd   xmm0, xmm1
  add     ecx, 16
  cmp     ecx, edi
  jl      Back
  movdqa  xmm1, xmm0
  psrldq  xmm1, 8		|0  |0    |x3      |x2         |
  paddd   xmm0, xmm1            |x3 |x2   |x1+x3   |x0+x2      |
  movdqa  xmm1, xmm0
  psrldq  xmm1, 4               |0  |x3   |x2      |x1+x3      |
  paddd   xmm0, xmm1            |x3 |x2+x3|x1+x2+x3|x0+x1+x2+x3|


SSE2 intrinsics (aligned load/store):

int a[N], x = 0;
...
int xx[4];
__m128i *av = (__m128i*)a; // assume 16-byte aligned
__m128i xv = _mm_setzero_si128();
for (i = 0; i < N/4; i++)
  xv = _mm_add_epi32(xv, av[i]);
_mm_store_si128((__m128i*)xx, xv);
x = xx[0] + xx[1] + xx[2] + xx[3];
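
The shift-and-add reduction from the assembly version can also be written with intrinsics (a sketch; _mm_srli_si128 shifts right by bytes and _mm_cvtsi128_si32 extracts the low 32-bit lane), replacing the store and the four scalar adds:

xv = _mm_add_epi32(xv, _mm_srli_si128(xv, 8));   // fold the upper half onto the lower half
xv = _mm_add_epi32(xv, _mm_srli_si128(xv, 4));   // fold the remaining pair
x = _mm_cvtsi128_si32(xv);                       // the low lane now holds the total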

================================================================================

Problem: vectorize the following code with SSE

float a[N], b[N], x = 0.0;
...
for (i = 0; i < N; i++)
  x = x + a[i]*b[i];

SSE/SSE2 technology with 128-bit XMM registers (aligned load/store):

  xorps  xmm0, xmm0		|0|0|0|0|
Back:
  movaps xmm1, _a[ecx]
  mulps  xmm1, _b[ecx]
  addps  xmm0, xmm1
  add    ecx, 16
  cmp    ecx, edi
  jl     Back
  haddps xmm0, xmm0		|x3+x2      |x1+x0      |x3+x2      |x1+x0      |
  haddps xmm0, xmm0		|x3+x2+x1+x0|x3+x2+x1+x0|x3+x2+x1+x0|x3+x2+x1+x0|

SSE2 intrinsics (aligned load/store):

float a[N], b[N], x = 0.0;
...
float xx[4];
__m128 *av = (__m128*)a; // assume 16-byte aligned
__m128 *bv = (__m128*)b; // assume 16-byte aligned
__m128 xv = _mm_setzero_ps();
for (i = 0; i < N/4; i++)
  xv = _mm_add_ps(xv, _mm_mul_ps(av[i], bv[i]));
_mm_store_ps(xx, xv);
x = xx[0] + xx[1] + xx[2] + xx[3];
// or instead of the two lines above we can use a horizontal add:
xv = _mm_hadd_ps(xv, xv);
xv = _mm_hadd_ps(xv, xv);
_mm_store_ps(xx, xv);
x = xx[0];

================================================================================

Problem: vectorize the following code with SSE

double a[N], x = 0.0;
...
for (i = 0; i < N; i++)
  x = x + a[3*i];

SSE/SSE2 technology with 128-bit XMM registers (unaligned load/store):

  xorpd  xmm0, xmm0		|0|0|
Back:
  movsd  xmm1, _a[ecx]		|0      |a[3*i+0]|
  movhpd xmm1, _a[ecx+24]	|a[3*i+3]|a[3*i+0]|
  addpd  xmm0, xmm1
  add    ecx, 48
  cmp    ecx, edi
  jl     Back
  haddpd xmm0, xmm0		|x1+x0|x1+x0|

SSE2 intrinsics (unaligned load/store):

double a[N], x = 0.0;
...
double xx[2];
__m128d xv = _mm_setzero_pd();
__m128d t;
for (i = 0; i < N; i += 2)
{
  t = _mm_load_sd(a+3*i);         // need not be aligned
  t = _mm_loadh_pd(t, a+3*(i+1)); // need not be aligned
  xv = _mm_add_pd(xv, t);
}
_mm_store_pd(xx, xv);
x = xx[0] + xx[1];

================================================================================

Problem: vectorize the following code with SSE

float a[N], x = 0.0;
...
for (i = 0; i < N; i++)
  x = x + a[3*i];

SSE/SSE2 technology with 128-bit XMM registers (unaligned load/store):

  xorps    xmm0, xmm0		|0|0|0|0|
Back:
  movss    xmm4, _a[ecx]	|-       |-       |-       |a[3*i+0]|
  movss    xmm3, _a[ecx+12]	|-       |-       |-       |a[3*i+3]| 
  movss    xmm2, _a[ecx+24]	|-       |-       |-       |a[3*i+6]|
  movss    xmm1, _a[ecx+36]	|-       |-       |-       |a[3*i+9]|
  unpcklps xmm4, xmm2		|-       |-       |a[3*i+6]|a[3*i+0]|
  unpcklps xmm3, xmm1		|-       |-       |a[3*i+9]|a[3*i+3]|
  unpcklps xmm4, xmm3		|a[3*i+9]|a[3*i+6]|a[3*i+3]|a[3*i+0]|
  addps    xmm0, xmm4
  add      ecx, 48
  cmp      ecx, edi
  jl       Back
  haddps   xmm0, xmm0		|x3+x2      |x1+x0      |x3+x2      |x1+x0      |
  haddps   xmm0, xmm0		|x3+x2+x1+x0|x3+x2+x1+x0|x3+x2+x1+x0|x3+x2+x1+x0|

SSE2 intrinsics (unaligned load/store):

float a[N], x = 0.0;
...
float xx[4];
__m128 xv = _mm_setzero_ps();
__m128 t;
for (i = 0; i < N; i += 4)
{
  t = _mm_set_ps(a[3*i+9], a[3*i+6], a[3*i+3], a[3*i]);
  xv = _mm_add_ps(xv, t);
}
_mm_store_ps(xx, xv);
x = xx[0] + xx[1] + xx[2] + xx[3];

Note: performance may be poor, since this is not much better than scalar code!

================================================================================

Problem: vectorize the following code with SSE

float a[N], b[N], c[N];
...
for (i = 0; i < N; i++)
  if (a[i] > 0)
    a[i] = b[i] / c[i];

SSE/SSE2 technology with 128-bit XMM registers (aligned load/store):

Back:
  movaps   xmm0, _a[ecx]
  movaps   xmm2, _b[ecx]
  divps    xmm2, _c[ecx]	set  x[] = |b3/c3   |b2/c2   |b1/c1   |b0/c0   |
  xorps    xmm1, xmm1		set 0
  cmpltps  xmm1, xmm0		guards g[]=|a3>0    |a2>0    |a1>0    |a0>0    |
  movaps   xmm3, xmm1		copy guards g[]
  andnps   xmm3, xmm0		mask y[] = |!a3>0&a3|!a2>0&a2|!a1>0&a1|!a0>0&a0|
  andps    xmm2, xmm1		mask z[] = | a3>0&x3| a2>0&x2| a1>0&x1| a0>0&x0|
  orps     xmm3, xmm2		combine  = |y3|z3   |y2|z2   |y1|z1   |y0|z0   |
  movaps   _a[ecx], xmm3	store into a[]
  add      ecx, 16
  cmp      ecx, edi
  jl       Back

SSE2 intrinsics (aligned load/store):

float a[N], b[N], c[N];
...
__m128 *av = (__m128*)a; // assume 16-byte aligned
__m128 *bv = (__m128*)b; // assume 16-byte aligned
__m128 *cv = (__m128*)c; // assume 16-byte aligned
__m128 zeros = _mm_setzero_ps();
for (i = 0; i < N/4; i++)
{
  __m128 x = _mm_div_ps(bv[i], cv[i]);
  __m128 g = _mm_cmpgt_ps(av[i], zeros);   // guard: a[i] > 0
  __m128 y = _mm_andnot_ps(g, av[i]);
  __m128 z = _mm_and_ps(g, x);
  av[i] = _mm_or_ps(y, z);
}

This longer example mixes intrinsics with inline assembly: for each image row it computes the horizontal and vertical gradients (gx, gy), their magnitude, and an integer index that is later fed to an arctangent lookup table.

__m128 ma, mb, mr;
__m128 na, nb, nr;
__m128 gl, gr, gtt, gb;
__m128 gx, gy, sgx, sgy, sg, sqsg;
__m128 gn, gi;
__m128i gii;
__m128 gzero;

memset(gzero.m128_f32, 0, sizeof(float) * 4);

for (int i = 0; i < 4; i++)
{
    gn.m128_f32[i] = AtanLookupF32::NDOUBLE();
}

for (int s = 1 ; s < scnt ; ++s) {

    ImageArrayf& imgt = *imgScaled[s];

    imgt.fillShiftedImage(1, imgsa);
    imgt.fillShiftedImage(2, imgsb);

    for (int y = 1 ; y < (h - 1) ; ++y) {
        int x;
        for (x = 0 ; x < (w - 2) ; x += 4) {

            gl = _mm_load_ps(imgt[y] + x);
            gr = _mm_load_ps(imgsb[y] + x);
            gtt = _mm_load_ps(imgsa[y+1] + x);
            gb = _mm_load_ps(imgsa[y-1] + x);

            _asm
            {
                // x0 = right
                movaps xmm0, gr;

                // x1 = left
                movaps xmm1, gl;

                // x2 = top
                movaps xmm2, gtt;

                // x3 = bottom
                movaps xmm3, gb;

                // x0 = right - left = gx
                subps xmm0, xmm1;

                // x2 = top - bottom = gy
                subps xmm2, xmm3;

                // x4 = right
                movaps xmm4, gr;

                // x6 = top
                movaps xmm6, gtt;

                // x1 = left - right = -gx
                subps xmm1, xmm4;

                // x3 = bottom - top = -gy
                subps xmm3, xmm6;

                // x1 = |gx|
                maxps xmm1, xmm0;

                // x3 = |gy|
                maxps xmm3, xmm2;

                // gx = x0
                movaps gx, xmm0;

                // gy = x2
                movaps gy, xmm2;

                // x1 = |gx| + |gy|
                addps xmm1, xmm3;

                // x4 = gx
                movaps xmm4, xmm0;

                // x6 = gy
                movaps xmm6, xmm2;

                // x4 = gx^2
                mulps xmm4, xmm4;

                // x6 = gy^2
                mulps xmm6, xmm6;

                // x4 = gx^2 + gy^2
                addps xmm4, xmm6;

                // x4 = sqrt(gx^2 + gy^2)
                sqrtps xmm4, xmm4;

                // sqsg = x4
                movaps sqsg, xmm4;

                // x3 = |gy| / (|gx| + |gy|) = dy
                divps xmm3, xmm1;

                // x1 = n
                movaps xmm1, gn;

                // x3 = |dy| * n
                mulps xmm3, xmm1;

                // gi = |dy| * n
                movaps gi, xmm3;
            }

            _mm_store_ps(imggm[y] + x, sqsg);

            gx = _mm_cmpgt_ps(gx, gzero);
            gy = _mm_cmpgt_ps(gy, gzero);

            _mm_store_si128((__m128i*)(imggx[y] + x), *((__m128i*)&gx));
            _mm_store_si128((__m128i*)(imggy[y] + x), *((__m128i*)&gy));

            gii = _mm_cvtps_epi32(gi);
            _mm_store_si128((__m128i*)(imgsi[y] + x), gii);
        }
    }

    for (int y = 1 ; y < (h - 1) ; ++y) {
        for (int x = 1 ; x < (w - 1) ; x ++) {
            magnitudes[s]->At(x, y) = imggm[y][x-1];
            directions[s]->At(x, y) = AtanLookupF32::ValueDirect(imggy[y][x-1], imggx[y][x-1], imgsi[y][x-1]);
        }
    }
}