SIMD(Single Instruction Multiple Data),即单条指令处理多个数据,是在通用处理器上对程序性能进行优化的利器,尤其是在对大量数据进行相同操作时,更能凸显优势。在Intel X86家族中,SIMD主要由MMX/SSE/SSE2等指令集构成。memcpy函数即从源地址向目的地址复制一块数据,利用SIMD对其优化有很好的效果。如普通汇编指令 mov eax,ebx 一次能复制4个字节的数据(eax、ebx为32位寄存器),而MMX指令 movq mm1,mm2 一次可以复制8个字节的数据(mm1、mm2为64位的MMX寄存器),SSE2指令 movdqa xmm1,xmm2 一次则可以复制16个字节!
函数原型:
void *memcpy( void *dest, const void *src, size_t count );
由于内存分页机制,在操作系统内核中,地址是否按16字节或8字节等对齐,对数据存取操作的性能有很大影响,未对齐时进行数据复制很浪费时间。如SSE指令movdqa是要求16字节对齐的移动操作,movdqu是未对齐时的移动操作,虽然两者功能相同,但movdqa的指令周期要远远小于movdqu的指令周期。所以在优化前要先检查目的地址和源地址是否能够对齐,对齐以后再使用SSE指令。注意是“能够”对齐:可能分配时还没有对齐,但先从源地址向目的地址复制几个字节以后就对齐了。比如源地址为 253,目的地址为 1021,虽然都没有按照16字节对齐,但先把 253、254、255 地址的数据分别复制到 1021、1022、1023 地址处以后,剩下的地址(256 和 1024)就是16字节对齐的。因此在复制前应当先检查地址是不是能够以16字节对齐,不行的话再检查是不是可以按8字节对齐,如果8字节还不能对齐,再检查可不可以4字节对齐,如果都不行的话只能按照最普通的汇编指令处理了。因为16字节对齐可以用SSE指令集,8字节对齐可以用MMX指令集,4字节对齐可以用rep movsd指令,都不可以的话就只能 movsb 了,:)。
检测是否能够对齐,就是比较源地址和目的地址的低几位(二进制位)是否相等:如检测能否16字节对齐就比较低四位,检测能否8字节对齐就比较低三位,等等。
汇编语言 函数流程如下:
将地址移入到相应寄存器:
; Load the three stack arguments. The extra "+ 8" in each offset skips the
; two registers (esi, edi) that the full routine pushes before these loads.
mov edi,[esp + 8 + 4]; edi = dst (first argument)
mov esi,[esp + 8 + 8]; esi = src (second argument)
mov ecx,[esp + 8 + 12]; ecx = count (third argument)
计算count包含多少个16字节块,保存在ecx中,余下不足16字节的部分保存在eax中。如果16字节块的个数<=1,就不再做对齐判断,直接按照普通复制处理:
; Split count into whole 16-byte blocks (ecx) and leftover bytes (eax).
mov eax,ecx
and eax,15 ; eax = count mod 16 (bytes that do not fill a 16-byte block)
shr ecx,04H ; ecx = count / 16 (number of whole 16-byte blocks)
cmp ecx,1 ; with at most one whole block, skip the alignment checks
jg nextt ; two or more blocks: go and test whether src/dst can be aligned
jmp normal_copy2 ; otherwise copy everything byte by byte
如果16字节块的个数>1,再检测能否16字节对齐:如果不能对齐,就转入是否能够8字节对齐的判断;如果可以对齐,就将对齐前需要先移动的字节数保存在edx中,最后再复制的字节数保存在eax中。比如源地址为253,目的地址为1021,要复制30个字节的话,对齐操作前需要复制3个字节,然后可以按16字节地址对齐复制16个字节,然后还剩11个字节就只能按照普通复制一字节一字节地操作了,这样edx保存3,eax保存11(注意:按上面的判断,30个字节只包含1个完整的16字节块,实际会直接转到normal_copy2;这里的数字仅用来说明edx、eax的含义):
; Decide whether src and dst share the same offset within a 16-byte block and,
; if so, split count into head bytes (edx), 16-byte blocks (ecx) and tail bytes (eax).
nextt:
push esi ; save src: esi is about to be reduced to its low four bits
push edi ; save dst for the same reason
and esi,15 ; esi = src mod 16
and edi,15 ; edi = dst mod 16
cmp esi,edi
jne normal_mmx ; offsets differ, so both can never be 16-byte aligned together: try 8-byte alignment
xor edx,edx
mov edx,16
sub edx,esi ; edx = 16 - (src mod 16): head bytes to copy before the aligned loop (1..16)
cmp eax,edx ; are there more leftover bytes than the head needs?
jg not_add_sse2 ; yes: the head is paid for out of the leftover bytes alone
add eax,16 ; no: borrow one whole 16-byte block ...
dec ecx ; ... so that head + 16*blocks + tail still equals count
not_add_sse2:
sub eax,edx ; eax = tail bytes left after the aligned loop (may be zero)
pop edi ; restore the real dst
pop esi ; restore the real src; now edx = head byte count, eax = tail byte count, ecx = 16-byte block count
push ecx ; park the block count: ecx is needed as the rep counter for the head copy
处理对齐前需要移动的数据:
loop_up_sse2: ; copy the head bytes one at a time so that esi/edi reach a 16-byte boundary
mov ecx,edx ; ecx = head byte count computed earlier
rep movsb ; copy ecx bytes from [esi] to [edi], advancing both pointers
pop ecx ; ecx = 16-byte block count that was pushed before this label
通过SSE指令处理对齐的数据,然后查看eax是否为零:如果为零,就不需要再处理剩余的尾部数据,复制完毕:
loop_fast_sse2: ; aligned bulk copy: one 16-byte block per iteration, ecx = blocks remaining
movdqa xmm1,[esi] ; aligned 16-byte load from src
lea esi,[esi + 16] ; advance src without touching the flags
movdqa [edi],xmm1 ; aligned 16-byte store to dst
lea edi,[edi + 16] ; advance dst
dec ecx
jnz loop_fast_sse2
cmp eax, 0 ; any tail bytes left?
jne loop_down_sse2 ; yes: copy them byte by byte
jmp done_align ; no: the copy is complete
处理剩下的尾数据:
loop_down_sse2: ; copy the tail bytes that follow the last aligned block
mov ecx,eax ; ecx = tail byte count
rep movsb
jmp dword done_align ; everything copied: go to the common exit
按照同样的思路,处理能否8字节对齐和能否4字节对齐的情况,并分别按照相应的指令集处理。
整个程序代码如下所示:
ALIGN 16
;-----------------------------------------------------------------------
; void *memcpy_sse2_align(void *dest, const void *src, size_t count)
;
; ABI:   32-bit; arguments are read from the stack at [esp+4], [esp+8]
;        and [esp+12] as seen on entry.
; Out:   eax = dest
; Saves: esi, edi
; Clobb: ecx, edx, xmm1, mm1, mm2, flags
; Notes: - Bytes are copied in ascending address order; the string
;          instructions assume the direction flag is clear.
;        - Every path that touches MMX registers executes EMMS before
;          returning, so the caller can use x87 code afterwards.
;
; Strategy (for count >= 32; shorter copies go straight to REP MOVSB):
;   src and dest agree modulo 16 -> byte head, MOVDQA blocks, byte tail
;   src and dest agree modulo 8  -> byte head, MOVQ pairs,    byte tail
;   src and dest agree modulo 4  -> byte head, REP MOVSD,     byte tail
;   otherwise                    -> MOVQ pairs on unaligned addresses,
;                                   byte tail
;
; In each aligned path the head is the 0..N-1 bytes needed to bring src
; (and therefore dest) up to the next N-byte boundary; the remaining
; length is then re-split from the original count, so no byte of the
; remainder can be dropped.
;-----------------------------------------------------------------------
memcpy_sse2_align:
        push    esi
        push    edi
        mov     edi, [esp + 8 + 4]      ; dest  (+8 skips the two pushes above)
        mov     esi, [esp + 8 + 8]      ; src
        mov     ecx, [esp + 8 + 12]     ; count
        mov     eax, ecx
        and     eax, 15                 ; eax = count mod 16
        shr     ecx, 4                  ; ecx = count / 16
        cmp     ecx, 1
        jbe     .copy_small             ; fewer than two whole blocks: plain byte copy

        mov     edx, esi
        xor     edx, edi                ; set bits mark where src and dest addresses differ
        test    edx, 15
        jz      .align16                ; same offset modulo 16
        test    edx, 7
        jz      .align8                 ; same offset modulo 8
        test    edx, 3
        jz      .align4                 ; same offset modulo 4
        jmp     .loop_mmx               ; no shared alignment: ecx/eax already hold blocks/tail

.align16:
        mov     edx, esi
        neg     edx
        and     edx, 15                 ; edx = head bytes up to the next 16-byte boundary (0..15)
        mov     ecx, edx
        rep movsb                       ; copy the head; esi and edi are now 16-byte aligned
        mov     ecx, [esp + 8 + 12]
        sub     ecx, edx                ; bytes left after the head (at least 17 here)
        mov     eax, ecx
        and     eax, 15                 ; eax = tail bytes
        shr     ecx, 4                  ; ecx = 16-byte blocks (at least 1)
.loop_sse2:
        movdqa  xmm1, [esi]
        movdqa  [edi], xmm1
        add     esi, 16
        add     edi, 16
        dec     ecx
        jnz     .loop_sse2
        jmp     .copy_tail

.align8:
        mov     edx, esi
        neg     edx
        and     edx, 7                  ; edx = head bytes up to the next 8-byte boundary (0..7)
        mov     ecx, edx
        rep movsb                       ; copy the head; esi and edi are now 8-byte aligned
        mov     ecx, [esp + 8 + 12]
        sub     ecx, edx                ; bytes left after the head (at least 25 here)
        mov     eax, ecx
        and     eax, 15                 ; eax = tail bytes (the loop below moves 16 per pass)
        shr     ecx, 4                  ; ecx = 16-byte blocks (at least 1)
.loop_mmx:                              ; also the entry point for the unaligned case
        movq    mm1, [esi]
        movq    mm2, [esi + 8]
        movq    [edi], mm1
        movq    [edi + 8], mm2
        add     esi, 16
        add     edi, 16
        dec     ecx
        jnz     .loop_mmx
        emms                            ; leave MMX state so x87 code works after we return
        jmp     .copy_tail

.align4:
        mov     edx, esi
        neg     edx
        and     edx, 3                  ; edx = head bytes up to the next 4-byte boundary (0..3)
        mov     ecx, edx
        rep movsb                       ; copy the head; esi and edi are now 4-byte aligned
        mov     ecx, [esp + 8 + 12]
        sub     ecx, edx                ; bytes left after the head
        mov     eax, ecx
        and     eax, 3                  ; eax = tail bytes
        shr     ecx, 2                  ; ecx = whole dwords
        rep movsd
        jmp     .copy_tail

.copy_small:
        shl     ecx, 4
        add     eax, ecx                ; eax = 16*blocks + remainder = count

.copy_tail:
        mov     ecx, eax                ; eax = bytes still to copy (may be zero)
        rep movsb

        mov     eax, [esp + 8 + 4]      ; return dest, as the C prototype requires
        pop     edi
        pop     esi
        ret
.endfunc
可以看出,在最后处理无法对齐的数据时,我们用了MMX指令集的movq,因为对比发现,处理同样的未对齐数据时它的速度是最快的。
如有转载请注明出处:孔祥文博客http://kswapd.cublog.cn