本文的copyleft归gfree.wind@gmail.com所有,使用GPL发布,可以自由拷贝,转载。但转载请保持文档的完整性,注明原作者及原链接,严禁用于任何商业用途。
作者:gfree.wind@gmail.com
博客:linuxfocus.blog.chinaunix.net
在上一篇博文中,我发现glibc中strcpy的效率居然比我自己写的例子要差,这实在让我感到惊奇。那么,为什么glibc中的实现效率反而更低呢?下面让我们反汇编这两个实现的代码。
第一个strcpy,是我写的例子。
- char* my_strcpy1(char *dest, const char *src) /* Copies src, terminating '\0' included, into dest and returns dest. */
- {
- char *d = dest; /* write cursor; dest itself stays untouched so it can be returned */
- register char c;
-
- do {
- c = *src++; /* read the next byte and advance the source pointer */
- *d++ = c; /* store it (the '\0' too) and advance the write cursor */
- } while ('\0' != c); /* tested after the store, so the terminator is copied as well */
- return dest;
- }
- 它对应的汇编代码如下:
- Dump of assembler code for function my_strcpy1:
-
0x08048394 <+0>: push %ebp
-
0x08048395 <+1>: mov %esp,%ebp
-
0x08048397 <+3>: push %ebx
-
0x08048398 <+4>: sub $0x10,%esp
-
0x0804839b <+7>: mov 0x8(%ebp),%eax
-
0x0804839e <+10>: mov %eax,-0x8(%ebp)
-
0x080483a1 <+13>: mov 0xc(%ebp),%eax
-
0x080483a4 <+16>: movzbl (%eax),%ebx
-
0x080483a7 <+19>: addl $0x1,0xc(%ebp)
-
0x080483ab <+23>: mov -0x8(%ebp),%eax
-
0x080483ae <+26>: mov %bl,(%eax)
-
0x080483b0 <+28>: addl $0x1,-0x8(%ebp)
-
0x080483b4 <+32>: test %bl,%bl
-
0x080483b6 <+34>: jne 0x80483a1
-
0x080483b8 <+36>: mov 0x8(%ebp),%eax
-
0x080483bb <+39>: add $0x10,%esp
-
0x080483be <+42>: pop %ebx
-
0x080483bf <+43>: pop %ebp
-
0x080483c0 <+44>: ret
-
End of assembler dump.
红色部分的汇编代码(即地址0x080483a1 <+13>到0x080483b6 <+34>之间的指令)为do {} while循环代码。
glibc中的strcpy的代码如下:
- /* Copy SRC to DEST, terminating '\0' included, and return DEST.
-    Only the source pointer s is advanced; the destination byte is addressed
-    as s[off], where off is the fixed distance from SRC to DEST minus one.
-    The function is written as an old-style (K&R) definition.
-    NOTE(review): dest - s has type ptrdiff_t and is narrowed to int here,
-    and the subtraction spans two unrelated objects, which ISO C leaves
-    undefined; confirm this is acceptable on the targets being measured. */
- char *
- my_strcpy2 (dest, src)
- char *dest;
- const char *src;
- {
- register char c;
- char * s = (char *)src;
- const int off = dest - s - 1; /* the -1 compensates for s++ running before the store */
- do
- {
- c = *s++;
- s[off] = c; /* s was already advanced, so s[off] is the matching byte of DEST */
- }
- while (c != '\0'); /* tested after the store, so the terminator is copied as well */
- return dest;
- }
对应的汇编代码如下:
- Dump of assembler code for function my_strcpy2:
-
0x080483c1 <+0>: push %ebp
-
0x080483c2 <+1>: mov %esp,%ebp
-
0x080483c4 <+3>: push %ebx
-
0x080483c5 <+4>: sub $0x10,%esp
-
0x080483c8 <+7>: mov 0xc(%ebp),%eax
-
0x080483cb <+10>: mov %eax,-0xc(%ebp)
-
0x080483ce <+13>: mov 0x8(%ebp),%edx
-
0x080483d1 <+16>: mov -0xc(%ebp),%eax
-
0x080483d4 <+19>: mov %edx,%ecx
-
0x080483d6 <+21>: sub %eax,%ecx
-
0x080483d8 <+23>: mov %ecx,%eax
-
0x080483da <+25>: sub $0x1,%eax
-
0x080483dd <+28>: mov %eax,-0x8(%ebp)
-
0x080483e0 <+31>: mov -0xc(%ebp),%eax
-
0x080483e3 <+34>: movzbl (%eax),%ebx
-
0x080483e6 <+37>: addl $0x1,-0xc(%ebp)
-
0x080483ea <+41>: mov -0x8(%ebp),%eax
-
0x080483ed <+44>: add -0xc(%ebp),%eax
-
0x080483f0 <+47>: mov %bl,(%eax)
-
0x080483f2 <+49>: test %bl,%bl
-
0x080483f4 <+51>: jne 0x80483e0
-
0x080483f6 <+53>: mov 0x8(%ebp),%eax
-
0x080483f9 <+56>: add $0x10,%esp
-
0x080483fc <+59>: pop %ebx
-
0x080483fd <+60>: pop %ebp
-
0x080483fe <+61>: ret
-
End of assembler dump.
这里的红色部分(即地址0x080483e0 <+31>到0x080483f4 <+51>之间的指令)同样是对应的do{}while循环代码。
两个实现对应的汇编代码基本相似,那么差异是否是由循环前面的代码引起的呢?my_strcpy2使用了offset,所以多了一些mov和sub操作。于是我再次更改了代码,在my_strcpy2中不再计算offset。
- #include <stdio.h>
-
#include <stdlib.h>
-
-
-
char* my_strcpy1(char *dest, const char *src) /* Copies src, terminating '\0' included, into dest and returns dest. */
-
{
-
char *d = dest; /* write cursor; dest itself stays untouched so it can be returned */
-
register char c;
-
-
do {
-
c = *src++; /* read the next byte and advance the source pointer */
-
*d++ = c; /* store it (the '\0' too) and advance the write cursor */
-
} while ('\0' != c); /* tested after the store, so the terminator is copied as well */
-
-
return dest;
-
}
-
-
-
int off; /* offset read by my_strcpy2 as s[off]; main sets it to buf - str1 - 1 before the timing loop. NOTE(review): a pointer difference is ptrdiff_t, which can be wider than int. */
-
-
/* Copy SRC to the location addressed through the file-scope offset `off`,
   terminating '\0' included, and return DEST.
   Unlike the earlier listing, the offset is not computed here: each byte is
   stored at s[off], so DEST is only used as the return value.  The caller
   must set `off` beforehand (main assigns buf - str1 - 1).
   The function is written as an old-style (K&R) definition. */
-
char *
-
my_strcpy2 (dest, src)
-
char *dest;
-
const char *src;
-
{
-
register char c;
-
char * s = (char *)src;
-
-
do
-
{
-
c = *s++;
-
s[off] = c; /* s was already advanced; off is read from the global on every iteration */
-
}
-
while (c != '\0'); /* tested after the store, so the terminator is copied as well */
-
-
return dest;
-
}
-
-
int main() /* Benchmark driver: copies the string "test1" into a stack buffer ten million times. */
-
{
-
const char *str1 = "test1";
-
char buf[100];
-
-
off = buf-str1-1; /* offset for my_strcpy2; the -1 matches its post-increment of s before the store */
-
-
int i;
-
for (i = 0; i < 10000000; ++i) {
-
my_strcpy1(buf, str1); /* NOTE(review): only my_strcpy1 is called as listed; presumably swapped to my_strcpy2 for the second timing, confirm */
-
}
-
-
return 0;
-
}
通过使用一个全局变量off,省去了my_strcpy2中offset的计算。但是结果仍然是my_strcpy1的效率更高。
my_strcpy1的时间约为0.147s,而my_strcpy2的时间为0.220s。再次查看汇编:
- (gdb) disassemble my_strcpy1
-
Dump of assembler code for function my_strcpy1:
-
0x08048394 <+0>: push %ebp
-
0x08048395 <+1>: mov %esp,%ebp
-
0x08048397 <+3>: push %ebx
-
0x08048398 <+4>: sub $0x10,%esp
-
0x0804839b <+7>: mov 0x8(%ebp),%eax
-
0x0804839e <+10>: mov %eax,-0x8(%ebp)
-
0x080483a1 <+13>: mov 0xc(%ebp),%eax
-
0x080483a4 <+16>: movzbl (%eax),%ebx
-
0x080483a7 <+19>: addl $0x1,0xc(%ebp)
-
0x080483ab <+23>: mov -0x8(%ebp),%eax
-
0x080483ae <+26>: mov %bl,(%eax)
-
0x080483b0 <+28>: addl $0x1,-0x8(%ebp)
-
0x080483b4 <+32>: test %bl,%bl
-
0x080483b6 <+34>: jne 0x80483a1
-
0x080483b8 <+36>: mov 0x8(%ebp),%eax
-
0x080483bb <+39>: add $0x10,%esp
-
0x080483be <+42>: pop %ebx
-
0x080483bf <+43>: pop %ebp
-
0x080483c0 <+44>: ret
-
End of assembler dump.
-
-
-
(gdb) disassemble my_strcpy2
-
Dump of assembler code for function my_strcpy2:
-
0x080483c1 <+0>: push %ebp
-
0x080483c2 <+1>: mov %esp,%ebp
-
0x080483c4 <+3>: push %ebx
-
0x080483c5 <+4>: sub $0x10,%esp
-
0x080483c8 <+7>: mov 0xc(%ebp),%eax
-
0x080483cb <+10>: mov %eax,-0x8(%ebp)
-
0x080483ce <+13>: mov -0x8(%ebp),%eax
-
0x080483d1 <+16>: movzbl (%eax),%ebx
-
0x080483d4 <+19>: addl $0x1,-0x8(%ebp)
-
0x080483d8 <+23>: mov 0x80496bc,%eax
-
0x080483dd <+28>: add -0x8(%ebp),%eax
-
0x080483e0 <+31>: mov %bl,(%eax)
-
0x080483e2 <+33>: test %bl,%bl
-
0x080483e4 <+35>: jne 0x80483ce
-
0x080483e6 <+37>: mov 0x8(%ebp),%eax
-
0x080483e9 <+40>: add $0x10,%esp
-
0x080483ec <+43>: pop %ebx
-
0x080483ed <+44>: pop %ebp
-
0x080483ee <+45>: ret
-
End of assembler dump.
现在效率仍然有区别,那么看来还是循环处出的问题。时间又晚了,下次再继续研究。
阅读(816) | 评论(0) | 转发(0) |