轉自:http://www.aiuxian.com/article/p-1309055.html
偶爾看到一個說法:小內存的拷貝,使用等號直接賦值比memcpy快得多。結合自己蒐集到的資料,整理成此文。
事實:strcpy等函數是逐字節拷貝的,而memcpy是按照機器字長逐字進行拷貝的,一個字等於4(32位機)或8(64位機)個字節。CPU存取一個字節和存取一個字一樣,都是在一條指令、一個內存週期內完成的。顯然,按字拷貝效率更高。
先給出一個程序:
01 |
#include <stdio.h> |
02 |
#define TESTSIZE 128 |
03 |
/* Wrapper whose only member is a TESTSIZE-byte char array, so that one
 * struct assignment copies the whole array. */
struct node { |
04 |
char buf[TESTSIZE]; |
05 |
}; |
06 |
/* Demo program: copies the zero-initialized array src into dst with a
 * single struct assignment, so the generated code can be inspected. */
int main() |
07 |
{ |
08 |
char src[TESTSIZE] = {0}; |
09 |
char dst[TESTSIZE]; |
10 |
/* View both char arrays as struct node and assign one to the other;
 * this is the statement quoted in the disassembly listing. */
*( struct node*)dst = *( struct node*)src; |
11 |
} |
編譯:gcc -g -o test test.c
得到彙編:objdump -S test
可以看到有這麼一些彙編,對應的是等號賦值操作:
*(struct node*)dst = *(struct node*)src;
4004b6: 48 8d 85 00 ff ff ff lea 0xffffffffffffff00(%rbp),%rax
4004bd: 48 8d 55 80 lea 0xffffffffffffff80(%rbp),%rdx
4004c1: 48 8b 0a mov (%rdx),%rcx
4004c4: 48 89 08 mov %rcx,(%rax)
4004c7: 48 8b 4a 08 mov 0x8(%rdx),%rcx
4004cb: 48 89 48 08 mov %rcx,0x8(%rax)
4004cf: 48 8b 4a 10 mov 0x10(%rdx),%rcx
4004d3: 48 89 48 10 mov %rcx,0x10(%rax)
4004d7: 48 8b 4a 18 mov 0x18(%rdx),%rcx
4004db: 48 89 48 18 mov %rcx,0x18(%rax)
4004df: 48 8b 4a 20 mov 0x20(%rdx),%rcx
4004e3: 48 89 48 20 mov %rcx,0x20(%rax)
4004e7: 48 8b 4a 28 mov 0x28(%rdx),%rcx
4004eb: 48 89 48 28 mov %rcx,0x28(%rax)
4004ef: 48 8b 4a 30 mov 0x30(%rdx),%rcx
4004f3: 48 89 48 30 mov %rcx,0x30(%rax)
4004f7: 48 8b 4a 38 mov 0x38(%rdx),%rcx
4004fb: 48 89 48 38 mov %rcx,0x38(%rax)
4004ff: 48 8b 4a 40 mov 0x40(%rdx),%rcx
400503: 48 89 48 40 mov %rcx,0x40(%rax)
400507: 48 8b 4a 48 mov 0x48(%rdx),%rcx
40050b: 48 89 48 48 mov %rcx,0x48(%rax)
40050f: 48 8b 4a 50 mov 0x50(%rdx),%rcx
400513: 48 89 48 50 mov %rcx,0x50(%rax)
400517: 48 8b 4a 58 mov 0x58(%rdx),%rcx
40051b: 48 89 48 58 mov %rcx,0x58(%rax)
40051f: 48 8b 4a 60 mov 0x60(%rdx),%rcx
400523: 48 89 48 60 mov %rcx,0x60(%rax)
400527: 48 8b 4a 68 mov 0x68(%rdx),%rcx
40052b: 48 89 48 68 mov %rcx,0x68(%rax)
40052f: 48 8b 4a 70 mov 0x70(%rdx),%rcx
400533: 48 89 48 70 mov %rcx,0x70(%rax)
400537: 48 8b 52 78 mov 0x78(%rdx),%rdx
40053b: 48 89 50 78 mov %rdx,0x78(%rax)
得到libc的memcpy彙編代碼:objdump -S /lib/libc.so.6
00973a30 <memcpy>:
973a30: 8b 4c 24 0c mov 0xc(%esp),%ecx
973a34: 89 f8 mov %edi,%eax
973a36: 8b 7c 24 04 mov 0x4(%esp),%edi
973a3a: 89 f2 mov %esi,%edx
973a3c: 8b 74 24 08 mov 0x8(%esp),%esi
973a40: fc cld
973a41: d1 e9 shr %ecx
973a43: 73 01 jae 973a46 <memcpy+0x16>
973a45: a4 movsb %ds:(%esi),%es:(%edi)
973a46: d1 e9 shr %ecx
973a48: 73 02 jae 973a4c <memcpy+0x1c>
973a4a: 66 a5 movsw %ds:(%esi),%es:(%edi)
973a4c: f3 a5 rep movsl %ds:(%esi),%es:(%edi)
973a4e: 89 c7 mov %eax,%edi
973a50: 89 d6 mov %edx,%esi
973a52: 8b 44 24 04 mov 0x4(%esp),%eax
973a56: c3 ret
973a57: 90 nop
001 |
#include <stdio.h> |
002 |
#include <string.h> |
003 |
#include <stdlib.h> |
004 |
#include <sys/time.h> |
005 |
006 |
#define LEN 0x20000 |
007 |
#define MYM 1 |
008 |
#define LIBM 0 |
009 |
char *dst; |
010 |
char *src; |
011 |
/* Copy unit used by mymemcpy: one struct assignment moves sizeof(DATA_SIZE)
 * bytes (16 ints) at a time. */
typedef struct memcpy_data_size
{
    int a[16];
} DATA_SIZE, *P_DATA_SIZE;

/*
 * Copy `size` bytes from `from` to `to` and return `to`.
 *
 * Whole DATA_SIZE blocks are copied by struct assignment; the bytes that do
 * not fill a block are then copied one at a time.  The regions must not
 * overlap.
 *
 * NOTE(review): the block copy accesses both buffers through DATA_SIZE
 * lvalues, so callers are assumed to pass storage suitably aligned for int
 * (e.g. memory returned by malloc) -- confirm for any other caller.
 */
void *mymemcpy(void *to, const void *from, size_t size)
{
    P_DATA_SIZE dst = (P_DATA_SIZE)to;
    const DATA_SIZE *src = (const DATA_SIZE *)from;

    /* Keep the counts unsigned and exact: the previous "- 1" adjustments
     * dropped the last full block and misplaced the tail bytes. */
    size_t blocks = size / sizeof(DATA_SIZE);
    size_t remain = size % sizeof(DATA_SIZE);

    while (blocks > 0) {
        *dst++ = *src++;
        blocks--;
    }

    /* dst/src now point just past the last full block; finish the tail. */
    char *tail_dst = (char *)dst;
    const char *tail_src = (const char *)src;
    while (remain > 0) {
        remain--;
        tail_dst[remain] = tail_src[remain];
    }

    return to;
}
050 |
051 |
052 |
int main( int argc, char const * argv[]) |
053 |
{ |
054 |
int type = 0; |
055 |
struct timeval start, end; |
056 |
unsigned long diff; |
057 |
058 |
gettimeofday(&start, NULL); |
059 |
if (argc != 2){ |
060 |
printf ( "you should run it as : ./run 1(or 0)\n" ); |
061 |
printf ( "1: run my memcpy\n" ); |
062 |
printf ( "0: run lib memcpy\n" ); |
063 |
exit (0); |
064 |
} |
065 |
type = atoi (argv[1]); |
066 |
if (MYM != type && LIBM != type){ |
067 |
printf ( "you should run it as : ./run 1(or 0)\n" ); |
068 |
printf ( "1: run my memcpy\n" ); |
069 |
printf ( "0: run lib memcpy\n" ); |
070 |
exit (0); |
071 |
} |
072 |
073 |
dst = malloc ( sizeof ( char )*LEN); |
074 |
if (NULL == dst) { |
075 |
perror ( "dst malloc" ); |
076 |
exit (1); |
077 |
} |
078 |
079 |
src = malloc ( sizeof ( char )*LEN); |
080 |
if (NULL == src) { |
081 |
perror ( "src malloc" ); |
082 |
exit (1); |
083 |
} |
084 |
if (MYM == type){ |
085 |
mymemcpy(dst, src, LEN); |
086 |
printf ( "my memcpy:\n" ); |
087 |
} |
088 |
else { |
089 |
memcpy (dst, src, LEN); |
090 |
printf ( "lib memcpy:\n" ); |
091 |
} |
092 |
free (dst); |
093 |
free (src); |
094 |
|
095 |
gettimeofday(&end, NULL); |
096 |
diff = 1000000*(end.tv_sec - start.tv_sec)+ end.tv_usec - start.tv_usec; |
097 |
printf ( "run time is %ld us\n" ,diff); |
098 |
|
099 |
return 0; |
100 |
} |
#!/bin/sh
# Run the benchmark five times with the hand-written copy (argument 1),
# then five times with the libc memcpy (argument 0).
./timememcpy 1
./timememcpy 1
./timememcpy 1
./timememcpy 1
./timememcpy 1
./timememcpy 0
./timememcpy 0
./timememcpy 0
./timememcpy 0
./timememcpy 0
[root@SPA c]# ./run.sh
my memcpy:
run time is 435 us
my memcpy:
run time is 237 us
my memcpy:
run time is 249 us
my memcpy:
run time is 304 us
my memcpy:
run time is 300 us
lib memcpy:
run time is 262 us
lib memcpy:
run time is 222 us
lib memcpy:
run time is 335 us
lib memcpy:
run time is 281 us
lib memcpy:
run time is 247 us
#!/bin/sh
# Same benchmark in the opposite order: five runs with the libc memcpy
# (argument 0) first, then five runs with the hand-written copy (argument 1).
./timememcpy 0
./timememcpy 0
./timememcpy 0
./timememcpy 0
./timememcpy 0
./timememcpy 1
./timememcpy 1
./timememcpy 1
./timememcpy 1
./timememcpy 1
[root@SPA c]# ./run.sh
lib memcpy:
run time is 479 us
lib memcpy:
run time is 461 us
lib memcpy:
run time is 512 us
lib memcpy:
run time is 405 us
lib memcpy:
run time is 365 us
my memcpy:
run time is 399 us
my memcpy:
run time is 314 us
my memcpy:
run time is 309 us
my memcpy:
run time is 510 us
my memcpy:
run time is 324 us