strlen的實現是經過4個字節4個字節進行枚舉,而後經過位運算來判斷這4個字節中是否有一個字節含有0,這樣的話,效率就提升了4倍。ios
這個效率提升是假設a&b&c&d與a&b有差很少效率的前提下。app
那用8字節8字節來偏移的話,是否是更快呢?32位機上不會,64位機上會提升一倍。由於a&b在64位下會提升一倍,由於32位的寄存器大小是32位的,對於分別MOV高位與低位兩次。ide
原本實驗a&b&c&d與a&b的速度的,經實驗驗證,這兩個效率確實是差很少的,而後去看彙編,看指令條數,在沒有使用-O優化下,指令的條數差異跟運算符號的個數的倍數相同,就讓我感到疑惑了。oop
下面附上實驗的代碼:優化
#include <iostream> #include <time.h> #include <cstdio> #include <string> using namespace std; int _strlen(const char *str) { const unsigned int *p = (const unsigned int *) str; unsigned int low = 0x01010101; unsigned int high = 0x80808080; while (true) { unsigned int d = *p++; if (((d - low) & ~d & high) != 0) { // handle [0...256) //if (((d - low) & high) != 0) { // handle [0...128) break; } } const char *q = (const char *)(p - 1); for (int i = 0; i < (int)sizeof(unsigned int); i++) { if (q[i] == 0) { return q - str + i; } } return -1; } int _strlen2(const char *str) { const char *p = str; while (*p != 0) { p++; } return p - str; } int _strlen3(const char *str) { const unsigned long long *p = (const unsigned long long *) str; unsigned long long low = 0x0101010101010101; unsigned long long high = 0x8080808080808080; while (true) { unsigned long long d = *p++; if (((d - low) & ~d & high) != 0) { // handle [0...256) //if (((d - low) & high) != 0) { // handle [0...128) break; } } const char *q = (const char *)(p - 1); for (int i = 0; i < (int)sizeof(unsigned long long); i++) { if (q[i] == 0) { return q - str + i; } } return -1; } size_t _strlen4(const char *str) { const char *char_ptr; const unsigned long int *longword_ptr; unsigned long int longword, himagic, lomagic; /* Handle the first few characters by reading one character at a time. Do this until CHAR_PTR is aligned on a longword boundary. */ for (char_ptr = str; ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0; ++char_ptr) if (*char_ptr == '\0') return char_ptr - str; /* All these elucidatory comments refer to 4-byte longwords, but the theory applies equally well to 8-byte longwords. */ longword_ptr = (unsigned long int *) char_ptr; /* Bits 31, 24, 16, and 8 of this number are zero. Call these bits the "holes." Note that there is a hole just to the left of each byte, with an extra at the end: bits: 01111110 11111110 11111110 11111111 bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD The 1-bits make sure that carries propagate to the next 0-bit. The 0-bits provide holes for carries to fall into. */ himagic = 0x80808080L; lomagic = 0x01010101L; if (sizeof (longword) > 4) { /* 64-bit version of the magic. */ /* Do the shift in two steps to avoid a warning if long has 32 bits. */ himagic = ((himagic << 16) << 16) | himagic; lomagic = ((lomagic << 16) << 16) | lomagic; } /*j if (sizeof (longword) > 8) abort (); */ /* Instead of the traditional loop which tests each character, we will test a longword at a time. The tricky part is testing if *any of the four* bytes in the longword in question are zero. */ for (;;) { longword = *longword_ptr++; if (((longword - lomagic) & ~longword & himagic) != 0) { /* Which of the bytes was the zero? If none of them were, it was a misfire; continue the search. */ const char *cp = (const char *) (longword_ptr - 1); if (cp[0] == 0) return cp - str; if (cp[1] == 0) return cp - str + 1; if (cp[2] == 0) return cp - str + 2; if (cp[3] == 0) return cp - str + 3; if (sizeof (longword) > 4) { if (cp[4] == 0) return cp - str + 4; if (cp[5] == 0) return cp - str + 5; if (cp[6] == 0) return cp - str + 6; if (cp[7] == 0) return cp - str + 7; } } } } string gen_data() { string a; for (int i = 0; i < 100000; i++) { a.push_back('a'); } return a; } double get_run_time(int(*fp)(const char *), const char *str, int count) { clock_t start = clock(); for (int i = 0; i < count; i++) { fp(str); } clock_t end = clock(); return (double)(end - start) / CLOCKS_PER_SEC; } double get_run_time(size_t(*fp)(const char *), const char *str, int count) { clock_t start = clock(); for (int i = 0; i < count; i++) { fp(str); } clock_t end = clock(); return (double)(end - start) / CLOCKS_PER_SEC; } int main() { string a = gen_data(); printf("%d\n", _strlen(a.c_str())); printf("%d\n", _strlen2(a.c_str())); printf("%d\n", _strlen3(a.c_str())); printf("%d\n", (int)strlen(a.c_str())); double time = get_run_time(&_strlen, a.c_str(), 10000); printf("%f\n", time); double time2 = get_run_time(&_strlen2, a.c_str(), 10000); printf("%f\n", time2); double time3 = get_run_time(&_strlen3, a.c_str(), 10000); printf("%f\n", time3); double time4 = get_run_time(&strlen, a.c_str(), 10000); printf("%f\n", time4); double time5 = get_run_time(&_strlen4, a.c_str(), 10000); printf("%f\n", time5); }