go源碼分析：strings包

時間 2019-12-01

標籤源碼分析 strings 简体版

原文原文鏈接

主要介紹strings包中的 strings.go/search.go/replace.go

strings.go中主要介紹Index函數，該函數尋找s中第一次出現substr的位置，找到時返回該位置，找不到時返回-1：

基本代碼如下：

  // Index returns the index of the first position in s at which substr
  // matches, or -1 when the scan finds none.
  // NOTE(review): the listing is abridged; the switch cases in front of
  // `n <= bytealg.MaxLen` are elided (`...`), and the bytealg helpers are
  // defined outside this excerpt, so their exact behavior is not shown here.
  1 func Index(s, substr string) int {
  2     n := len(substr)
  3     switch {
  4    　　...
  5     case n <= bytealg.MaxLen:
  6         // Use brute force when s and substr both are small
  7         if len(s) <= bytealg.MaxBruteForce {
  8             return bytealg.IndexString(s, substr)
  9         }
            // Otherwise look for the first byte of substr and verify the
            // whole substring at every hit.
 10         c := substr[0]
 11         i := 0
            // Only offsets inside t leave at least n bytes for a full match.
 12         t := s[:len(s)-n+1]
            // fails counts candidate offsets whose full comparison did not match.
 13         fails := 0
 14         for i < len(t) {
 15             if t[i] != c {
 16                 // IndexByte is faster than bytealg.IndexString, so use it as long as
 17                 // we're not getting lots of false positives.
 18                 o := IndexByte(t[i:], c)
 19                 if o < 0 {
 20                     return -1
 21                 }
 22                 i += o
 23             }
 24             if s[i:i+n] == substr {
 25                 return i
 26             }
 27             fails++
 28             i++
 29             // Switch to bytealg.IndexString when IndexByte produces too many false positives.
 30             if fails > bytealg.Cutover(i) {
 31                 r := bytealg.IndexString(s[i:], substr)
 32                 if r >= 0 {
                        // r is an offset into s[i:], so shift it back to an offset into s.
 33                     return r + i
 34                 }
 35                 return -1
 36             }
 37         }
 38         return -1
 39     }
        // Reached when no case above applied — presumably n > bytealg.MaxLen,
        // but the elided cases are not visible here (TODO confirm).
        // Same first-byte scan as above, with Rabin-Karp as the fallback.
 40     c := substr[0]
 41     i := 0
        // Only offsets inside t leave at least n bytes for a full match.
 42     t := s[:len(s)-n+1]
 43     fails := 0
 44     for i < len(t) {
 45         if t[i] != c {
 46             o := IndexByte(t[i:], c)
 47             if o < 0 {
 48                 return -1
 49             }
 50             i += o
 51         }
 52         if s[i:i+n] == substr {
 53             return i
 54         }
 55         i++
 56         fails++
            // 4+i>>4 parses as 4+(i>>4). Once fails reaches that threshold and
            // candidate offsets remain, finish the search with Rabin-Karp.
 57         if fails >= 4+i>>4 && i < len(t) {
 58             // See comment in ../bytes/bytes_generic.go.
 59             j := indexRabinKarp(s[i:], substr)
 60             if j < 0 {
 61                 return -1
 62             }
                // j is an offset into s[i:], so shift it back to an offset into s.
 63             return i + j
 64         }
 65     }
 66     return -1
 67 }
 68 
 69 
 // indexRabinKarp returns the index of the first window of s that equals
 // substr, located by comparing rolling hashes, or -1 if there is none.
 // The first loop reads s[0] through s[n-1] unconditionally, so s must be at
 // least as long as substr.
 70 func indexRabinKarp(s, substr string) int {
 71     // Rabin-Karp search
        // hashss is the hash of substr; pow is the factor applied to the byte
        // that leaves the window in the rolling update below.
 72     hashss, pow := hashStr(substr)
 73     n := len(substr)
        // h starts as the hash of the first window s[:n], built the same way
        // hashStr builds hashss.
 74     var h uint32
 75     for i := 0; i < n; i++ {
 76         h = h*primeRK + uint32(s[i])
 77     }
        // Equal hashes may be a collision, so confirm with a string comparison.
 78     if h == hashss && s[:n] == substr {
 79         return 0
 80     }
 81     for i := n; i < len(s); {
            // Slide the window one byte: scale the old hash, add the incoming
            // byte s[i], and remove the outgoing byte s[i-n] weighted by pow.
            // uint32 arithmetic wraps around on overflow.
 82         h *= primeRK
 83         h += uint32(s[i])
 84         h -= pow * uint32(s[i-n])
 85         i++
            // After the increment the current window is s[i-n:i].
 86         if h == hashss && s[i-n:i] == substr {
 87             return i - n
 88         }
 89     }
 90     return -1
 91 
 92 }
 93 
 // hashStr returns two values for sep: its hash, and primeRK raised to the
 // power len(sep). Both are computed in uint32 arithmetic, which wraps around
 // on overflow.
 94 func hashStr(sep string) (uint32, uint32) {
       // Polynomial hash: each step multiplies the running hash by primeRK and
       // adds the next byte of sep.
 95    hash := uint32(0)
 96    for i := 0; i < len(sep); i++ {
 97       hash = hash*primeRK + uint32(sep[i])
 98    }
       // Exponentiation by squaring: sq holds successive squares of primeRK and
       // pow collects the ones selected by the set bits of len(sep).
 99    var pow, sq uint32 = 1, primeRK
100    for i := len(sep); i > 0; i >>= 1 {
101       if i&1 != 0 {
102          pow *= sq
103       }
104       sq *= sq
105    }
106    return hash, pow
107 }

　　可以看到在substr較短的情況下使用了暴力匹配，否則先用IndexByte逐個尋找候選位置，當失敗次數過多時改用Rabin-Karp算法進行比較。以下對Rabin-Karp以及golang的實現方式進行簡要總結：

1. Rabin-Karp算法類似於暴力匹配。假設A爲s中即將與substr進行比較的子串，傳統方法需要將A中所有的字符與substr逐一比較來判斷二者是否相等，這太耗時了；

2. 爲此我們將上述比較替換成比較A與substr的hash值，這樣就將字符串匹配簡化爲整數的判等。爲了減少運算量，這裏希望一個長字符串的hash值能夠儘量短，但實際上不能期望長字符串有短hash值，並且當這個值很大的時候可能溢出，這個時候就需要用求餘來解決。

3. hash以及hash求餘都會有hash值碰撞的問題，因此最後要把hash值相等的兩個串再逐個字符比較一下，確認其相等。golang在實現上利用了uint32的大小限制，隱式地取了餘數（溢出時相當於對2^32取模）。

4. 這裏值得一提的是golang在hash方法的選擇上是很特殊的。它使用了一個特殊的hash函數（s[0]*pow(PrimeRK,n-1)+s[1]*pow(PrimeRK,n-2)+...+s[n-1]），使得 hash[i+1:i+n+1] = hash[i:i+n]*PrimeRK+s[i+n]-s[i]*pow(PrimeRK,n)，也就是說存在 hash[i+1:i+n+1]=f(hash[i:i+n]) 的關係，即A的下一個待匹配項的hash值可以迅速由A的hash值計算得到，大大加快了計算hash[i+1:i+n+1]的速度。

===未完待續======