[Golang] 源碼探究:strings

時間 2019-12-10

標籤 golang 源碼探究 strings 欄目 Go 简体版

原文原文鏈接

golang源碼探究-strings

Contains()

func Contains(s, substr string) bool
Contains()返回一個布爾值：若substr存在於s中，則返回true；不存在則返回false。

// Contains reports whether substr occurs anywhere inside s.
// It is a thin wrapper over Index: any non-negative position means "found".
func Contains(s, substr string) bool {
    pos := Index(s, substr)
    return pos >= 0
}

Index()

咱們再來看Index(),
func Index(s, substr string) int
Index()返回substr出現在原始string s 中的位置，如果s中沒有substr，則返回-1

// Index returns the index of the first instance of substr in s, or -1 if substr is not present in s.
//
// Trivial cases (empty needle, single-byte needle, equal lengths, needle longer
// than haystack) are answered directly. For needles of at most bytealg.MaxLen
// bytes the search alternates between IndexByte and bytealg.IndexString; for
// longer needles it alternates between IndexByte and indexRabinKarp.
func Index(s, substr string) int {
    n := len(substr) // n is the length of the needle in bytes
    switch {
    case n == 0:    // an empty needle is reported at position 0
        return 0
    case n == 1: 
        return IndexByte(s, substr[0]) // single-byte needle: delegate to IndexByte
    case n == len(s): // same length: the only possible match is the whole string
        if substr == s {
            return 0
        }
        return -1
    case n > len(s): // the needle is longer than the haystack, so it cannot occur
        return -1
    case n <= bytealg.MaxLen: // MaxLen is assigned in bytealg's init (63 or 31, depending on cpu.X86.HasAVX2)
        // Use brute force when s and substr both are small
        if len(s) <= bytealg.MaxBruteForce {   // MaxBruteForce is declared as const MaxBruteForce = 64
            return bytealg.IndexString(s, substr)
        }
        // n >= 2 here (n == 0 and n == 1 returned above), so substr[1] exists.
        c0 := substr[0]
        c1 := substr[1]
        i := 0
        // t is one past the last start position at which substr still fits in s.
        t := len(s) - n + 1
        fails := 0
        for i < t {
            if s[i] != c0 {
                // IndexByte is faster than bytealg.IndexString, so use it as long as
                // we're not getting lots of false positives.
                o := IndexByte(s[i:t], c0)
                if o < 0 {
                    return -1
                }
                // o is relative to s[i:t]; advance i to the c0 that was found.
                i += o
            }
            // s[i] == c0 at this point; check the second byte before the full comparison.
            if s[i+1] == c1 && s[i:i+n] == substr {
                return i
            }
            // The first byte matched but the needle did not: count it and move on.
            fails++
            i++
            // Switch to bytealg.IndexString when IndexByte produces too many false positives.
            if fails > bytealg.Cutover(i) {
                r := bytealg.IndexString(s[i:], substr)
                if r >= 0 {
                    // r is relative to s[i:], so shift it back into s.
                    return r + i
                }
                return -1
            }
        }
        return -1
    }
    // No case matched: the needle is longer than bytealg.MaxLen and shorter than s.
    c0 := substr[0]
    c1 := substr[1]
    i := 0
    // t is one past the last start position at which substr still fits in s.
    t := len(s) - n + 1
    fails := 0
    for i < t {
        if s[i] != c0 {
            // Jump to the next occurrence of the first byte within the usable range.
            o := IndexByte(s[i:t], c0)
            if o < 0 {
                return -1
            }
            i += o
        }
        // s[i] == c0 at this point; check the second byte before the full comparison.
        if s[i+1] == c1 && s[i:i+n] == substr {
            return i
        }
        i++
        fails++
        // >> binds tighter than +, so the threshold is 4 + (i >> 4), i.e. 4 + i/16.
        // Once that many candidates have failed, hand the rest of s to indexRabinKarp.
        if fails >= 4+i>>4 && i < t {
            // See comment in ../bytes/bytes_generic.go.
            j := indexRabinKarp(s[i:], substr)
            if j < 0 {
                return -1
            }
            // j is relative to s[i:], so shift it back into s.
            return i + j
        }
    }
    return -1
}

internal/bytealg中：
MaxLen is the maximum length of the string to be searched for (argument b) in Index.

main.go:5:2: use of internal package internal/bytealg not allowed

想看一下bytealg.MaxLen等於多少，可是go build後報錯，說internal/bytealg是internal包，不允許在外部使用

在internal包搜了一下MaxLen

如下，MaxLen與CPU有關。

package bytealg

import "internal/cpu"

// MaxBruteForce is the haystack-length cutoff consulted by strings.Index:
// when len(s) is at most this value, Index hands the search straight to IndexString.
const MaxBruteForce = 64

// init selects MaxLen once at start-up: 63 when cpu.X86.HasAVX2 is set,
// 31 otherwise.
func init() {
    switch {
    case cpu.X86.HasAVX2:
        MaxLen = 63
    default:
        MaxLen = 31
    }
}

cpu.X86.HasAVX2是個啥？來看一下cpu.X86.HasAVX2

// HasAVX requires both the cpuid_AVX bit in ecx1 and osSupportsAVX to be true.
X86.HasAVX = isSet(ecx1, cpuid_AVX) && osSupportsAVX

看一下isSet()

// isSet reports whether hwc and value share at least one set bit.
func isSet(hwc uint32, value uint32) bool {
    common := hwc & value
    return common > 0
}

// cpuid is implemented in cpu_x86.s.
// The declaration has no Go body; it takes eaxArg and ecxArg and returns the
// four values eax, ebx, ecx and edx.
// NOTE(review): presumably these mirror the input and output registers of the
// x86 CPUID instruction — confirm against cpu_x86.s.
func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32)

// Call cpuid with arguments (1, 0) and keep only the ecx and edx results.
// NOTE(review): presumably this is the standard feature-flag query — confirm.
_, _, ecx1, edx1 := cpuid(1, 0)

// Start pessimistic: AVX counts as OS-supported only if the check below passes.
osSupportsAVX := false
    // For XGETBV, OSXSAVE bit is required and sufficient.
    if X86.HasOSXSAVE {
        eax, _ := xgetbv()
        // Check if XMM and YMM registers have OS support.
        // Both bit 1 and bit 2 of the xgetbv result must be set.
        osSupportsAVX = isSet(eax, 1<<1) && isSet(eax, 1<<2)
    }

。。。涉及cpu硬件相關的了。。。。

回到strings.Index(),go on
雖然沒法直接打印bytealg.MaxLen，可是從case語句不難看出：如果substr的長度n不大於bytealg.MaxLen，則執行case n <= bytealg.MaxLen 分支；否則所有case都不滿足，跳出switch。
直接看case 幾種情況都不滿足時執行的塊

c0 := substr[0] // 獲取substr的第一個字符
    c1 := substr[1]    // 第二個
    i := 0 
    t := len(s) - n + 1  // t是s的長度減substr的長度 + 1  ，，，嘖嘖嘖
    fails := 0
    for i < t { // 進入循環 
        if s[i] != c0 { // 若是substr的頭 不等於 s[i] 
            o := IndexByte(s[i:t], c0) // 直接將substr的頭放在s的slice中判斷
            if o < 0 { // 不存在，直接返回-1
                return -1
            }
// substr的頭存在於 s[i]剩下的部分 。那直接看substr的第二個字符存不存在於 s[i+o]中，
            i += o 
        }
        if s[i+1] == c1 && s[i:i+n] == substr { // 恩，若是substr的頭等於s[i] 直接看substr的第二個字符等不等於s[i+1] 
// 而且判斷s的slices[i:i+n]和substr相等不，若是相等，那麼substr就存在於s中了，位置是 i
            return i
        }
// 其餘狀況，i++ ，在for的斷定條件下繼續循環。失敗次數+1，
        i++
        fails++
        if fails >= 4+i>>4 && i < t { 若是失敗次數大於等於 4+i 獲得二進制數右移四位而且i < t ，
// 得看一下indexrabinKarp 是個啥？
//不難看出，這塊是針對於失敗次數比較多的時候執行的。
            // See comment in ../bytes/bytes_generic.go.
            j := indexRabinKarp(s[i:], substr)
            if j < 0 {
                return -1
            }
            return i + j
        }
    }
// 若是循環執行完了，到這兒了，說明substr不存在於s中，返回-1
    return -1

indexRabinKarp()

看一下indexRabinKarp() 吧！
Rabin-Karp是個啥？查了一下。
原來rabinkarp是一種字符串查找算法，看看吧！想必你已經知道Rabin-Karp了，直接跳到下一個目錄吧。
Rabin-Karp Algorithm-WikiPedia

圖解Rabin-Karp字符串查找算法

// indexRabinKarp returns the byte offset of the first occurrence of substr in
// s, or -1 when there is none, using a rolling hash over windows of
// len(substr) bytes.
//
// The first len(substr) bytes of s are indexed unconditionally, so s must be
// at least as long as substr.
func indexRabinKarp(s, substr string) int {
    // target is the hash every window is compared against; pow is the factor
    // hashStr returns for removing the byte that leaves the window.
    target, pow := hashStr(substr)
    width := len(substr)

    // Hash the first window, s[0:width].
    var window uint32
    for k := 0; k < width; k++ {
        window = window*primeRK + uint32(s[k])
    }
    // A hash hit is only a hint; the string comparison confirms it.
    if window == target && s[:width] == substr {
        return 0
    }

    // Slide the window one byte at a time; end is the exclusive end of the window.
    for end := width + 1; end <= len(s); end++ {
        start := end - width
        // Bring in s[end-1] and drop s[start-1]. uint32 arithmetic wraps, so
        // the single expression equals the step-by-step update.
        window = window*primeRK + uint32(s[end-1]) - pow*uint32(s[start-1])
        if window == target && s[start:end] == substr {
            return start
        }
    }
    return -1
}

hashStr

hashStr()

// primeRK is the prime base used in Rabin-Karp algorithm.
const primeRK = 16777619

// hashStr returns the hash and the appropriate multiplicative
// factor for use in Rabin-Karp algorithm.
//
// The first result is the polynomial hash of sep in base primeRK, computed in
// wrapping uint32 arithmetic. The second result is primeRK raised to the power
// len(sep), also wrapping in uint32. For the empty string it returns (0, 1).
func hashStr(sep string) (uint32, uint32) {
    hash := uint32(0)
    for i := 0; i < len(sep); i++ {
        // Fold one byte into the uint32 hash per iteration:
        //   i = 0: hash = sep[0]
        //   i = 1: hash = sep[0]*primeRK + sep[1]
        //   ...
        hash = hash*primeRK + uint32(sep[i])
    }

    // Compute primeRK^len(sep) by square-and-multiply. i starts at len(sep)
    // and is halved each round (i >>= 1 is i = i >> 1). sq holds primeRK
    // raised to successive powers of two.
    var pow, sq uint32 = 1, primeRK
    for i := len(sep); i > 0; i >>= 1 {
        if i&1 != 0 {
            // The lowest bit of i is set, so this power of two is part of
            // len(sep): multiply it into the result.
            pow *= sq
        }
        sq *= sq // runs every round: move to the next power of two
    }
    // Trace for len(sep) = 4 (binary 100):
    //   i = 4: bit clear;                           sq = primeRK^2
    //   i = 2: bit clear;                           sq = primeRK^4
    //   i = 1: bit set, pow = primeRK^4;            sq = primeRK^8
    // Trace for len(sep) = 5 (binary 101):
    //   i = 5: bit set, pow = primeRK;              sq = primeRK^2
    //   i = 2: bit clear;                           sq = primeRK^4
    //   i = 1: bit set, pow = primeRK * primeRK^4 = primeRK^5
    // Either way pow ends up as primeRK^len(sep).

    return hash, pow
}

Rabin-Karp算法就是爲了避免逐個字符對文本s和substr進行比較，而嘗試一次性判斷二者是否相等。因此，我們需要一個好的哈希函數（hash function）。通過哈希函數，可以算出待匹配字符串substr的哈希值，然後將它和文本s中的切片s[x:y]的哈希值進行比較。
比如原字符串爲： AABAACAADAABAABA
我的substr爲：AABA
先算出substr AABA的hash，然後按照substr的長度算出原字符串開頭四個字符AABA的hash，比一下，結果倆hash相等，那麼再把字符串本身比一下，還相等，得到一個index 0。繼續，再算ABAA的hash比一下，不相等；繼續，再算BAAC的hash比一下，不相等；繼續，再算AACA的hash比一下，不相等。就這樣，如果目標字符串和原字符串切片的hash比較結果一致，還得再把目標字符串和原字符串切片的字符串值比一下，因爲不好的hash函數可能會有hash衝突。

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。