Redis 數據結構之dict(2)

本文及後續文章,Redis版本均是v3.2.8。

上篇文章《Redis 數據結構之dict》中,我們對dict的結構有了大致的印象。此篇文章對dict是如何維護數據結構的做個詳細的理解。

老規矩還是打開Redis的源碼,文件dict.c。

1、dict數據結構的維護函數

一、dictCreate - 創建一個新的哈希表

/* Reset a hash table already initialized with ht_init().
 * NOTE: This function should only be called by ht_destroy().
 *
 * Clears every field of 'ht': the bucket array pointer becomes NULL and
 * size, sizemask and used become 0. The previous 'table' pointer is simply
 * overwritten; nothing is freed here. */
static void _dictReset(dictht *ht)
{
    ht->table = NULL; /* no bucket array attached */
    ht->size = 0;
    ht->sizemask = 0;
    ht->used = 0;
}

 

/* Create a new hash table.
 *
 * Allocates the dict structure with zmalloc(), passes it to _dictInit()
 * together with 'type' and 'privDataPtr', and returns the new dict. */
dict *dictCreate(dictType *type,
        void *privDataPtr)
{
    dict *created;

    created = zmalloc(sizeof(*created)); /* allocate the dict header */
    _dictInit(created, type, privDataPtr); /* fill in the initial state */
    return created;
}

 

/* Initialize the hash table.
 *
 * Resets both internal tables (ht[0] and ht[1]), stores 'type' and
 * 'privDataPtr' in the dict, sets rehashidx to -1 and the iterator count
 * to 0. Always returns DICT_OK. */
int _dictInit(dict *d, dictType *type,
        void *privDataPtr)
{
    int i;

    for (i = 0; i < 2; i++)
        _dictReset(&d->ht[i]);

    d->iterators = 0;
    d->rehashidx = -1;
    d->privdata = privDataPtr;
    d->type = type;
    return DICT_OK;
}

 

從上述的代碼中,可以看出dictCreate爲dict的數據結構分配空間並爲各個變量賦初值。其中兩個哈希表ht[0]和ht[1]起始都沒有分配空間,table指針都賦爲NULL。這就說明要等第一個數據插入時纔會真正分配空間。

 

二、dictFind - dict查找

/* Look up 'key' in the dictionary.
 *
 * Returns the entry whose key is the same pointer as 'key' or compares
 * equal via dictCompareKeys(), or NULL when both tables hold no elements
 * or no entry matches. When the dict is rehashing, one rehash step is
 * attempted first and both ht[0] and ht[1] are searched; otherwise only
 * ht[0] is searched. */
dictEntry *dictFind(dict *d, const void *key)
{
    unsigned int hash, slot, t;
    dictEntry *cur;

    if (d->ht[0].used + d->ht[1].used == 0) return NULL; /* dict is empty */
    if (dictIsRehashing(d)) _dictRehashStep(d);

    hash = dictHashKey(d, key);
    for (t = 0; t <= 1; t++) {
        slot = hash & d->ht[t].sizemask;
        /* Walk the collision chain of the selected bucket. */
        for (cur = d->ht[t].table[slot]; cur != NULL; cur = cur->next) {
            if (key == cur->key || dictCompareKeys(d, key, cur->key))
                return cur;
        }
        /* The second table only matters while a rehash is in progress. */
        if (!dictIsRehashing(d)) break;
    }
    return NULL;
}

 

從上述的代碼中,dictFind主要是根據dict是否正在重哈希,進行以下操作:

  • 如果當前正在重哈希,那麼就調用_dictRehashStep(d)將重哈希推進一步【稍後再詳細看下實現】。

  • 調用dictHashKey,計算key的哈希值。

  • for循環最多執行兩輪,分別對應上面定義的兩個hash table。首先在第一個哈希表ht[0]上查找:在table數組上定位到哈希值所對應的位置(通過哈希值與sizemask進行按位與計算),然後在對應的dictEntry鏈表上查找。在遍歷dictEntry鏈表時,需要對key進行比較,即先判斷指針是否相同(key==he->key),否則調用dictCompareKeys(d, key, he->key),dictCompareKeys裏面的實現會調用keyCompare。如果找到就返回該項;否則,進行下一步。

  • 接下來判斷是否正在重哈希,如果沒有,那麼在ht[0]上找的結果就是最終的結果(如果沒有找到,就返回NULL);否則,執行第二次遍歷即在ht[1]上查找,過程與ht[0]一致。

 

三、dictAdd和dictReplace - dict插入

/* Add an element to the target hash table.
 *
 * Obtains a new entry for 'key' from dictAddRaw(). When that yields NULL
 * the function returns DICT_ERR; otherwise 'val' is stored in the entry
 * with dictSetVal() and DICT_OK is returned. */
int dictAdd(dict *d, void *key, void *val)
{
    dictEntry *added = dictAddRaw(d, key);

    if (!added)
        return DICT_ERR;

    dictSetVal(d, added, val);
    return DICT_OK;
}

 

/* Low level add. Instead of setting a value, this function links a new
 * entry for 'key' into the table and hands the dictEntry back to the
 * caller, who fills the value field as desired.
 *
 * It is also exposed directly in the user API, mainly so that non-pointer
 * values can be stored inside the hash value, for example:
 *
 * entry = dictAddRaw(dict,mykey);
 * if (entry != NULL) dictSetSignedIntegerVal(entry,1000);
 *
 * Return values:
 *
 * NULL when _dictKeyIndex() reports -1 (e.g. the key already exists).
 * Otherwise the freshly added hash entry, to be manipulated by the caller.
 */
dictEntry *dictAddRaw(dict *d, void *key)
{
    int slot;
    dictEntry *fresh;
    dictht *target;

    if (dictIsRehashing(d)) _dictRehashStep(d);

    /* Bucket index for the new element, or -1 if it cannot be added. */
    slot = _dictKeyIndex(d, key);
    if (slot == -1)
        return NULL;

    /* While rehashing, new entries go to the second table. */
    if (dictIsRehashing(d))
        target = &d->ht[1];
    else
        target = &d->ht[0];

    /* Allocate the entry and push it at the head of the bucket chain, on
     * the assumption that in a database system recently added entries are
     * more likely to be accessed frequently. */
    fresh = zmalloc(sizeof(*fresh));
    fresh->next = target->table[slot];
    target->table[slot] = fresh;
    target->used++;

    /* Set the hash entry fields. */
    dictSetKey(d, fresh, key);
    return fresh;
}

 

 

_dictKeyIndex

/* Returns the index of a slot that can be populated with a hash entry for
 * the given 'key'.
 *
 * -1 is returned when _dictExpandIfNeeded() reports DICT_ERR or when the
 * key is already present in a searched table.
 *
 * Note that while the hash table is being rehashed, the returned index is
 * always relative to the second (new) hash table. */
static int _dictKeyIndex(dict *d, const void *key)
{
    unsigned int hash, slot, t;
    dictEntry *cur;

    /* Expand the hash table if needed */
    if (_dictExpandIfNeeded(d) == DICT_ERR)
        return -1;

    /* Compute the key hash value */
    hash = dictHashKey(d, key);
    for (t = 0; t <= 1; t++) {
        slot = hash & d->ht[t].sizemask;
        /* Reject the key if this bucket's chain already contains it. */
        for (cur = d->ht[t].table[slot]; cur; cur = cur->next) {
            if (key == cur->key || dictCompareKeys(d, key, cur->key))
                return -1;
        }
        if (!dictIsRehashing(d)) break;
    }
    return slot;
}

 

 

/* Add an element, discarding the old value if the key already exists.
 *
 * Returns 1 if the key was added from scratch, 0 if an element with that
 * key was already present and dictReplace() only updated its value. */
int dictReplace(dict *d, void *key, void *val)
{
    dictEntry *existing, previous;

    /* Try to add the element. If the key does not exist dictAdd will
     * succeed. */
    if (dictAdd(d, key, val) == DICT_OK)
        return 1;

    /* It already exists, get the entry */
    existing = dictFind(d, key);

    /* Set the new value first and free the old one afterwards. The order
     * matters because the new value may be exactly the same object as the
     * previous one: with reference counting you want to increment (set)
     * and then decrement (free), not the reverse. A copy of the entry keeps
     * the old value reachable after it has been overwritten. */
    previous = *existing;
    dictSetVal(d, existing, val);
    dictFreeVal(d, &previous);
    return 0;
}

 

dictAdd和dictReplace都有插入的功能,它們又有何區別:

  • dictAdd插入新的一對key和value,如果key已經存在,則插入失敗。

  • dictReplace是在dictAdd的基礎上實現的。dictReplace也是插入一對key和value,不過在key存在的時候,它會更新value。這其實相當於兩次查找過程:dictAdd內部的一次查找,加上dictFind的一次查找。

從dictAdd和dictReplace的代碼和註釋,我們大致了解函數的實現過程和原理:

  • dictAdd和dictReplace也會調用_dictRehashStep(d),觸發推進一步重哈希。

  • 如果正在重哈希中,則會把數據插入到ht[1],否則數據插入到ht[0]。

  • 在對應bucket中插入數據的時候,數據總是插入dictEntry鏈表的頭部,因爲最近添加的數據被訪問的概率更高。

  • _dictKeyIndex中可能會觸發哈希表的內存擴展,即調用_dictExpandIfNeeded(d),它將哈希表的長度擴展爲原來的兩倍。

  • _dictKeyIndex,在dict中查找元素插入的位置。從代碼中,看到對ht[0]、ht[1]的遍歷:如果不在重哈希過程中,它只查找ht[0];否則查找ht[0]和ht[1]。

 

四、dictDelete - dict刪除

/* Search and remove an element.
 *
 * Looks for 'key' in ht[0] (and in ht[1] as well while rehashing), unlinks
 * the matching entry from its bucket chain and releases the entry with
 * zfree(). When 'nofree' is zero, dictFreeKey() and dictFreeVal() are
 * applied to the entry before it is released.
 *
 * Returns DICT_OK when an entry was removed, DICT_ERR when ht[0] has size 0
 * or the key was not found. */
static int dictGenericDelete(dict *d, const void *key, int nofree)
{
    unsigned int hash, slot;
    dictEntry *cur, *prev;
    int t;

    if (d->ht[0].size == 0) return DICT_ERR; /* d->ht[0].table is NULL */
    if (dictIsRehashing(d)) _dictRehashStep(d);
    hash = dictHashKey(d, key);

    for (t = 0; t <= 1; t++) {
        slot = hash & d->ht[t].sizemask;
        prev = NULL;
        for (cur = d->ht[t].table[slot]; cur != NULL;
             prev = cur, cur = cur->next)
        {
            if (key == cur->key || dictCompareKeys(d, key, cur->key)) {
                /* Unlink the element from the list: either bypass it from
                 * its predecessor or make its successor the bucket head. */
                if (prev != NULL)
                    prev->next = cur->next;
                else
                    d->ht[t].table[slot] = cur->next;
                if (!nofree) {
                    dictFreeKey(d, cur);
                    dictFreeVal(d, cur);
                }
                zfree(cur);
                d->ht[t].used--;
                return DICT_OK;
            }
        }
        if (!dictIsRehashing(d)) break;
    }
    return DICT_ERR; /* not found */
}

 

/* Remove 'key' from the dict through dictGenericDelete() with nofree = 0,
 * so dictFreeKey() and dictFreeVal() are applied to the removed entry.
 * Returns whatever dictGenericDelete() returns. */
int dictDelete(dict *ht, const void *key)
{
    const int nofree = 0;

    return dictGenericDelete(ht, key, nofree);
}

 

/* Remove 'key' from the dict through dictGenericDelete() with nofree = 1,
 * so dictFreeKey() and dictFreeVal() are skipped for the removed entry.
 * Returns whatever dictGenericDelete() returns. */
int dictDeleteNoFree(dict *ht, const void *key)
{
    const int nofree = 1;

    return dictGenericDelete(ht, key, nofree);
}

 

從dictDelete代碼中,可以看到:

  • dictDelete也會觸發推進一步重哈希(_dictRehashStep)。

  • 如果當前不在重哈希過程中,它只在ht[0]中查找要刪除的key;否則ht[0]和ht[1]它都要查找。

  • dictDelete(nofree爲0)刪除成功後會調用key和value的析構函數(keyDestructor和valDestructor);dictDeleteNoFree(nofree爲1)則只釋放dictEntry本身,不調用析構函數。

 

從dictFind、dictAdd/dictReplace、dictDelete代碼中,看到這些函數中都有_dictRehashStep(d)函數的調用(將重哈希推進一步)。此舉的目的就是將重哈希過程分散到各個查找、插入和刪除操作中去,而不是集中在某一個操作中一次性做完。

 

五、_dictRehashStep源碼實現

/* Perform a single step of rehashing, but only when no safe iterators are
 * bound to the hash table. With iterators active in the middle of a
 * rehash, the two hash tables must not be touched, otherwise some elements
 * could be missed or duplicated.
 *
 * Common lookup and update operations in the dictionary call this so that
 * the hash table migrates from H1 to H2 automatically while it is in
 * active use. */
static void _dictRehashStep(dict *d) {
    if (d->iterators != 0)
        return;
    dictRehash(d, 1);
}

 

/* Performs N steps of incremental rehashing. Returns 1 if there are still
 * keys to move from the old to the new hash table, otherwise 0.
 *
 * One rehashing step moves a whole bucket (which may hold more than one
 * key, since chaining is used) from the old to the new hash table. Because
 * part of the table may consist of empty buckets, it is not guaranteed that
 * even a single bucket gets rehashed: at most N*10 empty buckets are
 * visited in total, otherwise the amount of work would be unbounded and
 * the function could block for a long time. */
int dictRehash(dict *d, int n) {
    int empty_budget = n*10; /* Max number of empty buckets to visit. */
    if (!dictIsRehashing(d)) return 0;

    while (n-- && d->ht[0].used != 0) {
        dictEntry *node, *following;

        /* rehashidx can't overflow here: ht[0].used != 0 guarantees that
         * more elements remain in the old table. */
        assert(d->ht[0].size > (unsigned long)d->rehashidx);

        /* Skip empty buckets, giving up once the budget is exhausted. */
        while (d->ht[0].table[d->rehashidx] == NULL) {
            d->rehashidx++;
            if (--empty_budget == 0) return 1;
        }

        /* Move all the keys in this bucket from the old to the new table,
         * starting from the head of the chain. */
        for (node = d->ht[0].table[d->rehashidx]; node != NULL;
             node = following)
        {
            unsigned int dest;

            following = node->next;
            /* Index of this key in the new hash table. */
            dest = dictHashKey(d, node->key) & d->ht[1].sizemask;
            /* Insert at the head of the destination chain. */
            node->next = d->ht[1].table[dest];
            d->ht[1].table[dest] = node;
            d->ht[0].used--;
            d->ht[1].used++;
        }
        d->ht[0].table[d->rehashidx] = NULL;
        d->rehashidx++; /* this bucket is done, move on to the next one */
    }

    /* More to rehash... */
    if (d->ht[0].used != 0)
        return 1;

    /* The whole table has been rehashed: ht[0] holds no elements any more,
     * so release its bucket array, let ht[1] take its place and reset
     * ht[1]. */
    zfree(d->ht[0].table);
    d->ht[0] = d->ht[1];
    _dictReset(&d->ht[1]);
    d->rehashidx = -1; /* rehashing has finished */
    return 0;
}

 

 

_dictRehashStep,可以理解爲增量式重哈希。

dictRehash每次將重哈希最多向前推進N步(不到N步整個重哈希就結束了,或者訪問的空bucket數達到上限時,會提前返回),每一步都將ht[0]上某一個bucket(即一個dictEntry鏈表)上的每一個dictEntry移動到ht[1]上,它在ht[1]上的新位置根據ht[1]的sizemask進行重新計算。rehashidx記錄了當前尚未遷移(有待遷移)的ht[0]的bucket位置。

如果dictRehash被調用的時候,rehashidx指向的bucket裏一個dictEntry也沒有,那麼它就沒有可遷移的數據。這時它嘗試在ht[0].table數組中不斷向後遍歷,直到找到下一個存有數據的bucket位置。如果一直找不到,則最多走N*10步,本次重哈希暫告結束。

最後,如果ht[0]上的數據都遷移到ht[1]上了(即d->ht[0].used == 0),那麼整個重哈希結束,ht[0]變成ht[1]的內容,而ht[1]重置爲空。

對於重哈希過程的分析,上篇文章中dict結構圖所展示的正是rehashidx=2時的情況,前面兩個bucket(ht[0].table[0]和ht[0].table[1])都已經遷移到ht[1]上去了。

 

總結

Rehash操作分爲擴展和收縮兩種情況。

dict中有兩個hash表,ht[0]和ht[1]。從代碼中看出,dict的rehash並不是一次性完成的,而是分多次、漸進式地完成的。具體地說dict有兩種不同的策略:

一、_dictRehashStep:所有的數據都是存放在dict的ht[0]中,ht[1]只在rehash的時候使用。dict進行rehash的時候,將ht[0]中的所有數據rehash到ht[1]中。

二、dictRehashMilliseconds:每次執行一段固定的時間,時間到了就暫停rehash操作。

爲什麼要Rehash?

一、從感性上說,隨着HashTable中的數據增多,衝突的元素增多,ht[0]的鏈表增長,查找元素效率就越低,因此就需要Rehash。

二、從代碼角度看,哈希表利用負載因子loadfactor = used/size來表明hash表當前的存儲情況。當負載因子過大時操作的時間複雜度增大,負載因子過小時說明hash表的填充率很低,浪費內存。由於Redis中的數據都是存儲在內存中的,因此我們必須儘量地節省內存。因此我們必須將loadfactor控制在一定的範圍內,同時保證操作的時間複雜度接近O(1)和內存儘量被充分利用。

 

-EOF-

相關文章
相關標籤/搜索