compare.c

map_with_crush():算法

 1 void map_with_crush(int replication_count, int hosts_count, int object_map[][NUMBER_OF_OBJECTS]) {
 2   struct crush_map *m = crush_create();
 3   m->choose_local_tries = 0;
 4   m->choose_local_fallback_tries = 0;
 5   m->choose_total_tries = 50;
 6   m->chooseleaf_descend_once = 1;
 7   m->chooseleaf_vary_r = 1;
 8   m->chooseleaf_stable = 1;
 9   m->allowed_bucket_algs =
10     (1 << CRUSH_BUCKET_UNIFORM) |
11     (1 << CRUSH_BUCKET_LIST) |
12     (1 << CRUSH_BUCKET_STRAW2);
13   int root_type = 1;
14   int host_type = 2;  
15   int bucketno = 0;
16 
17   int hosts[hosts_count];
18   int weights[hosts_count];
19   int disk = 0;
20   int weight = 0x10000;  // 1.0
21   // 建立一個多個host類型的bucket,並給每一個bucket添加一個disk的item
22   for(int host = 0; host < hosts_count; host++) {
23     struct crush_bucket *b;
24     // 函數建立一個bucket結構。須要指定CRUSH算法(uniform、list、tree、straw)、hash算法、
25     // bucket類型(自定義的type)、size(包括多少個item)、這些items的id數組、這些items的weights
26     b = crush_make_bucket(m, CRUSH_BUCKET_STRAW2, CRUSH_HASH_DEFAULT, host_type,
27                           0, NULL, NULL);
28     assert(b != NULL);
29     // 往bucket中添加item, disk爲item的id,weight爲item的權重
30     assert(crush_bucket_add_item(m, b, disk, weight) == 0);
31     // 將b添加到m中,m->buckets[],bucket的id爲bucketno
32     assert(crush_add_bucket(m, 0, b, &bucketno) == 0);
33     hosts[host] = bucketno;
34     weights[host] = weight;
35     disk++;
36   }
37 
38   struct crush_bucket *root;
39   int bucket_root;
40   // 建立一個root類型的bucket,bucket中添加的item是hosts
41   root = crush_make_bucket(m, CRUSH_BUCKET_STRAW2, CRUSH_HASH_DEFAULT, root_type,
42                            hosts_count, hosts, weights);
43   assert(root != NULL);
44   // 將bucket添加帶map中
45   assert(crush_add_bucket(m, 0, root, &bucket_root) == 0);
46   // 調整權重分佈
47   assert(crush_reweight_bucket(m, root) == 0);
48   
49   struct crush_rule *r;
50   int minsize = 1;
51   int maxsize = 5;
52   int number_of_steps = 3;
53   r = crush_make_rule(number_of_steps, 0, 0, minsize, maxsize);
54   assert(r != NULL);
55   crush_rule_set_step(r, 0, CRUSH_RULE_TAKE, bucket_root, 0);
56   crush_rule_set_step(r, 1, CRUSH_RULE_CHOOSELEAF_FIRSTN, replication_count, host_type);
57   crush_rule_set_step(r, 2, CRUSH_RULE_EMIT, 0, 0);
58   int ruleno = crush_add_rule(m, r, -1);
59   assert(ruleno >= 0);
60 
61   crush_finalize(m);
62 
63   {
64     int result[replication_count];
65     __u32 weights[hosts_count];
66     for(int i = 0; i < hosts_count; i++)
67       weights[i] = 0x10000;
68     int cwin_size = crush_work_size(m, replication_count);
69     char cwin[cwin_size];
70     crush_init_workspace(m, cwin);
71     for(int x = 0; x < NUMBER_OF_OBJECTS; x++) {
72       memset(result, '\0', sizeof(int) * replication_count);
73       assert(crush_do_rule(m, ruleno, x, result, replication_count, weights, hosts_count, cwin) == replication_count);
74       for(int i = 0; i < replication_count; i++) {
75         object_map[i][x] = result[i];
76       }
77     }
78   }
79   crush_destroy(m);
80 }

構造crush map

1. 建立crush map數組

 struct crush_map *m = crush_create();數據結構

 1 struct crush_map *crush_create()
 2 {
 3     struct crush_map *m;
 4     m = malloc(sizeof(*m));
 5         if (!m)
 6                 return NULL;
 7     memset(m, 0, sizeof(*m));
 8 
 9     /* initialize legacy tunable values */
10     m->choose_local_tries = 2;
11     m->choose_local_fallback_tries = 5;
12     m->choose_total_tries = 19;
13     m->chooseleaf_descend_once = 0;
14     m->chooseleaf_vary_r = 0;
15     m->straw_calc_version = 0;
16 
17     // by default, use legacy types, and also exclude tree,
18     // since it was buggy.
19     m->allowed_bucket_algs = CRUSH_LEGACY_ALLOWED_BUCKET_ALGS;
20     return m;
21 }

2. 添加bucketapp

 分爲兩步:先make,後addide

 1  // 建立一個多個host類型的bucket,並給每一個bucket添加一個disk的item
 2   for(int host = 0; host < hosts_count; host++) {
 3     struct crush_bucket *b;
 4     // 函數建立一個bucket結構。須要指定CRUSH算法(uniform、list、tree、straw)、hash算法、
 5     // bucket類型(自定義的type)、size(包括多少個item)、這些items的id數組、這些items的weights
 6     b = crush_make_bucket(m, CRUSH_BUCKET_STRAW2, CRUSH_HASH_DEFAULT, host_type,
 7                           0, NULL, NULL);
 8     assert(b != NULL);
 9     // 往bucket中添加item, disk爲item的id,weight爲item的權重
10     assert(crush_bucket_add_item(m, b, disk, weight) == 0);
11     // 將b添加到m中,m->buckets[],bucket的id爲bucketno
12     assert(crush_add_bucket(m, 0, b, &bucketno) == 0);
13     hosts[host] = bucketno;
14     weights[host] = weight;
15     disk++;
16   }

 

2.1 make bucket函數

先看一下bucket的數據結構:oop

 1 /** @ingroup API
 2  *
 3  * A bucket contains __size__ __items__ which are either positive
 4  * numbers or negative numbers that reference other buckets and is
 5  * uniquely identified with __id__ which is a negative number.  The
 6  * __weight__ of a bucket is the cumulative weight of all its
 7  * children.  A bucket is assigned a ::crush_algorithm that is used by
 8  * crush_do_rule() to draw an item depending on its weight.  A bucket
 9  * can be assigned a strictly positive (> 0) __type__ defined by the
10  * caller. The __type__ can be used by crush_do_rule(), when it is
11  * given as an argument of a rule step.
12  *
13  * A pointer to crush_bucket can safely be cast into the following
14  * structure, depending on the value of __alg__:
15  *
16  * - __alg__ == ::CRUSH_BUCKET_UNIFORM cast to crush_bucket_uniform
17  * - __alg__ == ::CRUSH_BUCKET_LIST cast to crush_bucket_list
18  * - __alg__ == ::CRUSH_BUCKET_STRAW2 cast to crush_bucket_straw2
19  *
20  * The weight of each item depends on the algorithm and the
21  * information about it is available in the corresponding structure
22  * (crush_bucket_uniform, crush_bucket_list or crush_bucket_straw2).
23  *
24  * See crush_map for more information on how __id__ is used
25  * to reference the bucket.
26  */
27 struct crush_bucket {
28     __s32 id;        /*!< bucket identifier, < 0 and unique within a crush_map 小於0的惟一標記 */
29     __u16 type;      /*!< > 0 bucket type, defined by the caller > 0的值,用戶自定義的類型 */
30     __u8 alg;        /*!< the item selection ::crush_algorithm 所採用的crush算法 */
31         /*! @cond INTERNAL */
32     __u8 hash;       /* which hash function to use, CRUSH_HASH_* 使用的hash算法類型 */
33     /*! @endcond */
34     __u32 weight;    /*!< 16.16 fixed point cumulated children weight */
35     __u32 size;      /*!< size of the __items__ array,若是size爲0,則不包含item*/
36         __s32 *items;    /*!< array of children: < 0 are buckets, >= 0 items, 這是個數組,
37         記錄全部的item, 若是item的值是負數,則爲buket, 爲天然數則是device */
38 };

產生一個bucket:ui

b = crush_make_bucket(m, CRUSH_BUCKET_STRAW2, CRUSH_HASH_DEFAULT, host_type, 0, NULL, NULL);this

 1 struct crush_bucket*
 2 crush_make_bucket(struct crush_map *map,
 3           int alg, int hash, int type, int size,
 4           int *items,
 5           int *weights)
 6 {
 7     int item_weight;
 8 
 9     switch (alg) {
10     case CRUSH_BUCKET_UNIFORM:
11         if (size && weights)
12             item_weight = weights[0];
13         else
14             item_weight = 0;
15         return (struct crush_bucket *)crush_make_uniform_bucket(hash, type, size, items, item_weight);
16 
17     case CRUSH_BUCKET_LIST:
18         return (struct crush_bucket *)crush_make_list_bucket(hash, type, size, items, weights);
19 
20     case CRUSH_BUCKET_TREE:
21         return (struct crush_bucket *)crush_make_tree_bucket(hash, type, size, items, weights);
22 
23     case CRUSH_BUCKET_STRAW:
24         return (struct crush_bucket *)crush_make_straw_bucket(map, hash, type, size, items, weights);
25     case CRUSH_BUCKET_STRAW2:
26         return (struct crush_bucket *)crush_make_straw2_bucket(map, hash, type, size, items, weights);
27     }
28     return 0;
29 }
30 
31 struct crush_bucket_straw2 *
32 crush_make_straw2_bucket(struct crush_map *map,
33              int hash,          // hash值
34              int type,          // 類型
35              int size,          // item的大小
36              int *items,        // items
37              int *weights)      // item的weight值
38 {
39     struct crush_bucket_straw2 *bucket;
40     int i;
41 
42     bucket = malloc(sizeof(*bucket));
43         if (!bucket)
44                 return NULL;
45     memset(bucket, 0, sizeof(*bucket));
46     bucket->h.alg = CRUSH_BUCKET_STRAW2;
47     bucket->h.hash = hash;
48     bucket->h.type = type;
49     bucket->h.size = size;
50 
51     bucket->h.items = malloc(sizeof(__s32)*size);
52         if (!bucket->h.items)
53                 goto err;
54     bucket->item_weights = malloc(sizeof(__u32)*size);
55         if (!bucket->item_weights)
56                 goto err;
57 
58     bucket->h.weight = 0;
59     for (i=0; i<size; i++) {
60         bucket->h.items[i] = items[i];        // item中是item的id
61         bucket->h.weight += weights[i];       // 總的weight值
62         bucket->item_weights[i] = weights[i];
63     }
64 
65     return bucket;
66 err:
67         free(bucket->item_weights);
68         free(bucket->h.items);
69         free(bucket);
70         return NULL;
71 }

 

2.2 向bucket中添加itemspa

入參:

1. struct crush_map *map  map

2. struct crush_bucket *b   bucket

3. int item       item的id

4. int weight   item的weight

assert(crush_bucket_add_item(m, b, disk, weight) == 0);

 1 int crush_bucket_add_item(struct crush_map *map,
 2               struct crush_bucket *b, int item, int weight)
 3 {
 4     switch (b->alg) {
 5     case CRUSH_BUCKET_UNIFORM:
 6         return crush_add_uniform_bucket_item((struct crush_bucket_uniform *)b, item, weight);
 7     case CRUSH_BUCKET_LIST:
 8         return crush_add_list_bucket_item((struct crush_bucket_list *)b, item, weight);
 9     case CRUSH_BUCKET_TREE:
10         return crush_add_tree_bucket_item((struct crush_bucket_tree *)b, item, weight);
11     case CRUSH_BUCKET_STRAW:
12         return crush_add_straw_bucket_item(map, (struct crush_bucket_straw *)b, item, weight);
13     case CRUSH_BUCKET_STRAW2:
14         return crush_add_straw2_bucket_item(map, (struct crush_bucket_straw2 *)b, item, weight);
15     default:
16         return -1;
17     }
18 }
19 
20 int crush_add_straw2_bucket_item(struct crush_map *map,
21                  struct crush_bucket_straw2 *bucket,
22                  int item, int weight)
23 {
24     int newsize = bucket->h.size + 1;          // 添加一個item
25 
26     void *_realloc = NULL;
27 
28     if ((_realloc = realloc(bucket->h.items, sizeof(__s32)*newsize)) == NULL) {
29         return -ENOMEM;
30     } else {
31         bucket->h.items = _realloc;
32     }
33     if ((_realloc = realloc(bucket->item_weights, sizeof(__u32)*newsize)) == NULL) {
34         return -ENOMEM;
35     } else {
36         bucket->item_weights = _realloc;
37     }
38 
39     bucket->h.items[newsize-1] = item;
40     bucket->item_weights[newsize-1] = weight;
41 
42     if (crush_addition_is_unsafe(bucket->h.weight, weight))
43                 return -ERANGE;
44 
45     bucket->h.weight += weight;
46     bucket->h.size++;
47 
48     return 0;
49 }

 

2.3  把bucket添加到map中

入參:

1. struct crush_map *map  

2. int id   bucket的id

3. struct crush_bucket *bucket   bucket的結構體

4. int *idout 和id值一致

crush_add_bucket(m, 0, b, &bucketno) == 0

 1 int crush_add_bucket(struct crush_map *map,
 2              int id,
 3              struct crush_bucket *bucket,
 4              int *idout)
 5 {
 6     int pos;
 7 
 8     /* find a bucket id */
 9     if (id == 0)
10         id = crush_get_next_bucket_id(map);
11     pos = -1 - id;
12 
13     while (pos >= map->max_buckets) {     // 擴充map的bucket數組大小
14         /* expand array */
15         int oldsize = map->max_buckets;
16         if (map->max_buckets)             // 若是已經設置最大的bucket個數,則加倍擴充
17             map->max_buckets *= 2;
18         else
19             map->max_buckets = 8;         // 未進行設置,則默認爲8
20         void *_realloc = NULL;
21         if ((_realloc = realloc(map->buckets, map->max_buckets * sizeof(map->buckets[0]))) == NULL) {
22             return -ENOMEM; 
23         } else {
24             map->buckets = _realloc;
25         }
26         memset(map->buckets + oldsize, 0, (map->max_buckets-oldsize) * sizeof(map->buckets[0]));  // 初始化
27     }
28 
29     if (map->buckets[pos] != 0) {   // 判斷是否已經存在
30         return -EEXIST;
31     }
32 
33     /* add it */
34     bucket->id = id;
35     map->buckets[pos] = bucket;
36 
37     if (idout) *idout = id;    // 返回id
38     return 0;
39 }

 

2.4 構造root bucket

構造一個root類型的bucket,並將hosts添加到該bucket中;

root = crush_make_bucket(m, CRUSH_BUCKET_STRAW2, CRUSH_HASH_DEFAULT, root_type, hosts_count, hosts, weights);

 1 // 入參
 2 // struct crush_map *map
 3 // int alg 算法類型 CRUSH_BUCKET_STRAW2
 4 // int hash 選擇使用的是那種hash函數類型
 5 // int type類型,root type
 6 // 包含的item的大小, hosts_count
 7 // int *items item的id數組 hosts
 8 // int *weights item的weight集合 weights
 9 struct crush_bucket*
10 crush_make_bucket(struct crush_map *map,
11           int alg, int hash, int type, int size,
12           int *items,
13           int *weights)

 

// 將bucket添加帶map中
assert(crush_add_bucket(m, 0, root, &bucket_root) == 0);


// 調整權重分佈
assert(crush_reweight_bucket(m, root) == 0);

 1 int crush_reweight_bucket(struct crush_map *crush, struct crush_bucket *b)
 2 {
 3     switch (b->alg) {
 4     case CRUSH_BUCKET_UNIFORM:
 5         return crush_reweight_uniform_bucket(crush, (struct crush_bucket_uniform *)b);
 6     case CRUSH_BUCKET_LIST:
 7         return crush_reweight_list_bucket(crush, (struct crush_bucket_list *)b);
 8     case CRUSH_BUCKET_TREE:
 9         return crush_reweight_tree_bucket(crush, (struct crush_bucket_tree *)b);
10     case CRUSH_BUCKET_STRAW:
11         return crush_reweight_straw_bucket(crush, (struct crush_bucket_straw *)b);
12     case CRUSH_BUCKET_STRAW2:
13         return crush_reweight_straw2_bucket(crush, (struct crush_bucket_straw2 *)b);
14     default:
15         return -1;
16     }
17 }
18 
19 static int crush_reweight_straw2_bucket(struct crush_map *crush, struct crush_bucket_straw2 *bucket)
20 {
21     unsigned i;
22 
23     bucket->h.weight = 0;
24     for (i = 0; i < bucket->h.size; i++) {
25         int id = bucket->h.items[i];
26         if (id < 0) {
27             struct crush_bucket *c = crush->buckets[-1-id];
28             crush_reweight_bucket(crush, c);        // 遞歸調用,逐級調整其實就是從新計算下每一層的bucket的weight
29             bucket->item_weights[i] = c->weight;
30         }
31 
32         if (crush_addition_is_unsafe(bucket->h.weight, bucket->item_weights[i]))
33             return -ERANGE;
34 
35         bucket->h.weight += bucket->item_weights[i];
36     }
37 
38     return 0;
39 }

 

至此,crush_map已經建立完成

梳理一下:

1. 建立一個空map;

2. 建立host類型的bucket,向bucket添加disk類型的item; bucket, bucket->h.items[]

3. 將host添加到bucket中; map->buckets[]

4. 建立root類型的bucket,將host類型的bucket添加到root下,並調整相應的weight值;

 

接下來分析ruleset規則的建立,它決定了rule的選路規則

rule的數據結構

 1 /*
 2  * The rule mask is used to describe what the rule is intended for.
 3  * Given a ruleset and size of output set, we search through the
 4  * rule list for a matching rule_mask.
 5  */
 6 struct crush_rule_mask {
 7     __u8 ruleset;  // ruleset的編號
 8     __u8 type;     // 類型
 9     __u8 min_size;
10     __u8 max_size;
11 };
12 
13 struct crush_rule {
14     __u32 len;   //steps數組的長度
15     struct crush_rule_mask mask;      // ruleset的相關配置參數
16     struct crush_rule_step steps[0];  // 每個操做步驟,操做類型+參數
17 };

 

1. 首先,建立一個rule的數據結構,用於存儲rule信息

r = crush_make_rule(number_of_steps, 0, 0, minsize, maxsize);

 1 // step的number,ruleset的id,type的id
 2 struct crush_rule *crush_make_rule(int len, int ruleset, int type, int minsize, int maxsize)
 3 {
 4     struct crush_rule *rule;
 5     rule = malloc(crush_rule_size(len));
 6         if (!rule)
 7                 return NULL;
 8     rule->len = len;
 9     rule->mask.ruleset = ruleset;
10     rule->mask.type = type;
11     rule->mask.min_size = minsize;
12     rule->mask.max_size = maxsize;
13     return rule;
14 }

 

2. 接着,添加step:

 1 crush_rule_set_step(r, 0, CRUSH_RULE_TAKE, bucket_root, 0);
 2 
 3 crush_rule_set_step(r, 1, CRUSH_RULE_CHOOSELEAF_FIRSTN, replication_count, host_type);
 4 
 5 crush_rule_set_step(r, 2, CRUSH_RULE_EMIT, 0, 0);
 6 
 7 /*  入參:n爲step的編號,op爲操做碼,arg爲參數
 8  * be careful; this doesn't verify that the buffer you allocated is big enough!
 9  */
10 void crush_rule_set_step(struct crush_rule *rule, int n, int op, int arg1, int arg2)
11 {
12     assert((__u32)n < rule->len);
13     rule->steps[n].op = op;
14     rule->steps[n].arg1 = arg1;
15     rule->steps[n].arg2 = arg2;
16 }

 

3. 添加rule:

 1 int crush_add_rule(struct crush_map *map, struct crush_rule *rule, int ruleno)
 2 {
 3     __u32 r;
 4     // 若是ruleno小於0,則本身分配連續的rule,不然,按照指定的ruleno
 5     if (ruleno < 0) {
 6         for (r=0; r < map->max_rules; r++)
 7             if (map->rules[r] == 0)
 8                 break;
 9         assert(r < CRUSH_MAX_RULES);
10     }
11     else
12         r = ruleno;
13 
14     if (r >= map->max_rules) {
15         /* expand array */
16         int oldsize;
17         void *_realloc = NULL;
18         if (map->max_rules +1 > CRUSH_MAX_RULES)
19             return -ENOSPC;
20         oldsize = map->max_rules;
21         map->max_rules = r+1;  
22         if ((_realloc = realloc(map->rules, map->max_rules * sizeof(map->rules[0]))) == NULL) {
23             return -ENOMEM; 
24         } else {
25             map->rules = _realloc;
26         } 
27         memset(map->rules + oldsize, 0, (map->max_rules-oldsize) * sizeof(map->rules[0]));
28     }
29 
30     /* add it */
31     map->rules[r] = rule;
32     return r;
33 }

 

看一下map的數據結構:

 1 /** @ingroup API
 2  *
 3  * A crush map define a hierarchy of crush_bucket that end with leaves
 4  * (buckets and leaves are called items) and a set of crush_rule to
 5  * map an integer to items with the crush_do_rule() function.
 6  *
 7  */
 8 struct crush_map {
 9         /*! An array of crush_bucket pointers of size __max_buckets__.
10          * An element of the array may be NULL if the bucket was removed with
11          * crush_remove_bucket(). The buckets must be added with crush_add_bucket().
12          * The bucket found at __buckets[i]__ must have a crush_bucket.id == -1-i.
13          */
14     struct crush_bucket **buckets;     // 動態二維數組,保存全部的bucket結構
15         /*! An array of crush_rule pointers of size __max_rules__.
16          * An element of the array may be NULL if the rule was removed (there is
17          * no API to do so but there may be one in the future). The rules must be added
18          * with crush_add_rule().
19          */
20     struct crush_rule **rules;        // 保存全部的Crush_rule 
21         __s32 max_buckets; /*!< the size of __buckets__ */
22     __u32 max_rules; /*!< the size of __rules__ */
23         /*! The value of the highest item stored in the crush_map + 1
24          */
25     __s32 max_devices;
26 
27     /*! choose local retries before re-descent */
28     __u32 choose_local_tries;
29     /*! choose local attempts using a fallback permutation before
30      *! re-descent */
31     __u32 choose_local_fallback_tries;
32     /*! choose attempts before giving up */
33     __u32 choose_total_tries;
34     /*! attempt chooseleaf inner descent once for firstn mode; on
35      *! reject retry outer descent.  Note that this does *not*
36      *! apply to a collision: in that case we will retry as we used
37      *! to. */
38     __u32 chooseleaf_descend_once;
39 
40     /*! if non-zero, feed r into chooseleaf, bit-shifted right by (r-1)
41      *! bits.  a value of 1 is best for new clusters.  for legacy clusters
42      *! that want to limit reshuffling, a value of 3 or 4 will make the
43      *! mappings line up a bit better with previous mappings. */
44     __u8 chooseleaf_vary_r;
45 
46     /*! if true, it makes chooseleaf firstn to return stable results (if
47      *! no local retry) so that data migrations would be optimal when some
48      *! device fails. */
49     __u8 chooseleaf_stable;
50 
51         /*! @cond INTERNAL */
52     /* This value is calculated after decode or construction by
53        the builder. It is exposed here (rather than having a
54        'build CRUSH working space' function) so that callers can
55        reserve a static buffer, allocate space on the stack, or
56        otherwise avoid calling into the heap allocator if they
57        want to. The size of the working space depends on the map,
58        while the size of the scratch vector passed to the mapper
59        depends on the size of the desired result set.
60 
61        Nothing stops the caller from allocating both in one swell
62        foop and passing in two points, though. */
63     size_t working_size;
64 
65 #ifndef __KERNEL__
66     /*
67      * version 0 (original) of straw_calc has various flaws.  version 1
68      * fixes a few of them.
69      */
70     __u8 straw_calc_version;
71 
72     /*
73      * allowed bucket algs is a bitmask, here the bit positions
74      * are CRUSH_BUCKET_*.  note that these are *bits* and
75      * CRUSH_BUCKET_* values are not, so we need to or together (1
76      * << CRUSH_BUCKET_WHATEVER).  The 0th bit is not used to
77      * minimize confusion (bucket type values start at 1).
78      */
79     __u32 allowed_bucket_algs;
80 
81     __u32 *choose_tries;
82 #endif
83     /*! @endcond */
84 };

 

4. finalize rule

 1 /* 更新working_size和max_devices
 2  * finalize should be called _after_ all buckets are added to the map.
 3  */
 4 void crush_finalize(struct crush_map *map)
 5 {
 6     int b;
 7     __u32 i;
 8 
 9     /* Calculate the needed working space while we do other
10        finalization tasks. */
11     map->working_size = sizeof(struct crush_work);
12     /* Space for the array of pointers to per-bucket workspace */
13     map->working_size += map->max_buckets *
14         sizeof(struct crush_work_bucket *);
15 
16     /* calc max_devices */
17     map->max_devices = 0;
18     for (b=0; b<map->max_buckets; b++) {
19         if (map->buckets[b] == 0)
20             continue;
21         // 更新map->max_devices
22         for (i=0; i<map->buckets[b]->size; i++)
23             if (map->buckets[b]->items[i] >= map->max_devices)
24                 map->max_devices = map->buckets[b]->items[i] + 1;
25 
26         switch (map->buckets[b]->alg) {
27         default:
28             /* The base case, permutation variables and
29                the pointer to the permutation array. */
30             map->working_size += sizeof(struct crush_work_bucket);
31             break;
32         }
33         /* Every bucket has a permutation array. */
34         map->working_size += map->buckets[b]->size * sizeof(__u32);
35     }
36 }

 

如何調用

先初始化空間:

 1 int cwin_size = crush_work_size(m, replication_count);  // 3倍空間,o和w
 2 
 3 crush_init_workspace(m, cwin);
 4 
 5 for(int x = 0; x < NUMBER_OF_OBJECTS; x++) {
 6       memset(result, '\0', sizeof(int) * replication_count);
 7       assert(crush_do_rule(m, ruleno, x, result, replication_count, weights, hosts_count, cwin) == replication_count);
 8       for(int i = 0; i < replication_count; i++) {
 9         object_map[i][x] = result[i];
10       }
11  }

 

crush do rule

  1 /**
  2  * crush_do_rule - calculate a mapping with the given input and rule
  3  * @map: the crush_map
  4  * @ruleno: the rule id
  5  * @x: hash input
  6  * @result: pointer to result vector
  7  * @result_max: maximum result size
  8  * @weight: weight vector (for map leaves)
  9  * @weight_max: size of weight vector
 10  * @cwin: Pointer to at least map->working_size bytes of memory or NULL.
 11  */
 12 int crush_do_rule(const struct crush_map *map,
 13           int ruleno, int x, int *result, int result_max,
 14           const __u32 *weight, int weight_max,
 15           void *cwin)
 16 {
 17     int result_len;
 18     struct crush_work *cw = cwin;
 19     int *a = (int *)((char *)cw + map->working_size);  // 用了一個scratch空間來完成item的搜索
 20     int *b = a + result_max;
 21     int *c = b + result_max;
 22     int *w = a;
 23     int *o = b;
 24     int recurse_to_leaf;
 25     int wsize = 0;
 26     int osize;
 27     int *tmp;
 28     const struct crush_rule *rule;
 29     __u32 step;
 30     int i, j;
 31     int numrep;
 32     int out_size;
 33     /*
 34      * the original choose_total_tries value was off by one (it
 35      * counted "retries" and not "tries").  add one.
 36      */
 37     int choose_tries = map->choose_total_tries + 1;
 38     int choose_leaf_tries = 0;
 39     /*
 40      * the local tries values were counted as "retries", though,
 41      * and need no adjustment
 42      */
 43     int choose_local_retries = map->choose_local_tries;
 44     int choose_local_fallback_retries = map->choose_local_fallback_tries;
 45 
 46     int vary_r = map->chooseleaf_vary_r;
 47     int stable = map->chooseleaf_stable;
 48 
 49     if ((__u32)ruleno >= map->max_rules) {
 50         dprintk(" bad ruleno %d\n", ruleno);
 51         return 0;
 52     }
 53 
 54     rule = map->rules[ruleno];
 55     result_len = 0;
 56 
 57     for (step = 0; step < rule->len; step++) {
 58         int firstn = 0;
 59         const struct crush_rule_step *curstep = &rule->steps[step];
 60 
 61         switch (curstep->op) {
 62         case CRUSH_RULE_TAKE:
 63             if ((curstep->arg1 >= 0 &&
 64                  curstep->arg1 < map->max_devices) ||
 65                 (-1-curstep->arg1 >= 0 &&
 66                  -1-curstep->arg1 < map->max_buckets &&
 67                  map->buckets[-1-curstep->arg1])) {
 68                 w[0] = curstep->arg1;
 69                 wsize = 1;
 70             } else {
 71                 dprintk(" bad take value %d\n", curstep->arg1);
 72             }
 73             break;
 74 
 75         case CRUSH_RULE_SET_CHOOSE_TRIES:
 76             if (curstep->arg1 > 0)
 77                 choose_tries = curstep->arg1;
 78             break;
 79 
 80         case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
 81             if (curstep->arg1 > 0)
 82                 choose_leaf_tries = curstep->arg1;
 83             break;
 84 
 85         case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES:
 86             if (curstep->arg1 >= 0)
 87                 choose_local_retries = curstep->arg1;
 88             break;
 89 
 90         case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES:
 91             if (curstep->arg1 >= 0)
 92                 choose_local_fallback_retries = curstep->arg1;
 93             break;
 94 
 95         case CRUSH_RULE_SET_CHOOSELEAF_VARY_R:
 96             if (curstep->arg1 >= 0)
 97                 vary_r = curstep->arg1;
 98             break;
 99 
100         case CRUSH_RULE_SET_CHOOSELEAF_STABLE:
101             if (curstep->arg1 >= 0)
102                 stable = curstep->arg1;
103             break;
104 
105         case CRUSH_RULE_CHOOSELEAF_FIRSTN:
106         case CRUSH_RULE_CHOOSE_FIRSTN:
107             firstn = 1;
108             /* fall through */
109         case CRUSH_RULE_CHOOSELEAF_INDEP:
110         case CRUSH_RULE_CHOOSE_INDEP:
111             if (wsize == 0)
112                 break;
113 
114             recurse_to_leaf =
115                 curstep->op ==
116                  CRUSH_RULE_CHOOSELEAF_FIRSTN ||
117                 curstep->op ==
118                 CRUSH_RULE_CHOOSELEAF_INDEP;
119 
120             /* reset output */
121             osize = 0;
122 
123             for (i = 0; i < wsize; i++) {
124                 int bno;
125                 /*
126                  * see CRUSH_N, CRUSH_N_MINUS macros.
127                  * basically, numrep <= 0 means relative to
128                  * the provided result_max
129                  */
130                 numrep = curstep->arg1;
131                 if (numrep <= 0) {
132                     numrep += result_max;
133                     if (numrep <= 0)
134                         continue;
135                 }
136                 j = 0;
137                 /* make sure bucket id is valid */
138                 bno = -1 - w[i];
139                 if (bno < 0 || bno >= map->max_buckets) {
140                     // w[i] is probably CRUSH_ITEM_NONE
141                     dprintk("  bad w[i] %d\n", w[i]);
142                     continue;
143                 }
144                 if (firstn) {
145                     int recurse_tries;
146                     if (choose_leaf_tries)
147                         recurse_tries =
148                             choose_leaf_tries;
149                     else if (map->chooseleaf_descend_once)
150                         recurse_tries = 1;
151                     else
152                         recurse_tries = choose_tries;
153                     osize += crush_choose_firstn(
154                         map,
155                         cw,
156                         map->buckets[bno],
157                         weight, weight_max,
158                         x, numrep,
159                         curstep->arg2,
160                         o+osize, j,
161                         result_max-osize,
162                         choose_tries,
163                         recurse_tries,
164                         choose_local_retries,
165                         choose_local_fallback_retries,
166                         recurse_to_leaf,
167                         vary_r,
168                         stable,
169                         c+osize,
170                         0);
171                 } else {
172                     out_size = ((numrep < (result_max-osize)) ?
173                             numrep : (result_max-osize));
174                     crush_choose_indep(
175                         map,
176                         cw,
177                         map->buckets[bno],
178                         weight, weight_max,
179                         x, out_size, numrep,
180                         curstep->arg2,
181                         o+osize, j,
182                         choose_tries,
183                         choose_leaf_tries ?
184                            choose_leaf_tries : 1,
185                         recurse_to_leaf,
186                         c+osize,
187                         0);
188                     osize += out_size;
189                 }
190             }
191 
192             if (recurse_to_leaf)
193                 /* copy final _leaf_ values to output set */
194                 memcpy(o, c, osize*sizeof(*o));
195 
196             /* swap o and w arrays */
197             tmp = o;
198             o = w;
199             w = tmp;
200             wsize = osize;
201             break;
202 
203 
204         case CRUSH_RULE_EMIT:
205             for (i = 0; i < wsize && result_len < result_max; i++) {
206                 result[result_len] = w[i];
207                 result_len++;
208             }
209             wsize = 0;
210             break;
211 
212         default:
213             dprintk(" unknown op %d at step %d\n",
214                 curstep->op, step);
215             break;
216         }
217     }
218 
219     return result_len;
220 }
相關文章
相關標籤/搜索