VPP supports two QoS implementations: a policer-based QoS, and HQoS, which is built on top of the DPDK QoS framework.
As shown in the figure above, worker threads read packets from the NIC and process them. When the DPDK device transmit function is called and HQoS is configured on the device, the worker sets the HQoS-related metadata and pushes the packet into an SWQ (software queue; SWQs and worker threads have a 1:1 relationship). Once the worker thread is done, the HQoS threads (their number is determined by the configuration) poll the SWQs, dequeue the packets and run them through QoS processing, which uses the DPDK QoS framework.
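The hand-off from a worker to its HQoS thread can be pictured with the minimal sketch below. It is not the actual VPP transmit node: the helper name worker_handoff_to_hqos is invented for illustration, and the step that writes the subport/pipe/tc/queue metadata into each mbuf before the enqueue is omitted.

#include <rte_mbuf.h>
#include <rte_ring.h>

/* Sketch: a worker thread pushing a burst of packets into its per-worker
 * software ring (hqos_wt.swq); the HQoS thread dequeues them later. */
static inline void
worker_handoff_to_hqos (struct rte_ring *swq, struct rte_mbuf **pkts, unsigned n_pkts)
{
  /* single-producer enqueue: each worker owns its own ring for this device */
  unsigned n_sent = rte_ring_sp_enqueue_burst (swq, (void **) pkts, n_pkts, NULL);

  /* packets that do not fit into the ring are dropped and freed here */
  while (n_sent < n_pkts)
    rte_pktmbuf_free (pkts[n_sent++]);
}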
dpdk {
  socket-mem 16384,16384
  dev 0000:02:00.0 {
    num-rx-queues 2
    hqos                # enable HQoS on this NIC
  }
  dev 0000:06:00.0 {
    num-rx-queues 2
    hqos                # enable HQoS on this NIC
  }
  num-mbufs 1000000
}
cpu {
  main-core 0
  corelist-workers 1,2,3,4
  corelist-hqos-threads 5,6   # start two HQoS threads, on CPU 5 and CPU 6 respectively
}
These two configuration steps are enough to turn on HQoS; the default HQoS configuration is then used.
port {
  rate 1250000000           /* Assuming 10GbE port */
  frame_overhead 24         /* Overhead fields per Ethernet frame:
                             * 7B (Preamble) +
                             * 1B (Start of Frame Delimiter (SFD)) +
                             * 4B (Frame Check Sequence (FCS)) +
                             * 12B (Inter Frame Gap (IFG)) */
  mtu 1522                  /* Assuming Ethernet/IPv4 pkt (FCS not included) */
  n_subports_per_port 1     /* Number of subports per output interface */
  n_pipes_per_subport 4096  /* Number of pipes (users/subscribers) */
  queue_sizes 64 64 64 64   /* Packet queue size for each traffic class.
                             * All queues within the same pipe traffic class
                             * have the same size. Queues from different
                             * pipes serving the same traffic class have
                             * the same size. */
}
subport 0 {
  tb_rate 1250000000        /* Subport level token bucket rate (bytes per second) */
  tb_size 1000000           /* Subport level token bucket size (bytes) */
  tc0_rate 1250000000       /* Subport level token bucket rate for traffic class 0 (bytes per second) */
  tc1_rate 1250000000       /* Subport level token bucket rate for traffic class 1 (bytes per second) */
  tc2_rate 1250000000       /* Subport level token bucket rate for traffic class 2 (bytes per second) */
  tc3_rate 1250000000       /* Subport level token bucket rate for traffic class 3 (bytes per second) */
  tc_period 10              /* Time interval for refilling the token bucket associated with traffic class (Milliseconds) */
  pipe 0 4095 profile 0     /* Pipes (users/subscribers) 0 through 4095 are configured with pipe profile 0 */
}
pipe_profile 0 {
  tb_rate 305175            /* Pipe level token bucket rate (bytes per second) */
  tb_size 1000000           /* Pipe level token bucket size (bytes) */
  tc0_rate 305175           /* Pipe level token bucket rate for traffic class 0 (bytes per second) */
  tc1_rate 305175           /* Pipe level token bucket rate for traffic class 1 (bytes per second) */
  tc2_rate 305175           /* Pipe level token bucket rate for traffic class 2 (bytes per second) */
  tc3_rate 305175           /* Pipe level token bucket rate for traffic class 3 (bytes per second) */
  tc_period 40              /* Time interval for refilling the token bucket associated with traffic class at pipe level (Milliseconds) */
  tc3_oversubscription_weight 1 /* Weight traffic class 3 oversubscription */
  tc0_wrr_weights 1 1 1 1   /* Pipe queues WRR weights for traffic class 0 */
  tc1_wrr_weights 1 1 1 1   /* Pipe queues WRR weights for traffic class 1 */
  tc2_wrr_weights 1 1 1 1   /* Pipe queues WRR weights for traffic class 2 */
  tc3_wrr_weights 1 1 1 1   /* Pipe queues WRR weights for traffic class 3 */
}
red {
  tc0_wred_min 48 40 32     /* Minimum threshold for traffic class 0 queue (min_th) in number of packets */
  tc0_wred_max 64 64 64     /* Maximum threshold for traffic class 0 queue (max_th) in number of packets */
  tc0_wred_inv_prob 10 10 10 /* Inverse of packet marking probability for traffic class 0 queue (maxp = 1 / maxp_inv) */
  tc0_wred_weight 9 9 9     /* Traffic Class 0 queue weight */
  tc1_wred_min 48 40 32     /* Minimum threshold for traffic class 1 queue (min_th) in number of packets */
  tc1_wred_max 64 64 64     /* Maximum threshold for traffic class 1 queue (max_th) in number of packets */
  tc1_wred_inv_prob 10 10 10 /* Inverse of packet marking probability for traffic class 1 queue (maxp = 1 / maxp_inv) */
  tc1_wred_weight 9 9 9     /* Traffic Class 1 queue weight */
  tc2_wred_min 48 40 32     /* Minimum threshold for traffic class 2 queue (min_th) in number of packets */
  tc2_wred_max 64 64 64     /* Maximum threshold for traffic class 2 queue (max_th) in number of packets */
  tc2_wred_inv_prob 10 10 10 /* Inverse of packet marking probability for traffic class 2 queue (maxp = 1 / maxp_inv) */
  tc2_wred_weight 9 9 9     /* Traffic Class 2 queue weight */
  tc3_wred_min 48 40 32     /* Minimum threshold for traffic class 3 queue (min_th) in number of packets */
  tc3_wred_max 64 64 64     /* Maximum threshold for traffic class 3 queue (max_th) in number of packets */
  tc3_wred_inv_prob 10 10 10 /* Inverse of packet marking probability for traffic class 3 queue (maxp = 1 / maxp_inv) */
  tc3_wred_weight 9 9 9     /* Traffic Class 3 queue weight */
}
Some of the port, subport, pipe, tc and queue parameters can also be configured at run time through the CLI or the API; the commands are listed below, followed by a usage example.
set dpdk interface hqos subport <if-name> subport <n> [rate <n>] [bktsize <n>] [tc0 <n>] [tc1 <n>] [tc2 <n>] [tc3 <n>] [period <n>]
set dpdk interface hqos pipe <if-name> subport <n> pipe <n> profile <n>
set dpdk interface hqos placement <if-name> thread <n>
set dpdk interface hqos pktfield <if-name> id <n> offset <n> mask <n>
set dpdk interface hqos tctbl <if-name> entry <n> tc <n> queue <n>
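For example, the following hypothetical session (the interface name and values are placeholders, not recommendations) halves the rate of subport 0 relative to the default, binds pipe 2 to profile 0, maps translation-table entry 46 to traffic class 0 / queue 0, and pins the interface's HQoS processing to thread 5:

vpp# set dpdk interface hqos subport TenGigabitEthernet2/0/0 subport 0 rate 625000000
vpp# set dpdk interface hqos pipe TenGigabitEthernet2/0/0 subport 0 pipe 2 profile 0
vpp# set dpdk interface hqos tctbl TenGigabitEthernet2/0/0 entry 46 tc 0 queue 0
vpp# set dpdk interface hqos placement TenGigabitEthernet2/0/0 thread 5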
vpp# show dpdk interface hqos TenGigabitEthernet2/0/0
 Thread:
   Input SWQ size = 4096 packets
   Enqueue burst size = 256 packets
   Dequeue burst size = 220 packets
   Packet field 0: slab position = 0, slab bitmask = 0x0000000000000000
   Packet field 1: slab position = 40, slab bitmask = 0x0000000fff000000
   Packet field 2: slab position = 8, slab bitmask = 0x00000000000000fc
   Packet field 2 translation table:
     [ 0 .. 15]: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
     [16 .. 31]: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
     [32 .. 47]: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
     [48 .. 63]: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
 Port:
   Rate = 1250000000 bytes/second
   MTU = 1514 bytes
   Frame overhead = 24 bytes
   Number of subports = 1
   Number of pipes per subport = 4096
   Packet queue size: TC0 = 64, TC1 = 64, TC2 = 64, TC3 = 64 packets
   Number of pipe profiles = 1
 Pipe profile 0:
   Rate = 305175 bytes/second
   Token bucket size = 1000000 bytes
   Traffic class rate: TC0 = 305175, TC1 = 305175, TC2 = 305175, TC3 = 305175 bytes/second
   TC period = 40 milliseconds
   TC0 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1
   TC1 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1
   TC2 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1
   TC3 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1
vpp# show dpdk interface hqos placement
 Thread 5 (vpp_hqos-threads_0 at lcore 5):
   TenGigabitEthernet2/0/0 queue 0
 Thread 6 (vpp_hqos-threads_1 at lcore 6):
   TenGigabitEthernet4/0/1 queue 0
For this part, please refer to:
http://doc.dpdk.org/guides/pr...
HQoS is configured per DPDK device, so the DPDK device descriptor control block contains HQoS-related members.
typedef struct
{
  ......
  /* HQoS related. There is a mapping between worker threads and HQoS threads,
   * because the two kinds of threads do not exist in equal numbers. */
  dpdk_device_hqos_per_worker_thread_t *hqos_wt; /* per-worker data: workers write packets into hqos_wt.swq,
                                                  * which points at the corresponding hqos_ht.swq ring */
  dpdk_device_hqos_per_hqos_thread_t *hqos_ht;   /* per-HQoS-thread data: the HQoS thread reads packets from hqos_ht.swq */
  ......
} dpdk_device_t;
typedef struct
{
  /* Required for vec_validate_aligned */
  CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
  struct rte_ring *swq;     /* the ring this worker writes this device's packets into */
  /* The fields below classify packets, i.e. they select the DPDK QoS subport, pipe, tc and queue. */
  u64 hqos_field0_slabmask;
  u32 hqos_field0_slabpos;
  u32 hqos_field0_slabshr;
  u64 hqos_field1_slabmask;
  u32 hqos_field1_slabpos;
  u32 hqos_field1_slabshr;
  u64 hqos_field2_slabmask;
  u32 hqos_field2_slabpos;
  u32 hqos_field2_slabshr;
  u32 hqos_tc_table[64];
} dpdk_device_hqos_per_worker_thread_t;
The structure above is each worker thread's HQoS-related private data; a DPDK device maintains a vector of these, one element per worker thread. Its members are mainly used to classify packets. As can be seen, the classification is quite simple and cannot satisfy finer-grained classification requirements; it could be improved further by combining it with the classifier.
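The following is a rough, illustrative sketch of how the slabpos/slabmask/slabshr triples and hqos_tc_table turn packet bytes into (subport, pipe, tc, queue). It is not the verbatim VPP code: extract_slab is a hypothetical helper, the (tc * 4 + queue) encoding of the table entries is an assumption, and the struct definition shown above is assumed to be in scope.

#include <stdint.h>
#include <string.h>

/* Hypothetical helper mirroring the 64-bit load/mask/shift style of access. */
static inline uint64_t
extract_slab (const uint8_t *pkt_data, uint32_t slabpos,
              uint64_t slabmask, uint32_t slabshr)
{
  uint64_t slab;
  memcpy (&slab, pkt_data + slabpos, sizeof (slab)); /* 8-byte load at byte offset slabpos */
  return (slab & slabmask) >> slabshr;
}

static inline void
classify_pkt (const dpdk_device_hqos_per_worker_thread_t *wt,
              const uint8_t *pkt_data,
              uint32_t *subport, uint32_t *pipe, uint32_t *tc, uint32_t *queue)
{
  *subport = (uint32_t) extract_slab (pkt_data, wt->hqos_field0_slabpos,
                                      wt->hqos_field0_slabmask, wt->hqos_field0_slabshr);
  *pipe = (uint32_t) extract_slab (pkt_data, wt->hqos_field1_slabpos,
                                   wt->hqos_field1_slabmask, wt->hqos_field1_slabshr);

  /* Field 2 yields a 6-bit index (e.g. the DSCP bits) into the translation table. */
  uint32_t idx = (uint32_t) (extract_slab (pkt_data, wt->hqos_field2_slabpos,
                                           wt->hqos_field2_slabmask,
                                           wt->hqos_field2_slabshr) & 0x3F);

  /* Assumption: each table entry packs traffic class and queue as (tc * 4 + queue). */
  *tc = wt->hqos_tc_table[idx] / 4;
  *queue = wt->hqos_tc_table[idx] % 4;

  /* The worker then stores this tuple in the mbuf scheduler metadata and
   * enqueues the packet into its swq ring for the HQoS thread to pick up. */
}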
typedef struct
{
  /* Required for vec_validate_aligned */
  CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
  struct rte_ring **swq;       /* array of worker-to-HQoS hand-off rings, one per worker thread */
  struct rte_mbuf **pkts_enq;  /* enqueue buffer: packets pulled from the swq rings, then pushed into the QoS port */
  struct rte_mbuf **pkts_deq;  /* dequeue buffer: packets pulled out of the QoS port, then transmitted on the NIC */
  struct rte_sched_port *hqos; /* the device's DPDK QoS port configuration and state */
  u32 hqos_burst_enq;          /* limit on the number of packets enqueued into the QoS port at once */
  u32 hqos_burst_deq;          /* limit on the number of packets dequeued from the QoS port at once */
  u32 pkts_enq_len;            /* number of packets currently sitting in pkts_enq */
  u32 swq_pos;                 /* position of the software queue the HQoS iterator is currently serving */
  u32 flush_count;             /* number of idle spins with fewer packets than a full burst; once a
                                * threshold is reached the pending packets are written immediately
                                * instead of waiting any longer */
} dpdk_device_hqos_per_hqos_thread_t;
The structure above is each HQoS thread's private data; a DPDK device can belong to only one HQoS thread. It mainly maintains the bridging queues toward the worker threads as well as the HQoS parameters of the port.
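To make the bridge concrete, here is a simplified wiring sketch. The function and parameter names (wire_hqos_rings, n_workers, swq_size) are invented for illustration, and the real initialization in the DPDK plugin also allocates hqos_ht->swq, sizes pkts_enq/pkts_deq and builds the rte_sched_port; only the ring sharing is shown.

#include <stdio.h>
#include <rte_ring.h>

/* Sketch: give worker i and the HQoS thread a shared ring, so that
 * hqos_wt[i].swq and hqos_ht->swq[i] point at the same rte_ring. */
static void
wire_hqos_rings (dpdk_device_t *xd, unsigned n_workers,
                 unsigned swq_size, int socket_id)
{
  for (unsigned i = 0; i < n_workers; i++)
    {
      char name[RTE_RING_NAMESIZE];
      snprintf (name, sizeof (name), "swq-dev%u-wk%u",
                (unsigned) xd->port_id, i);

      /* single producer (worker i), single consumer (the HQoS thread) */
      xd->hqos_ht->swq[i] = rte_ring_create (name, swq_size, socket_id,
                                             RING_F_SP_ENQ | RING_F_SC_DEQ);

      /* the worker-side view is just a pointer to the same ring */
      xd->hqos_wt[i].swq = xd->hqos_ht->swq[i];
    }
}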
typedef struct
{
  ......
  /* Per-HQoS-thread device array: indexed by HQoS cpu (thread), it yields the vector of DPDK
   * devices that thread manages, establishing the one-to-many relationship between HQoS
   * threads and DPDK devices. */
  dpdk_device_and_queue_t **devices_by_hqos_cpu;
  ......
} dpdk_main_t;
/* *INDENT-OFF* Register the HQoS thread type */
VLIB_REGISTER_THREAD (hqos_thread_reg, static) = {
  .name = "hqos-threads",
  .short_name = "hqos-threads",
  .function = dpdk_hqos_thread_fn, /* thread entry function */
};
Recall the corelist-hqos-threads 5,6 setting shown earlier. When VPP parses this option, it searches the registered thread-type list for a type named hqos-threads; once found, it launches the requested number of threads of that type, with dpdk_hqos_thread_fn as their entry function.
/* HQoS thread entry function */
void
dpdk_hqos_thread_fn (void *arg)
{
  vlib_worker_thread_t *w = (vlib_worker_thread_t *) arg;
  vlib_worker_thread_init (w); /* generic per-thread initialization */
  dpdk_hqos_thread (w);
}

void
dpdk_hqos_thread (vlib_worker_thread_t * w)
{
  vlib_main_t *vm;
  vlib_thread_main_t *tm = vlib_get_thread_main ();
  dpdk_main_t *dm = &dpdk_main;

  vm = vlib_get_main ();

  ASSERT (vm->thread_index == vlib_get_thread_index ());

  clib_time_init (&vm->clib_time);
  clib_mem_set_heap (w->thread_mheap);

  /* Wait until the dpdk init sequence is complete */
  while (tm->worker_thread_release == 0)
    vlib_worker_thread_barrier_check ();

  /* look up the devices assigned to this HQoS thread by its cpu (thread) index */
  if (vec_len (dm->devices_by_hqos_cpu[vm->thread_index]) == 0)
    return
      clib_error ("current I/O TX thread does not have any devices assigned to it");

  if (DPDK_HQOS_DBG_BYPASS)
    dpdk_hqos_thread_internal_hqos_dbg_bypass (vm); /* debug-only bypass path */
  else
    dpdk_hqos_thread_internal (vm); /* core processing function */
}
static_always_inline void
dpdk_hqos_thread_internal (vlib_main_t * vm)
{
  dpdk_main_t *dm = &dpdk_main;
  u32 thread_index = vm->thread_index;
  u32 dev_pos;

  dev_pos = 0; /* index of the device to start with */
  while (1)    /* loop over the devices assigned to this thread, forever */
    {
      vlib_worker_thread_barrier_check (); /* check whether the main thread requested a sync */

      /* devices assigned to this HQoS thread, indexed by cpu (thread) id */
      u32 n_devs = vec_len (dm->devices_by_hqos_cpu[thread_index]);
      if (PREDICT_FALSE (n_devs == 0))
        {
          dev_pos = 0;
          continue;
        }
      /* one full pass completed, start a new one */
      if (dev_pos >= n_devs)
        dev_pos = 0;

      /* context of the device currently being served */
      dpdk_device_and_queue_t *dq =
        vec_elt_at_index (dm->devices_by_hqos_cpu[thread_index], dev_pos);
      /* DPDK device descriptor control block */
      dpdk_device_t *xd = vec_elt_at_index (dm->devices, dq->device);

      /* this device's per-HQoS-thread data */
      dpdk_device_hqos_per_hqos_thread_t *hqos = xd->hqos_ht;
      u32 device_index = xd->port_id;
      u16 queue_id = dq->queue_id;

      /* enqueue buffer */
      struct rte_mbuf **pkts_enq = hqos->pkts_enq;
      /* dequeue buffer */
      struct rte_mbuf **pkts_deq = hqos->pkts_deq;

      /* number of packets already sitting in the enqueue buffer */
      u32 pkts_enq_len = hqos->pkts_enq_len;
      u32 swq_pos = hqos->swq_pos; /* software queue to continue from */
      u32 n_swq = vec_len (hqos->swq), i;
      u32 flush_count = hqos->flush_count; /* how many times the burst was not filled */

      /*
       * SWQ dequeue and HQoS enqueue for current device:
       * pull packets out of the swq rings and push them into the HQoS port.
       */
      for (i = 0; i < n_swq; i++)
        {
          /* Get current SWQ for this device */
          struct rte_ring *swq = hqos->swq[swq_pos];

          /* Read SWQ burst to packet buffer of this device */
          pkts_enq_len += rte_ring_sc_dequeue_burst (swq,
                                                     (void **)
                                                     &pkts_enq[pkts_enq_len],
                                                     hqos->hqos_burst_enq, 0);

          /* Get next SWQ for this device */
          swq_pos++;
          if (swq_pos >= n_swq)
            swq_pos = 0;
          hqos->swq_pos = swq_pos;

          /* HQoS enqueue when burst available:
           * push a full burst of packets into the HQoS port, then stop */
          if (pkts_enq_len >= hqos->hqos_burst_enq)
            {
              rte_sched_port_enqueue (hqos->hqos, pkts_enq, pkts_enq_len);

              pkts_enq_len = 0;
              flush_count = 0;
              break;
            }
        }
      if (pkts_enq_len) /* a partial burst of pkts_enq_len packets is pending */
        {
          flush_count++; /* one more spin without a full burst */
          if (PREDICT_FALSE (flush_count == HQOS_FLUSH_COUNT_THRESHOLD))
            {
              /* threshold reached: flush the partial burst instead of waiting longer */
              rte_sched_port_enqueue (hqos->hqos, pkts_enq, pkts_enq_len);

              pkts_enq_len = 0;
              flush_count = 0;
            }
        }
      hqos->pkts_enq_len = pkts_enq_len;
      hqos->flush_count = flush_count;

      /*
       * HQoS dequeue and HWQ TX enqueue for current device:
       * pull packets back out of the HQoS port and transmit them on the NIC.
       */
      {
        u32 pkts_deq_len, n_pkts;

        pkts_deq_len = rte_sched_port_dequeue (hqos->hqos,
                                               pkts_deq,
                                               hqos->hqos_burst_deq);

        for (n_pkts = 0; n_pkts < pkts_deq_len;)
          /* hand the packets to the NIC TX queue */
          n_pkts += rte_eth_tx_burst (device_index,
                                      (uint16_t) queue_id,
                                      &pkts_deq[n_pkts],
                                      (uint16_t) (pkts_deq_len - n_pkts));
      }

      /* Advance to next device */
      dev_pos++;
    }
}