搞網絡不知道dpdk……不合適……
搞dpdk不知道rte_mbuf……不合適……
所以,搞搞搞……
上源碼!!!
//關於dpdk rte_mbuf數據結構的學習 /* define a set of marker types that can be used to refer to set points in the * mbuf */ /* 定義一組可用於引用 mbuf 中的設置點的標記類型*/ __extension__ typedef void *MARKER[0]; /**< generic marker for a point in a structure */ __extension__ typedef uint8_t MARKER8[0]; /**< generic marker with 1B alignment */ __extension__ typedef uint64_t MARKER64[0]; /**< marker that allows us to overwrite 8 bytes * with a single assignment */ /** * The generic rte_mbuf, containing a packet mbuf. */ struct rte_mbuf { MARKER cacheline0; /* 柔性數組,標記開頭 */ void *buf_addr; /**< Virtual address of segment buffer. */ /** * Physical address of segment buffer. * Force alignment to 8-bytes, so as to ensure we have the exact * same mbuf cacheline0 layout for 32-bit and 64-bit. This makes * working on vector drivers easier. */ RTE_STD_C11 union { rte_iova_t buf_iova; rte_iova_t buf_physaddr; /**< deprecated */ } __rte_aligned(sizeof(rte_iova_t)); /* next 8 bytes are initialised on RX descriptor rearm */ MARKER64 rearm_data; uint16_t data_off; /** * Reference counter. Its size should at least equal to the size * of port field (16 bits), to support zero-copy broadcast. * It should only be accessed using the following functions: * rte_mbuf_refcnt_update(), rte_mbuf_refcnt_read(), and * rte_mbuf_refcnt_set(). The functionality of these functions (atomic, * or non-atomic) is controlled by the CONFIG_RTE_MBUF_REFCNT_ATOMIC * config option. */ RTE_STD_C11 union { rte_atomic16_t refcnt_atomic; /**< Atomically accessed refcnt */ uint16_t refcnt; /**< Non-atomically accessed refcnt */ }; uint16_t nb_segs; /**< Number of segments. */ /** Input port (16 bits to support more than 256 virtual ports). */ uint16_t port; uint64_t ol_flags; /**< Offload features. */ /* remaining bytes are set on RX when pulling packet from descriptor */ MARKER rx_descriptor_fields1; /* * The packet type, which is the combination of outer/inner L2, L3, L4 * and tunnel types. 
The packet_type is about data really present in the * mbuf. Example: if vlan stripping is enabled, a received vlan packet * would have RTE_PTYPE_L2_ETHER and not RTE_PTYPE_L2_VLAN because the * vlan is stripped from the data. */ RTE_STD_C11 union { uint32_t packet_type; /**< L2/L3/L4 and tunnel information. */ struct { uint32_t l2_type:4; /**< (Outer) L2 type. */ uint32_t l3_type:4; /**< (Outer) L3 type. */ uint32_t l4_type:4; /**< (Outer) L4 type. */ uint32_t tun_type:4; /**< Tunnel type. */ RTE_STD_C11 union { uint8_t inner_esp_next_proto; /**< ESP next protocol type, valid if * RTE_PTYPE_TUNNEL_ESP tunnel type is set * on both Tx and Rx. */ __extension__ struct { uint8_t inner_l2_type:4; /**< Inner L2 type. */ uint8_t inner_l3_type:4; /**< Inner L3 type. */ }; }; uint32_t inner_l4_type:4; /**< Inner L4 type. */ }; }; uint32_t pkt_len; /**< Total pkt len: sum of all segments. */ uint16_t data_len; /**< Amount of data in segment buffer. */ /** VLAN TCI (CPU order), valid if PKT_RX_VLAN_STRIPPED is set. */ uint16_t vlan_tci; union { uint32_t rss; /**< RSS hash result if RSS enabled */ struct { RTE_STD_C11 union { struct { uint16_t hash; uint16_t id; }; uint32_t lo; /**< Second 4 flexible bytes */ }; uint32_t hi; /**< First 4 flexible bytes or FD ID, dependent on PKT_RX_FDIR_* flag in ol_flags. */ } fdir; /**< Filter identifier if FDIR enabled */ struct { uint32_t lo; uint32_t hi; } sched; /**< Hierarchical scheduler */ uint32_t usr; /**< User defined tags. See rte_distributor_process() */ } hash; /**< hash information */ /** Outer VLAN TCI (CPU order), valid if PKT_RX_QINQ_STRIPPED is set. */ uint16_t vlan_tci_outer; uint16_t buf_len; /**< Length of segment buffer. */ /** Valid if PKT_RX_TIMESTAMP is set. The unit and time reference * are not normalized but are always the same for a given port. 
*/ uint64_t timestamp; /* second cache line - fields only used in slow path or on TX */ MARKER cacheline1 __rte_cache_min_aligned; RTE_STD_C11 union { void *userdata; /**< Can be used for external metadata */ uint64_t udata64; /**< Allow 8-byte userdata on 32-bit */ }; struct rte_mempool *pool; /**< Pool from which mbuf was allocated. */ struct rte_mbuf *next; /**< Next segment of scattered packet. */ /* fields to support TX offloads */ RTE_STD_C11 union { uint64_t tx_offload; /**< combined for easy fetch */ __extension__ struct { uint64_t l2_len:7; /**< L2 (MAC) Header Length for non-tunneling pkt. * Outer_L4_len + ... + Inner_L2_len for tunneling pkt. */ uint64_t l3_len:9; /**< L3 (IP) Header Length. */ uint64_t l4_len:8; /**< L4 (TCP/UDP) Header Length. */ uint64_t tso_segsz:16; /**< TCP TSO segment size */ /* fields for TX offloading of tunnels */ uint64_t outer_l3_len:9; /**< Outer L3 (IP) Hdr Length. */ uint64_t outer_l2_len:7; /**< Outer L2 (MAC) Hdr Length. */ /* uint64_t unused:8; */ }; }; /** Size of the application private data. In case of an indirect * mbuf, it stores the direct mbuf private data size. */ uint16_t priv_size; /** Timesync flags for use with IEEE1588. */ uint16_t timesync; /** Sequence number. See also rte_reorder_insert(). */ uint32_t seqn; }
好傢伙,果然是大名鼎鼎的mbuf。下面分別對每一個字段進行學習解釋。
下面按照出現順序對每一個字段進行解釋。
MARKER cacheline0; typedef void *MARKER[0]; /**< generic marker for a point in a structure */
查看typedef,發現這是一個長度爲0的數組(GNU C的零長度數組擴展,並不是C99中必須位於結構體末尾的柔性數組成員)。長度爲0,所以它在結構體中不佔用內存,只是一個標記而已——MARKER嘛。不過它本身仍按元素類型(這裏是void *)的對齊要求對齊。
void *buf_addr; /**< Virtual address of segment buffer. */
有圖就容易解釋了,一些指針、成員或函數結果的內容在下表中列出,mbuf指針簡寫爲m:
m | 首部,即mbuf結構體 |
m->buf_addr | headroom起始地址 |
m->data_off | data起始地址相對於buf_addr的偏移 |
m->buf_len | mbuf和priv以後內存的長度,包含headroom |
m->pkt_len | 整個mbuf鏈的data總長度 |
m->data_len | 實際data的長度 |
m->buf_addr+m->data_off | 實際data的起始地址 |
rte_pktmbuf_mtod(m) | 同上 |
rte_pktmbuf_data_len(m) | 同m->data_len |
rte_pktmbuf_pkt_len | 同m->pkt_len |
rte_pktmbuf_data_room_size | 同m->buf_len |
rte_pktmbuf_headroom | headroom長度 |
rte_pktmbuf_tailroom | 尾部剩餘空間長度 |
綜合圖片解釋以及上述表格的備註:這裏buf_addr指向段緩衝區的起始位置,也就是headroom的起始地址。對於普通(direct)mbuf,這塊緩衝區位於rte_mbuf結構體及其後的私有數據(priv)之後。
/** * Physical address of segment buffer. * Force alignment to 8-bytes, so as to ensure we have the exact * same mbuf cacheline0 layout for 32-bit and 64-bit. This makes * working on vector drivers easier. */ RTE_STD_C11 union { rte_iova_t buf_iova; rte_iova_t buf_physaddr; /**< deprecated */ } __rte_aligned(sizeof(rte_iova_t));
段緩衝區的物理地址。強制8字節對齊,保證在32位和64位平臺上cacheline0的佈局完全相同。這塊暫時無需關注。
/* next 8 bytes are initialised on RX descriptor rearm */ MARKER64 rearm_data;
接下來的 8 個字節在 RX 描述符重裝時初始化 。
uint16_t data_off;
data起始地址相對於buf_addr的偏移。要獲取data的位置,m->buf_addr + m->data_off ,就是對應的data的實際指針。通常中間間隔是一個headroom的大小。
/** * Reference counter. Its size should at least equal to the size * of port field (16 bits), to support zero-copy broadcast. * It should only be accessed using the following functions: * rte_mbuf_refcnt_update(), rte_mbuf_refcnt_read(), and * rte_mbuf_refcnt_set(). The functionality of these functions (atomic, * or non-atomic) is controlled by the CONFIG_RTE_MBUF_REFCNT_ATOMIC * config option. */ RTE_STD_C11 union { rte_atomic16_t refcnt_atomic; /**< Atomically accessed refcnt */ uint16_t refcnt; /**< Non-atomically accessed refcnt */ };
引用計數。這裏用union提供了原子訪問和非原子訪問2種形式。計數器的寬度至少要等於端口字段的寬度(16 bits),用來支持零拷貝廣播:向多個端口廣播同一個報文時不複製數據,而是讓每個端口各持有一份對同一緩衝區的引用,因此引用計數必須至少能數到端口的個數。
uint16_t nb_segs; /**< Number of segments. */
段(segment)數,即組成這個報文的mbuf鏈中mbuf的個數。注意這裏的「分段」指一個報文分散存放在多個mbuf中,並不是IP分片。
/** Input port (16 bits to support more than 256 virtual ports). */ uint16_t port;
入接口id號。
uint64_t ol_flags; /**< Offload features. */
offload特性標記。
offload特性,主要是指將本來在協議棧中進行的IP分片、TCP分段、重組、checksum校驗等操作,轉移到網卡硬件中進行,降低系統CPU的消耗,提高處理性能。
/* remaining bytes are set on RX when pulling packet from descriptor */ MARKER rx_descriptor_fields1;
其餘字節是在收包(RX)時、從描述符中取出數據包的過程中賦值的。這裏同樣只是一個標記,MARKER……
/* * The packet type, which is the combination of outer/inner L2, L3, L4 * and tunnel types. The packet_type is about data really present in the * mbuf. Example: if vlan stripping is enabled, a received vlan packet * would have RTE_PTYPE_L2_ETHER and not RTE_PTYPE_L2_VLAN because the * vlan is stripped from the data. */ /* 數據包類型,它是外部/內部 L2、L3、L4 和隧道類型的組合。 * packet_type 是關於 mbuf 中真正存在的數據。 * 如果啓用了 vlan 剝離,則接收到的 vlan 數據包將具有 RTE_PTYPE_L2_ETHER * 而不是 RTE_PTYPE_L2_VLAN,由於 vlan 已從數據中剝離。 */ RTE_STD_C11 union { uint32_t packet_type; /**< L2/L3/L4 and tunnel information. */ struct { uint32_t l2_type:4; /**< (Outer) L2 type. */ uint32_t l3_type:4; /**< (Outer) L3 type. */ uint32_t l4_type:4; /**< (Outer) L4 type. */ uint32_t tun_type:4; /**< Tunnel type. */ RTE_STD_C11 union { uint8_t inner_esp_next_proto; /**< ESP next protocol type, valid if * RTE_PTYPE_TUNNEL_ESP tunnel type is set * on both Tx and Rx. */ __extension__ struct { uint8_t inner_l2_type:4; /**< Inner L2 type. */ uint8_t inner_l3_type:4; /**< Inner L3 type. */ }; }; uint32_t inner_l4_type:4; /**< Inner L4 type. */ }; };
此數據結構比較清晰,無需多餘解釋。有一個疑問:這裏的inner和outer具體是什麼呢?結合tun_type字段來看,它們應該對應隧道報文:outer指外層封裝的報文頭,inner指被封裝在隧道內部的原始報文頭。
uint32_t pkt_len; /**< Total pkt len: sum of all segments. */ uint16_t data_len; /**< Amount of data in segment buffer. */
pkt_len,整個報文的總長度,即所有段(segment)的數據長度之和。
data_len,當前這個段緩衝區中的數據長度。如果報文只有一個段,pkt_len與data_len的數值是相同的。也就是pkt_len >= data_len。
/** VLAN TCI (CPU order), valid if PKT_RX_VLAN_STRIPPED is set. */ uint16_t vlan_tci;
只有在設置了PKT_RX_VLAN_STRIPPED標記時,此字段才是有效的。vlan時使用,學習vlan時,需要關注此字段。
union { uint32_t rss; /**< RSS hash result if RSS enabled */ struct { RTE_STD_C11 union { struct { uint16_t hash; uint16_t id; }; uint32_t lo; /**< Second 4 flexible bytes */ }; uint32_t hi; /**< First 4 flexible bytes or FD ID, dependent on PKT_RX_FDIR_* flag in ol_flags. */ } fdir; /**< Filter identifier if FDIR enabled */ struct { uint32_t lo; uint32_t hi; } sched; /**< Hierarchical scheduler */ uint32_t usr; /**< User defined tags. See rte_distributor_process() */ } hash; /**< hash information */
哈希數據。這裏是一個union。當RSS開啓時,對應rss字段是哈希結果。學習RSS時,關注一下。
/** Outer VLAN TCI (CPU order), valid if PKT_RX_QINQ_STRIPPED is set. */ uint16_t vlan_tci_outer;
只有開啓了QINQ剝離時,此字段有效。外部vlan相關。
uint16_t buf_len; /**< Length of segment buffer. */
mbuf和priv以後內存的長度,包含headroom。
/** Valid if PKT_RX_TIMESTAMP is set. The unit and time reference * are not normalized but are always the same for a given port. */ uint64_t timestamp;
時間戳。設置了PKT_RX_TIMESTAMP標記時,此字段有效。單位和時間參考未標準化,但對於給定端口始終相同。
/* second cache line - fields only used in slow path or on TX */ MARKER cacheline1 __rte_cache_min_aligned;
第二個cacheline,這部分內容僅用在慢速路徑(slow path)或者發包流程中。
RTE_STD_C11 union { void *userdata; /**< Can be used for external metadata */ uint64_t udata64; /**< Allow 8-byte userdata on 32-bit */ }; //#define RTE_STD_C11 __extension__
__extension__不是字段,而是編譯器(GCC)的關鍵字,用於抑制編譯器在嚴格標準模式(如-pedantic)下對擴展語法(這裏是匿名union)給出的告警。
這裏是一個union,
其中userdata指針可以用來存放額外的元數據。
udata64,使得在32位平臺上也可以存放8字節的用戶數據。
struct rte_mempool *pool; /**< Pool from which mbuf was allocated. */
標識本mbuf是從哪一個rte_mempool池子中申請到的。也就是該mbuf是哪一個rte_mempool池子的。
struct rte_mbuf *next; /**< Next segment of scattered packet. */
在分段(scattered)報文中,指向下一個段所在的mbuf。
/* fields to support TX offloads */ /* 用於支持發包硬件卸載的字段 */ RTE_STD_C11 union { uint64_t tx_offload; /**< combined for easy fetch */ /* tx_offload 組合起來,方便取用 */ __extension__ struct { uint64_t l2_len:7; /**< L2 (MAC) Header Length for non-tunneling pkt. * Outer_L4_len + ... + Inner_L2_len for tunneling pkt. */ uint64_t l3_len:9; /**< L3 (IP) Header Length. */ uint64_t l4_len:8; /**< L4 (TCP/UDP) Header Length. */ uint64_t tso_segsz:16; /**< TCP TSO segment size */ /* TSO(TCP Segment Offload)是一種利用網卡的少許處理能力, 下降CPU發送數據包負載的技術,須要網卡硬件及驅動的支持。 */ /* fields for TX offloading of tunnels */ uint64_t outer_l3_len:9; /**< Outer L3 (IP) Hdr Length. */ uint64_t outer_l2_len:7; /**< Outer L2 (MAC) Hdr Length. */ /* uint64_t unused:8; */ }; };
支持硬件發包卸載的字段內容。內部爲一個union。其中tx_offload字段與後面的各個位域共用同一塊64位存儲,是爲了能夠一次性讀寫所有位域而設置的。
/** Size of the application private data. In case of an indirect * mbuf, it stores the direct mbuf private data size. */ uint16_t priv_size;
應用程序私有數據的大小。
在indirect mbuf 的情況下,它存儲direct mbuf 私有數據大小。 關於direct mbuf與indirect mbuf的區別,參考鏈接:
10. Mbuf Library — Data Plane Development Kit 21.08.0-rc1 documentation (dpdk.org)
/** Timesync flags for use with IEEE1588. */ /* IEEE1588 協議,又稱 PTP( precise time protocol,精確時間協議), * 能夠達到亞微秒級別時間同步精度,於 2002 年發佈 version 1, * 2008 年發佈 version 2。 */ uint16_t timesync;
時間同步。參考IEEE1588。
/** Sequence number. See also rte_reorder_insert(). */ uint32_t seqn;
序列號。這個是哪裏用到呢?從註釋來看,它與rte_reorder_insert()有關,也就是用於報文重排序(reorder)。
rte_mbuf的數據結構學習完畢。有一些遺留的問題,後續來完善。