icmp協議相比於tcp,udp有其獨特性,它介於網絡層和傳輸層之間,它沒有傳輸層的源目的端口。因此在建立鏈接跟蹤時需要進行特殊處理。還有ICMP屬於差錯報文,並非全部icmp報文都是成對出現的,這些不同造成了icmp的處理與tcp,udp處理的不同。
icmp報文有以下種類:
最多有18種icmp報文,每一種icmp報文可能會有一些子類。只有下面四種icmp報文是成對出現的。
/*
 * ICMP types that may open a new conntrack entry: the request half of the
 * four request/reply pairs. Every slot not named here is implicitly 0.
 */
static const u_int8_t valid_new[] = {
	[ICMP_ECHO] = 1,
	[ICMP_TIMESTAMP] = 1,
	[ICMP_INFO_REQUEST] = 1,
	[ICMP_ADDRESS] = 1
};

/*
 * Pairing table: invmap[type] holds the counterpart type plus one.
 *
 * Add 1; spaces filled with 0.
 * The +1 bias keeps 0 free to mean "this type has no paired message".
 * Without it the ICMP_ECHO slot would hold ICMP_ECHOREPLY, whose value is 0,
 * and would be indistinguishable from an unfilled slot. The bias is removed
 * again when the inverted tuple is built (see icmp_invert_tuple).
 */
static const u_int8_t invmap[] = {
	[ICMP_ECHO] = ICMP_ECHOREPLY + 1,
	[ICMP_ECHOREPLY] = ICMP_ECHO + 1,
	[ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1,
	[ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1,
	[ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1,
	[ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1,
	[ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1,
	[ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1
};
由於只有這四對消息是成對的,因此鏈接跟蹤只會爲這四對消息進行鏈接跟蹤。
icmp報文沒有源目的端口,採用什麼來填充tuple呢?
從下面代碼可以看出,鏈接跟蹤使用一個__be16 id來替代tuple中的源port,這裏的id是icmp報文中的標識符,這四類消息都有,其中ping消息通常在其中填充ping程序的pid,因此同一臺設備啓動兩個不同的ping程序ping同一個ip會生成兩個會話:
/* The protocol-specific manipulable parts of the tuple: always in
 * network order
 *
 * One 16-bit slot viewed per protocol; `all` gives protocol-agnostic
 * access to the same storage.
 */
union nf_conntrack_man_proto {
	/* Add other protocols here. */
	__be16 all;

	struct {
		__be16 port;
	} tcp;
	struct {
		__be16 port;
	} udp;
	struct {
		/* ICMP has no ports; this slot carries the ICMP identifier. */
		__be16 id;
	} icmp;
	struct {
		__be16 port;
	} dccp;
	struct {
		__be16 port;
	} sctp;
	struct {
		__be16 key;	/* GRE key is 32bit, PPtP only uses 16bit */
	} gre;
};
從下面代碼可以看出,鏈接跟蹤使用u_int8_t type, code;來替代tuple中的目的port:
/* This contains the information to distinguish a connection. */
struct nf_conntrack_tuple {
	/* Source side of the tuple. */
	struct nf_conntrack_man src;

	/* These are the parts of the tuple which are fixed. */
	struct {
		union nf_inet_addr u3;
		union {
			/* Add other protocols here. */
			__be16 all;

			struct {
				__be16 port;
			} tcp;
			struct {
				__be16 port;
			} udp;
			struct {
				/* ICMP stores type and code where TCP/UDP
				 * store the destination port.
				 */
				u_int8_t type, code;
			} icmp;
			struct {
				__be16 port;
			} dccp;
			struct {
				__be16 port;
			} sctp;
			struct {
				__be16 key;
			} gre;
		} u;

		/* The protocol. */
		u_int8_t protonum;

		/* The direction (for tuplehash) */
		u_int8_t dir;
	} dst;
};
下面咱們看一下,icmp如何求一個tuple的反轉tuple,是否像tcp,udp那樣將源目的端口調換呢?
/* 反轉五元組 */ static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig) { if (orig->dst.u.icmp.type >= sizeof(invmap) ||//判斷類型是否超過了最大值,非法 !invmap[orig->dst.u.icmp.type])//判斷該類型的icmp消息是不是成對的,使用0表示不成對,不成對則不處理。 return false; //id依然填寫到id的位置,沒有被調換到type,code位置 tuple->src.u.icmp.id = orig->src.u.icmp.id; //只是替換了type到其對應的type。 tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1;//這裏減了1,由於invmap中都加了1 //code不會變。由於這四對消息的code只有一個值0。詳細請看前面的圖片。 tuple->dst.u.icmp.code = orig->dst.u.icmp.code; return true; }
從上面可以看出,icmp求反轉tuple時,只會將type替換成對應的type(咱們這裏不涉及IP地址)
icmp更多的是差錯報文,它是用來通知源主機的一些錯誤信息的。它的產生往往是某臺設備發送的報文在傳輸過程中出現了差錯,在傳輸路徑中設備或者目標主機設備檢測到了差錯,從而生成一個ICMP差錯報文通知源主機。差錯報文會在icmp報文頭後添加一段導致該icmp報文的原始報文頭信息。因此說icmp差錯報文是一個鏈接的附屬,鏈接跟蹤將差錯報文視爲一個子鏈接(不會真實建立CT,而是依附於主鏈接,設置該報文的狀態爲IP_CT_RELATED或者IP_CT_RELATED_REPLY)。
鏈接跟蹤處理以下幾種差錯報文:
ICMP_DEST_UNREACH //目的不可達 ICMP_SOURCE_QUENCH //源抑制,向源主機發送源抑制報文通知源主機減慢發送速度 ICMP_TIME_EXCEEDED //TTL超時, ICMP_PARAMETERPROB //參數問題, ICMP_REDIRECT //重定向,收到該差錯的主機須要更新路由的下一跳,或者鄰居(直連主機)
鏈接跟蹤對於這幾種差錯報文,須要正確交給目標主機。處理的主要緣由是NAT,後續詳細說明。
const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp = { .l3proto = PF_INET, .l4proto = IPPROTO_ICMP, .pkt_to_tuple = icmp_pkt_to_tuple, .invert_tuple = icmp_invert_tuple, .packet = icmp_packet, .get_timeouts = icmp_get_timeouts, .new = icmp_new, .error = icmp_error, .destroy = NULL, .me = NULL, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = icmp_tuple_to_nlattr, .nlattr_tuple_size = icmp_nlattr_tuple_size, .nlattr_to_tuple = icmp_nlattr_to_tuple, .nla_policy = icmp_nla_policy, #endif #if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) .ctnl_timeout = { .nlattr_to_obj = icmp_timeout_nlattr_to_obj, .obj_to_nlattr = icmp_timeout_obj_to_nlattr, .nlattr_max = CTA_TIMEOUT_ICMP_MAX, .obj_size = sizeof(unsigned int), .nla_policy = icmp_timeout_nla_policy, }, #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ .init_net = icmp_init_net, .get_net_proto = icmp_get_net_proto, };
/* Small and modified version of icmp_rcv */ /* 用於在鏈接跟蹤中處理報文錯誤,tmpl通常爲NULL */ static int icmp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, u8 pf, unsigned int hooknum) { const struct icmphdr *icmph; struct icmphdr _ih; /* Not enough header? icmp頭是否完整 */ icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih); if (icmph == NULL) { icmp_error_log(skb, net, pf, "short packet"); return -NF_ACCEPT; } /* See ip_conntrack_proto_tcp.c */ /* 檢驗校驗碼 */ if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&//本機發送的不檢查 nf_ip_checksum(skb, hooknum, dataoff, 0)) { icmp_error_log(skb, net, pf, "bad hw icmp checksum"); return -NF_ACCEPT; } /* * 18 is the highest 'known' ICMP type. Anything else is a mystery * * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently * discarded. * 類型是否非法。 */ if (icmph->type > NR_ICMP_TYPES) { icmp_error_log(skb, net, pf, "invalid icmp type"); return -NF_ACCEPT; } /* Need to track icmp error message? */ /* 非差錯報文直接檢查經過 */ if (icmph->type != ICMP_DEST_UNREACH && icmph->type != ICMP_SOURCE_QUENCH && icmph->type != ICMP_TIME_EXCEEDED && icmph->type != ICMP_PARAMETERPROB && icmph->type != ICMP_REDIRECT) return NF_ACCEPT; //處理icmp差錯報文 return icmp_error_message(net, tmpl, skb, hooknum); } /* Returns conntrack if it dealt with ICMP, and filled in skb fields */ /* icmp差錯報文處理,主要是根據內層攜帶的原始報文頭找到對應的主鏈接。 ** 而後設置該報文依附於主連接,是一個RELATE報文 */ static int icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, unsigned int hooknum) { struct nf_conntrack_tuple innertuple, origtuple; const struct nf_conntrack_l4proto *innerproto; const struct nf_conntrack_tuple_hash *h; const struct nf_conntrack_zone *zone; enum ip_conntrack_info ctinfo; struct nf_conntrack_zone tmp; WARN_ON(skb_nfct(skb)); zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); /* Are they talking about one of our connections? 
*/ /* 根據內層報文信息獲取對應的五元組到origtuple中 */ if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb) + ip_hdrlen(skb) + sizeof(struct icmphdr), PF_INET, net, &origtuple)) { pr_debug("icmp_error_message: failed to get tuple\n"); return -NF_ACCEPT; } /* rcu_read_lock()ed by nf_hook_thresh */ /* 獲取內層報文的傳輸層控制塊 */ innerproto = __nf_ct_l4proto_find(PF_INET, origtuple.dst.protonum); /* Ordinarily, we'd expect the inverted tupleproto, but it's been preserved inside the ICMP. ** 獲取內層報文的反向五元組 */ if (!nf_ct_invert_tuple(&innertuple, &origtuple, &nf_conntrack_l3proto_ipv4, innerproto)) { pr_debug("icmp_error_message: no match\n"); return -NF_ACCEPT; } //設置報文的狀態爲子鏈接,這是報文的狀態。 ctinfo = IP_CT_RELATED; //根據反向五元組獲取對應的主CT。爲何是反向呢? //由於icmp報文是對源報文的一個響應,因此應該根據源報文的信息去獲取其所屬鏈接。 h = nf_conntrack_find_get(net, zone, &innertuple); if (!h) { pr_debug("icmp_error_message: no match\n"); return -NF_ACCEPT; } //若是是應答方向,則設置其狀態爲 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) ctinfo += IP_CT_IS_REPLY; /* Update skb to refer to this connection */ /* 將該報文關聯到 主CT ,其狀態爲IP_CT_RELATED or IP_CT_RELATED_REPLY*/ nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo); /* 修改內部的 */ return NF_ACCEPT; } //差錯報文返回NF_ACCEPT後,由於設置了報文的CT,報文的鏈接跟蹤處理就結束了。詳細狀況nf_conntrack_in函數。
/* 提取icmp的五元組,只有成對報文才會 */ static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, struct nf_conntrack_tuple *tuple) { const struct icmphdr *hp; struct icmphdr _hdr; hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (hp == NULL) return false; tuple->dst.u.icmp.type = hp->type;/* 類型 */ tuple->src.u.icmp.id = hp->un.echo.id;/* id號,ping報文爲進程id */ tuple->dst.u.icmp.code = hp->code;/* 代碼,通常爲0 */ return true; }
/* 反轉五元組 */ static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig) { if (orig->dst.u.icmp.type >= sizeof(invmap) ||//判斷類型是否超過了最大值,非法 !invmap[orig->dst.u.icmp.type])//判斷該類型的icmp消息是不是成對的,使用0表示不成對,不成對則不處理。 return false; //id依然填寫到id的位置,沒有被調換到type,code位置 tuple->src.u.icmp.id = orig->src.u.icmp.id; //只是替換了type到其對應的type。 tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1;//這裏減了1,由於invmap中都加了1 //code不會變。由於這四對消息的code只有一個值0。詳細請看前面的圖片。 tuple->dst.u.icmp.code = orig->dst.u.icmp.code; return true; }
非差錯報文的請求方向報文會被該函數處理,主要是進行合法性校驗:只有valid_new中列出的四種請求類型才允許新建鏈接跟蹤。icmp_error函數只檢查了類型是否超出範圍並處理差錯報文,並不區分請求與應答,所以這裏的檢查並不多餘。
/* Called when a new connection for this protocol found. */ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, unsigned int *timeouts) { static const u_int8_t valid_new[] = {/* 共18個元素,其中只有下面四個icmp請求會進行鏈接跟蹤 */ [ICMP_ECHO] = 1, [ICMP_TIMESTAMP] = 1, [ICMP_INFO_REQUEST] = 1, [ICMP_ADDRESS] = 1 }; if (ct->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) || !valid_new[ct->tuplehash[0].tuple.dst.u.icmp.type]) { /* Can't create a new ICMP `conn' with this. */ pr_debug("icmp: can't create new conn with type %u\n", ct->tuplehash[0].tuple.dst.u.icmp.type); nf_ct_dump_tuple_ip(&ct->tuplehash[0].tuple); return false; } return true; }
非差錯報文的應答方向報文會被該函數處理,主要是進行超時更新和報文統計。
/* Returns verdict for packet, or -1 for invalid. */
/*
 * Per-packet ICMP conntrack handler: refreshes the connection with the
 * supplied timeout (with accounting) and accepts the packet.
 */
static int icmp_packet(struct nf_conn *ct,
		       const struct sk_buff *skb,
		       unsigned int dataoff,
		       enum ip_conntrack_info ctinfo,
		       unsigned int *timeout)
{
	const unsigned int refresh_timeout = *timeout;

	/* Do not immediately delete the connection after the first
	 * successful reply to avoid excessive conntrackd traffic
	 * and also to handle correctly ICMP echo reply duplicates.
	 */
	nf_ct_refresh_acct(ct, ctinfo, skb, refresh_timeout);

	return NF_ACCEPT;
}
icmp 鏈接跟蹤超時時間獲取,通常是30秒。
/*
 * Default ICMP conntrack timeout: 30 seconds. Defined ahead of
 * icmp_init_net() so it is declared before its only user in this listing.
 */
static const unsigned int nf_ct_icmp_timeout = 30*HZ;

/* Return a pointer to the ICMP conntrack timeout of this network namespace. */
static unsigned int *icmp_get_timeouts(struct net *net)
{
	return &icmp_pernet(net)->timeout;
}

/*
 * Per-namespace init: seed the ICMP timeout with the default and set up the
 * sysctl table copy. Returns whatever icmp_kmemdup_sysctl_table() returns.
 */
static int icmp_init_net(struct net *net, u_int16_t proto)
{
	struct nf_icmp_net *in = icmp_pernet(net);
	struct nf_proto_net *pn = &in->pn;

	in->timeout = nf_ct_icmp_timeout;

	return icmp_kmemdup_sysctl_table(pn, in);
}
icmp報文對nat的支持實際更多的是網絡層的支持,對於icmp報文自己來講只有一個標識符能夠改變,不過不多有場景要改變標識符的。下面就代碼簡單的分析一下.
const struct nf_nat_l4proto nf_nat_l4proto_icmp = { .l4proto = IPPROTO_ICMP, .manip_pkt = icmp_manip_pkt, .in_range = icmp_in_range, .unique_tuple = icmp_unique_tuple, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, #endif };
判斷icmp的標識符是否在指定的範圍中。
static bool icmp_in_range(const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype, const union nf_conntrack_man_proto *min, const union nf_conntrack_man_proto *max) { return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) && ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id); }
分配一個標識符,使得五元組惟一。
/*
 * Pick an ICMP identifier for the tuple so that the tuple is not already
 * in use, writing the choice into tuple->src.u.icmp.id.
 *
 * Candidates are min id + (id % range_size), where `id` is a function-static
 * counter that keeps its value across calls. The search stops at the first
 * candidate nf_nat_used_tuple() reports as free, or after range_size
 * attempts; in the latter case the last candidate tried is left in the
 * tuple even though it was reported as used.
 */
static void icmp_unique_tuple(const struct nf_nat_l3proto *l3proto,
			      struct nf_conntrack_tuple *tuple,
			      const struct nf_nat_range *range,
			      enum nf_nat_manip_type maniptype,
			      const struct nf_conn *ct)
{
	/* Shared across calls; NOTE(review): no locking is visible here. */
	static u_int16_t id;
	unsigned int range_size;
	unsigned int i;

	/* Number of identifiers in the inclusive [min, max] range. */
	range_size = ntohs(range->max_proto.icmp.id) -
		     ntohs(range->min_proto.icmp.id) + 1;
	/* If no range specified... fall back to a range size of 0xFFFF. */
	if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED))
		range_size = 0xFFFF;

	/* `id` only advances when a candidate was rejected; returning from
	 * inside the body skips the ++id step.
	 */
	for (i = 0; ; ++id) {
		tuple->src.u.icmp.id = htons(ntohs(range->min_proto.icmp.id) +
					     (id % range_size));
		if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
			return;
	}
	return;
}
將選擇的標識符替換掉原來的標識符,更新校驗碼。
static bool icmp_manip_pkt(struct sk_buff *skb, const struct nf_nat_l3proto *l3proto, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { struct icmphdr *hdr; if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct icmphdr *)(skb->data + hdroff); inet_proto_csum_replace2(&hdr->checksum, skb, hdr->un.echo.id, tuple->src.u.icmp.id, false); hdr->un.echo.id = tuple->src.u.icmp.id; return true; }
差錯報文的內層報文信息來自於產生差錯的報文。當一個主機發送一個報文通過NAT後,其報文頭發生了改變。也就是說,檢測到該報文有差錯的設備看到的報文是通過NAT後的報文,因此NAT需要將內層報文還原回原來的報文再轉發給源主機。
unsigned int nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, unsigned int (*do_chain)(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct)) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; struct nf_conn_nat *nat; /* maniptype == SRC for postrouting. */ enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook); ct = nf_ct_get(skb, &ctinfo); /* Can't track? It's not due to stress, or conntrack would * have dropped it. Hence it's the user's responsibilty to * packet filter it out, or implement conntrack/NAT for that * protocol. 8) --RR */ if (!ct) return NF_ACCEPT; nat = nfct_nat(ct); switch (ctinfo) { case IP_CT_RELATED://對於icmp差錯報文,會爲這兩個狀態 case IP_CT_RELATED_REPLY: //icmp報文特殊處理,這種狀態的報文是一個icmp差錯報文。 //根據其所屬的原始報文決定其所屬的ct。對icmp攜帶的原始報文部分進行 //相應操做。 if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { //既會對內層報文進行nat,也會對外層報文進行nat,這裏處理完畢後就返回了。 if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, state->hook)) return NF_DROP; else return NF_ACCEPT; } ... 
} int nf_nat_icmp_reply_translation(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum) { struct { struct icmphdr icmp; struct iphdr ip; } *inside; enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); enum nf_nat_manip_type manip = HOOK2MANIP(hooknum); unsigned int hdrlen = ip_hdrlen(skb); const struct nf_nat_l4proto *l4proto; struct nf_conntrack_tuple target; unsigned long statusbit; WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY); if (!skb_make_writable(skb, hdrlen + sizeof(*inside))) return 0; if (nf_ip_checksum(skb, hooknum, hdrlen, 0)) return 0; //獲取icmp報文頭起始地址 inside = (void *)skb->data + hdrlen; if (inside->icmp.type == ICMP_REDIRECT) {//重定向差錯報恩。 if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK) return 0; if (ct->status & IPS_NAT_MASK) return 0; } if (manip == NF_NAT_MANIP_SRC) statusbit = IPS_SRC_NAT; else statusbit = IPS_DST_NAT; /* Invert if this is reply direction */ /* 應答方向進行求反 */ if (dir == IP_CT_DIR_REPLY) statusbit ^= IPS_NAT_MASK; //若是主鏈接沒有該nat操做,退出。 if (!(ct->status & statusbit)) return 1; //獲取內層報文的傳輸層操做控制塊 l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, inside->ip.protocol); //進行內層報文nat處理。包括傳輸層和網絡層 if (!nf_nat_ipv4_manip_pkt(skb, hdrlen + sizeof(inside->icmp), l4proto, &ct->tuplehash[!dir].tuple, !manip)) return 0; //更新icmp校驗碼 if (skb->ip_summed != CHECKSUM_PARTIAL) { /* Reloading "inside" here since manip_pkt may reallocate */ inside = (void *)skb->data + hdrlen; inside->icmp.checksum = 0; inside->icmp.checksum = csum_fold(skb_checksum(skb, hdrlen, skb->len - hdrlen, 0)); } /* Change outer to look like the reply to an incoming packet */ //進行外層報文的nat處理 nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, 0); if (!nf_nat_ipv4_manip_pkt(skb, 0, l4proto, &target, manip)) return 0; return 1; }