在 ip_local_deliver 中,如果檢測到是分片包,則需要將報文進行重組。其所有的分片被重新組合後才能提交到上層協議,每個被重新組合的數據報文用 ipq 結構實例來表示。
/*
 * Per-datagram IPv4 reassembly state.  Embeds the generic fragment queue
 * (struct inet_frag_queue) and adds the IPv4-specific lookup key and
 * bookkeeping fields.
 */
struct ipq {
	struct inet_frag_queue q;

	u32 user;		/* origin of the fragments (defrag user id) */
	__be32 saddr;		/* source address */
	__be32 daddr;		/* destination address */
	__be16 id;		/* IP datagram identification */
	u8 protocol;		/* upper-layer protocol number */
	/*
	 * saddr, daddr, id and protocol are copied from the IP header and
	 * identify which datagram a fragment belongs to.
	 */
	u8 ecn;			/* RFC3168 support */
	u16 max_df_size;	/* largest frag with DF set seen */
	int iif;
	int vif;		/* L3 master device index */
	unsigned int rid;	/* counter of fragments received (per original note) */
	struct inet_peer *peer;	/* information about the sender */
	/*
	 * NOTE(review): the original author states rid and peer are used to
	 * mitigate DoS attacks; the code doing so is not visible here.
	 */
};
分段管理結構(inet_frags 與 netns_frags):
/*
 * Protocol-wide fragment management state: the hash table of fragment
 * queues, the deferred-work item used for eviction/rehash, and the
 * per-protocol callbacks installed at init time.
 */
struct inet_frags {
	struct inet_frag_bucket hash[INETFRAGS_HASHSZ];	/* hash buckets of queues */

	struct work_struct frags_work;	/* deferred work item */
	unsigned int next_bucket;
	unsigned long last_rebuild_jiffies;
	bool rebuild;

	/* The first call to hashfn is responsible to initialize
	 * rnd. This is best done with net_get_random_once.
	 *
	 * rnd_seqlock is used to let hash insertion detect
	 * when it needs to re-lookup the hash chain to use.
	 */
	u32 rnd;			/* random hash seed */
	seqlock_t rnd_seqlock;
	int qsize;			/* size of one queue object */

	unsigned int (*hashfn)(const struct inet_frag_queue *);
	/* Returns true when queue q matches the lookup key arg. */
	bool (*match)(const struct inet_frag_queue *q,
		      const void *arg);
	void (*constructor)(struct inet_frag_queue *q,
			    const void *arg);
	void (*destructor)(struct inet_frag_queue *);
	void (*frag_expire)(unsigned long data);	/* queue expiry handler */
	struct kmem_cache *frags_cachep;
	const char *frags_cache_name;
};
/*
 * Per-network-namespace fragment accounting and tunables.
 *
 * The annotations after timeout/high_thresh/low_thresh were bare text in
 * the original listing; they are proper comments here so the definition
 * is valid C.
 */
struct netns_frags {
	/* The percpu_counter "mem" need to be cacheline aligned.
	 *  mem.count must not share cacheline with other writers
	 */
	struct percpu_counter mem ____cacheline_aligned_in_smp;

	/* sysctls */
	int timeout;		/* reassembly timeout */
	int high_thresh;	/* upper bound on fragment memory use */
	int low_thresh;		/* lower bound on fragment memory use */
	int max_dist;
};
/** * struct inet_frag_queue - fragment queue * * @lock: spinlock protecting the queue * @timer: queue expiration timer * @list: hash bucket list * @refcnt: reference count of the queue * @fragments: received fragments head * @fragments_tail: received fragments tail * @stamp: timestamp of the last received fragment * @len: total length of the original datagram * @meat: length of received fragments so far * @flags: fragment queue flags * @max_size: maximum received fragment size * @net: namespace that this frag belongs to * @list_evictor: list of queues to forcefully evict (e.g. due to low memory) */ struct inet_frag_queue {//inet分段隊列頭 spinlock_t lock;smp環境下 須要 struct timer_list timer;隊列定時器,組裝很是耗時,不能無休止的等待分片的到達 struct hlist_node list;哈希節點,鏈入inet分段管理結構的哈希隊列 atomic_t refcnt;計數器 struct sk_buff *fragments;分段數據包隊列 struct sk_buff *fragments_tail; ktime_t stamp;時間戳 int len;數據包結束位置offset+len int meat;與原數據長度的差距,若是和原數據包長度同樣表明接收完成 __u8 flags; u16 max_size; struct netns_frags *net;指向網絡空寂分段管理結構 struct hlist_node list_evictor; };
1.1、IP 分組的初始化
void __init ipfrag_init(void) { ip4_frags_ctl_register(); register_pernet_subsys(&ip4_frags_ops);//向內核註冊ipv4分段管理函數 ip4_frags.hashfn = ip4_hashfn;//設置計算hash的函數 //設置初始化ip 分段隊列的構造函數 ip4_frags.constructor = ip4_frag_init; //析構函數 ip4_frags.destructor = ip4_frag_free; //隊列機構長度 ip4_frags.qsize = sizeof(struct ipq); //對比ip分段隊列hook ip4_frags.match = ip4_frag_match; //設置分段隊列過時處理函數 ip4_frags.frag_expire = ip_expire; ip4_frags.frags_cache_name = ip_frag_cache_name; if (inet_frags_init(&ip4_frags)) panic("IP: failed to allocate ip4_frags cache\n"); } int inet_frags_init(struct inet_frags *f) { int i; //初始化工做隊列 INIT_WORK(&f->frags_work, inet_frag_worker); for (i = 0; i < INETFRAGS_HASHSZ; i++) { struct inet_frag_bucket *hb = &f->hash[i];//初始化hash 隊列頭 spin_lock_init(&hb->chain_lock); INIT_HLIST_HEAD(&hb->chain); } seqlock_init(&f->rnd_seqlock); f->last_rebuild_jiffies = 0; f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0, NULL); if (!f->frags_cachep) return -ENOMEM; return 0; } EXPORT_SYMBOL(inet_frags_init);
int ip_local_deliver(struct sk_buff *skb) { /* * Reassemble IP fragments. */ struct net *net = dev_net(skb->dev); /* 分片重組 */ if (ip_is_fragment(ip_hdr(skb))) { if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER)) return 0; } /* 通過LOCAL_IN鉤子點 */ return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, net, NULL, skb, skb->dev, NULL, ip_local_deliver_finish); }
1.2、IP 分片報文重組的處理
/*
 * ip_defrag - process an incoming IP datagram fragment.
 * @net:  network namespace
 * @skb:  the fragment
 * @user: identifies who requested reassembly
 *
 * Looks up (or creates) the reassembly queue for the fragment and queues
 * the fragment on it under the queue lock.
 *
 * Returns whatever ip_frag_queue() returns, or -ENOMEM (after freeing the
 * skb and bumping REASMFAILS) when no queue could be obtained.
 */
int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
{
	struct net_device *dev = skb->dev ? skb->dev : skb_dst(skb)->dev;
	int vif = l3mdev_master_ifindex_rcu(dev);
	struct ipq *qp;
	int ret;

	/* Count the reassembly request. */
	__IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS);
	skb_orphan(skb);

	/* Lookup (or create) queue header */
	qp = ip_find(net, ip_hdr(skb), user, vif);
	if (!qp) {
		/* No queue available: count the failure and drop. */
		__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
		kfree_skb(skb);
		return -ENOMEM;
	}

	/* Insert the fragment into the queue. */
	spin_lock(&qp->q.lock);
	ret = ip_frag_queue(qp, skb);
	spin_unlock(&qp->q.lock);

	ipq_put(qp);
	return ret;
}
EXPORT_SYMBOL(ip_defrag);
1.2.2 ip_find:根據 IP 首部以及 user 標誌,在 ipq 散列表中查找對應的 ipq。
/* Find the correct entry in the "incomplete datagrams" queue for * this IP datagram, and create new one, if nothing is found. enum ip_defrag_users { IP_DEFRAG_LOCAL_DELIVER, IP_DEFRAG_CALL_RA_CHAIN, IP_DEFRAG_CONNTRACK_IN, __IP_DEFRAG_CONNTRACK_IN_END = IP_DEFRAG_CONNTRACK_IN + USHRT_MAX, IP_DEFRAG_CONNTRACK_OUT, __IP_DEFRAG_CONNTRACK_OUT_END = IP_DEFRAG_CONNTRACK_OUT + USHRT_MAX, IP_DEFRAG_CONNTRACK_BRIDGE_IN, __IP_DEFRAG_CONNTRACK_BRIDGE_IN = IP_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX, IP_DEFRAG_VS_IN, IP_DEFRAG_VS_OUT, IP_DEFRAG_VS_FWD, IP_DEFRAG_AF_PACKET, IP_DEFRAG_MACVLAN, }; */ static struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user, int vif) { struct inet_frag_queue *q; struct ip4_create_arg arg; unsigned int hash; /* 記錄ip頭和輸入信息 */ arg.iph = iph; arg.user = user; arg.vif = vif; /* 經過id,源地址,目的地址,協議計算hash */ hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); /* 根據hash值查找或建立隊列 */ q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash); if (IS_ERR_OR_NULL(q)) { inet_frag_maybe_warn_overflow(q, pr_fmt()); return NULL; } return container_of(q, struct ipq, q); } struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, struct inet_frags *f, void *key, unsigned int hash) { struct inet_frag_bucket *hb; struct inet_frag_queue *q; int depth = 0; /* 分片內存已經超過了低限 */ if (frag_mem_limit(nf) > nf->low_thresh) /* 進行節點回收 */ inet_frag_schedule_worker(f); //工做隊列回調函數爲inet_frag_worker hash &= (INETFRAGS_HASHSZ - 1); hb = &f->hash[hash]; /* 找到hash桶 */ spin_lock(&hb->chain_lock); hlist_for_each_entry(q, &hb->chain, list) { /* 遍歷鏈表 */ if (q->net == nf && f->match(q, key)) { atomic_inc(&q->refcnt); /* 增長引用計數 */ spin_unlock(&hb->chain_lock); return q; } depth++;/* 記錄查找深度 */ } spin_unlock(&hb->chain_lock); /* 未找到 */ /* 桶節點的鏈表深度不超過限定 */ if (depth <= INETFRAGS_MAXDEPTH) return inet_frag_create(nf, f, key);/* 建立節點返回 */ if (inet_frag_may_rebuild(f)) { /* 若是已經超過了重建間隔時間,則重建 */ if (!f->rebuild) f->rebuild = true; inet_frag_schedule_worker(f); } return 
ERR_PTR(-ENOBUFS); } EXPORT_SYMBOL(inet_frag_find);
如果查找不到,則會創建一個 ipq 並將其插入鏈表中。
/*
 * Allocate a new fragment queue and insert it into the hash table.
 * Returns NULL when allocation fails, otherwise the queue returned by
 * inet_frag_intern() (which may be a pre-existing matching queue).
 */
static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
						struct inet_frags *f,
						void *arg)
{
	struct inet_frag_queue *q;

	/* Allocate and initialise the queue head. */
	q = inet_frag_alloc(nf, f, arg);
	if (!q)
		return NULL;

	return inet_frag_intern(nf, q, f, arg);
}

/*
 * Allocate and initialise a queue object from f->frags_cachep.
 * Refuses (and schedules the worker) when the namespace is above
 * high_thresh.  The returned queue has refcnt 1.
 */
static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
					       struct inet_frags *f,
					       void *arg)
{
	struct inet_frag_queue *q;

	/* Over the high watermark: ask for reclaim and refuse the request. */
	if (frag_mem_limit(nf) > nf->high_thresh) {
		inet_frag_schedule_worker(f);
		return NULL;
	}

	q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
	if (!q)
		return NULL;

	/* Remember which namespace's fragment state owns this queue. */
	q->net = nf;
	/* Protocol constructor installed at init time (ip4_frag_init for IPv4). */
	f->constructor(q, arg);
	/* Charge the queue object itself to the namespace's fragment memory. */
	add_frag_mem_limit(nf, f->qsize);

	/*
	 * Bind the expiry callback and its argument to the timer; the timer
	 * is armed later via mod_timer() in inet_frag_intern().
	 */
	setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
	spin_lock_init(&q->lock);
	atomic_set(&q->refcnt, 1);

	return q;
}

/*
 * IPv4 queue constructor: copies the lookup key (taken from the IP header
 * and the create argument) into the ipq that embeds @q.
 */
static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
{
	/* Recover the enclosing ipq, netns_ipv4 and net from embedded members. */
	struct ipq *qp = container_of(q, struct ipq, q);
	struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
					       frags);
	struct net *net = container_of(ipv4, struct net, ipv4);

	const struct ip4_create_arg *arg = a;	/* IPv4 lookup key */

	qp->protocol = arg->iph->protocol;	/* protocol from the IP header */
	qp->id = arg->iph->id;			/* IP identification */
	qp->ecn = ip4_frag_ecn(arg->iph->tos);
	qp->saddr = arg->iph->saddr;
	qp->daddr = arg->iph->daddr;
	qp->vif = arg->vif;
	qp->user = arg->user;
	/* Sender information is only looked up when max_dist is non-zero. */
	qp->peer = q->net->max_dist ?
		inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) :
		NULL;
}

/*
 * Insert @qp_in into its hash bucket and arm its timer.
 *
 * On SMP builds the chain is re-checked first: if an equivalent queue is
 * already present, a reference to it is returned and @qp_in is marked
 * complete and released.  The bucket lock obtained from
 * get_frag_bucket_locked() is dropped on every return path.
 */
static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
						struct inet_frag_queue *qp_in,
						struct inet_frags *f,
						void *arg)
{
	struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f);
	struct inet_frag_queue *qp;

#ifdef CONFIG_SMP
	/* With SMP race we have to recheck hash table, because
	 * such entry could have been created on other cpu before
	 * we acquired hash bucket lock.
	 */
	hlist_for_each_entry(qp, &hb->chain, list) {
		if (qp->net == nf && f->match(qp, arg)) {
			atomic_inc(&qp->refcnt);
			spin_unlock(&hb->chain_lock);
			qp_in->flags |= INET_FRAG_COMPLETE;
			inet_frag_put(qp_in, f);
			return qp;
		}
	}
#endif
	qp = qp_in;
	/* NOTE(review): extra reference presumably held on behalf of the timer. */
	if (!mod_timer(&qp->timer, jiffies + nf->timeout))
		atomic_inc(&qp->refcnt);

	/* Reference for the hash chain, then link the queue into the bucket. */
	atomic_inc(&qp->refcnt);
	hlist_add_head(&qp->list, &hb->chain);

	spin_unlock(&hb->chain_lock);

	return qp;
}
1.2.3 將分片數據包加入重組數據包(ip_frag_queue)
/* Add new segment to existing queue.
 *
 * @qp:  reassembly queue
 * @skb: fragment to insert; freed on the error path
 *
 * Validates the fragment's position, trims it so it does not overlap its
 * neighbours, links it into the offset-ordered fragment list and updates
 * the queue accounting.  When both the first and last fragments have been
 * seen and meat == len, reassembly is attempted.
 *
 * Returns the result of ip_frag_reasm() when reassembly is attempted,
 * -EINPROGRESS when the fragment was queued, or a negative errno when the
 * fragment was dropped.
 */
static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{
	struct sk_buff *prev, *next;
	struct net_device *dev;
	unsigned int fragsize;
	int flags, offset;
	int ihl, end;
	int err = -ENOENT;
	u8 ecn;

	/* The queue is already marked complete: just drop this fragment. */
	if (qp->q.flags & INET_FRAG_COMPLETE)
		goto err;

	/*
	 * Unless the skb carries IPSKB_FRAG_COMPLETE, check whether the
	 * queue has fallen too far behind (ip_frag_too_far) and, if so, try
	 * to reinitialise it.  If reinitialisation fails the queue is killed.
	 * NOTE(review): the original author describes this as a DoS check on
	 * fragments that were not generated locally.
	 */
	if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
	    unlikely(ip_frag_too_far(qp)) &&
	    unlikely(err = ip_frag_reinit(qp))) {
		/*
		 * NOTE(review): per the original note, ipq_kill() unhashes
		 * the queue, stops its timer and marks it complete so that
		 * it can be released.
		 */
		ipq_kill(qp);
		goto err;
	}

	ecn = ip4_frag_ecn(ip_hdr(skb)->tos);
	offset = ntohs(ip_hdr(skb)->frag_off);
	flags = offset & ~IP_OFFSET;
	offset &= IP_OFFSET;
	offset <<= 3;		/* offset is in 8-byte chunks */
	ihl = ip_hdrlen(skb);
	/* flags, fragment offset and header length are now extracted. */

	/* Determine the position of this fragment. */
	end = offset + skb->len - skb_network_offset(skb) - ihl;
	err = -EINVAL;

	/* Is this the final fragment? */
	if ((flags & IP_MF) == 0) {
		/* If we already have some bits beyond end
		 * or have different end, the segment is corrupted.
		 */
		if (end < qp->q.len ||
		    ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len))
			goto err;
		/* Checks passed: record LAST_IN and the full datagram length. */
		qp->q.flags |= INET_FRAG_LAST_IN;
		qp->q.len = end;
	} else {
		/* Non-final fragments must end on an 8-byte boundary. */
		if (end&7) {
			end &= ~7;
			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
				skb->ip_summed = CHECKSUM_NONE;
		}
		if (end > qp->q.len) {
			/* Some bits beyond end -> corruption. */
			if (qp->q.flags & INET_FRAG_LAST_IN)
				goto err;
			/* Extend the known end position. */
			qp->q.len = end;
		}
	}
	/* Zero-length payload. */
	if (end == offset)
		goto err;

	err = -ENOMEM;
	/* Strip everything up to and including the IP header. */
	if (!pskb_pull(skb, skb_network_offset(skb) + ihl))
		goto err;

	/* Trim the skb to the payload length, end - offset. */
	err = pskb_trim_rcsum(skb, end - offset);
	if (err)
		goto err;

	/* Find out which fragments are in front and at the back of us
	 * in the chain of fragments so far.  We must know where to put
	 * this fragment, right?
	 */
	prev = qp->q.fragments_tail;
	if (!prev || FRAG_CB(prev)->offset < offset) {
		next = NULL;
		goto found;
	}
	prev = NULL;
	/*
	 * Fragments may arrive in any order; the list is kept sorted by
	 * fragment offset, so walk it to find the insertion point.
	 */
	for (next = qp->q.fragments; next != NULL; next = next->next) {
		if (FRAG_CB(next)->offset >= offset)
			break;	/* bingo! */
		prev = next;
	}

found:
	/* We found where to put this one.  Check for overlap with
	 * preceding fragment, and, if needed, align things so that
	 * any overlaps are eliminated.
	 */
	if (prev) {
		int i = (FRAG_CB(prev)->offset + prev->len) - offset;

		if (i > 0) {
			/* Overlap of i bytes: drop them from our head. */
			offset += i;
			err = -EINVAL;
			if (end <= offset)
				goto err;
			err = -ENOMEM;
			if (!pskb_pull(skb, i))
				goto err;
			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
				skb->ip_summed = CHECKSUM_NONE;
		}
	}

	err = -ENOMEM;

	/*
	 * Resolve overlap with the following fragments.  A partially
	 * overlapped successor has its head trimmed and the loop stops; a
	 * fully covered successor is unlinked and freed, and the check
	 * continues with the one after it.  meat and the memory accounting
	 * are adjusted as we go.
	 */
	while (next && FRAG_CB(next)->offset < end) {
		int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */

		if (i < next->len) {
			/* Eat head of the next overlapped fragment
			 * and leave the loop. The next ones cannot overlap.
			 */
			if (!pskb_pull(next, i))
				goto err;
			FRAG_CB(next)->offset += i;
			qp->q.meat -= i;
			if (next->ip_summed != CHECKSUM_UNNECESSARY)
				next->ip_summed = CHECKSUM_NONE;
			break;
		} else {
			struct sk_buff *free_it = next;

			/* Old fragment is completely overridden with
			 * new one drop it.
			 */
			next = next->next;

			if (prev)
				prev->next = next;
			else
				qp->q.fragments = next;

			qp->q.meat -= free_it->len;
			sub_frag_mem_limit(qp->q.net, free_it->truesize);
			kfree_skb(free_it);
		}
	}

	/* Record the (possibly adjusted) offset of this fragment. */
	FRAG_CB(skb)->offset = offset;

	/* Insert this fragment in the chain of fragments. */
	skb->next = next;
	if (!next)
		qp->q.fragments_tail = skb;
	if (prev)
		prev->next = skb;
	else
		qp->q.fragments = skb;

	dev = skb->dev;
	if (dev) {
		qp->iif = dev->ifindex;
		skb->dev = NULL;
	}
	qp->q.stamp = skb->tstamp;	/* refresh the queue timestamp */
	qp->q.meat += skb->len;		/* total payload bytes received so far */
	qp->ecn |= ecn;
	/* Charge the fragment to the namespace's fragment memory. */
	add_frag_mem_limit(qp->q.net, skb->truesize);
	/* Offset zero means this is the first fragment. */
	if (offset == 0)
		qp->q.flags |= INET_FRAG_FIRST_IN;

	fragsize = skb->len + ihl;

	if (fragsize > qp->q.max_size)
		qp->q.max_size = fragsize;

	if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
	    fragsize > qp->max_df_size)
		qp->max_df_size = fragsize;

	/* First and last seen and no holes left: reassemble now. */
	if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
	    qp->q.meat == qp->q.len) {
		unsigned long orefdst = skb->_skb_refdst;

		skb->_skb_refdst = 0UL;
		err = ip_frag_reasm(qp, prev, dev);
		skb->_skb_refdst = orefdst;
		return err;
	}

	skb_dst_drop(skb);
	return -EINPROGRESS;

err:
	kfree_skb(skb);
	return err;
}
ip_frag_reasm:重組報文。
* Build a new IP datagram from all its fragments. */ /* *用於組裝已到齊的全部分片,當原始 * 數據包的全部分片都已到齊時,會調用此函 * 數組裝分片。 */ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, struct net_device *dev) { struct net *net = container_of(qp->q.net, struct net, ipv4.frags); struct iphdr *iph; struct sk_buff *fp, *head = qp->q.fragments; int len; int ihlen; int err; u8 ecn; /* * 要開始組裝了,所以調用ipq_kill()將此ipq結點從 * ipq散列表刪除,並刪除定時器。 */ ipq_kill(qp); ecn = ip_frag_ecn_table[qp->ecn]; if (unlikely(ecn == 0xff)) { err = -EINVAL; goto out_fail; } /* Make the one we just received the head. */ if (prev) { head = prev->next; fp = skb_clone(head, GFP_ATOMIC); if (!fp) goto out_nomem; fp->next = head->next; if (!fp->next) qp->q.fragments_tail = fp; prev->next = fp; skb_morph(head, qp->q.fragments); head->next = qp->q.fragments->next; consume_skb(qp->q.fragments); qp->q.fragments = head; } WARN_ON(!head); WARN_ON(FRAG_CB(head)->offset != 0); /* Allocate a new buffer for the datagram. 計算原始報文的長度 超過64 KB*/ ihlen = ip_hdrlen(head); len = ihlen + qp->q.len; err = -E2BIG; if (len > 65535) goto out_oversize; /* Head of list must not be cloned. * 在組裝分片時,全部的分片都會組裝到第一個分片 * 上,所以第一個分片是不能克隆的,若是是克隆的, * 則需爲分片組裝從新分配一個SKB。 */ if (skb_unclone(head, GFP_ATOMIC)) goto out_nomem; /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. 
*/ /* * 分片隊列的第一個SKB不能既帶有數據,又帶有分片,即其 * frag_list上不能有分片skb,若是有則從新分配一個SKB。最終的 * 效果是,head自身不包括數據,其frag_list上連接着全部分片的 * SKB。這也是SKB的一種表現形式,不必定是一個連續的數據塊, * 但最終會調用skb_linearize()將這些數據都複製到一個連續的數據 * 塊中。 */ if (skb_has_frag_list(head)) { struct sk_buff *clone; int i, plen = 0; clone = alloc_skb(0, GFP_ATOMIC); if (!clone) goto out_nomem; clone->next = head->next; head->next = clone; skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; skb_frag_list_init(head); for (i = 0; i < skb_shinfo(head)->nr_frags; i++) plen += skb_frag_size(&skb_shinfo(head)->frags[i]); clone->len = clone->data_len = head->data_len - plen; head->data_len -= clone->len; head->len -= clone->len; clone->csum = 0; clone->ip_summed = head->ip_summed; add_frag_mem_limit(qp->q.net, clone->truesize); } /* * 把全部分片組裝起來即將分片連接到第一個 * SKB的frag_list上,同時還須要遍歷全部分片, * 從新計算IP數據包長度以及校驗和等。 */ skb_shinfo(head)->frag_list = head->next; skb_push(head, head->data - skb_network_header(head)); for (fp=head->next; fp; fp = fp->next) { head->data_len += fp->len; head->len += fp->len; if (head->ip_summed != fp->ip_summed) head->ip_summed = CHECKSUM_NONE; else if (head->ip_summed == CHECKSUM_COMPLETE) head->csum = csum_add(head->csum, fp->csum); head->truesize += fp->truesize; } /* * 重置首部長度、片偏移、標誌位和總長度。 */ sub_frag_mem_limit(qp->q.net, head->truesize); head->next = NULL; head->dev = dev; head->tstamp = qp->q.stamp; IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size); iph = ip_hdr(head); iph->tot_len = htons(len); iph->tos |= ecn; /* When we set IP_DF on a refragmented skb we must also force a * call to ip_fragment to avoid forwarding a DF-skb of size s while * original sender only sent fragments of size f (where f < s). * * We only set DF/IPSKB_FRAG_PMTU if such DF fragment was the largest * frag seen to avoid sending tiny DF-fragments in case skb was built * from one very small df-fragment and one large non-df frag. 
*/ if (qp->max_df_size == qp->q.max_size) { IPCB(head)->flags |= IPSKB_FRAG_PMTU; iph->frag_off = htons(IP_DF); } else { iph->frag_off = 0; } ip_send_check(iph); __IP_INC_STATS(net, IPSTATS_MIB_REASMOKS); /* * 既然各分片都已處理完,釋放ipq的分片隊列。 */ qp->q.fragments = NULL; qp->q.fragments_tail = NULL; return 0; out_nomem: net_dbg_ratelimited("queue_glue: no memory for gluing queue %p\n", qp); err = -ENOMEM; goto out_fail; out_oversize: net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr); out_fail: __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); return err; }
/*
 * Pick a new hash seed and move every queue to the bucket its new hash
 * selects.  Runs under the rnd_seqlock write side; does nothing if
 * inet_frag_may_rebuild() says a rebuild is not allowed.
 */
static void inet_frag_secret_rebuild(struct inet_frags *f)
{
	int i;

	write_seqlock_bh(&f->rnd_seqlock);	/* seqlock, write side */

	if (!inet_frag_may_rebuild(f))
		goto out;

	/* Fetch a fresh random value for the hash computation. */
	get_random_bytes(&f->rnd, sizeof(u32));

	for (i = 0; i < INETFRAGS_HASHSZ; i++) {
		struct inet_frag_bucket *hb;
		struct inet_frag_queue *q;
		struct hlist_node *n;

		hb = &f->hash[i];	/* current bucket */
		spin_lock(&hb->chain_lock);

		hlist_for_each_entry_safe(q, n, &hb->chain, list) {
			unsigned int hval = inet_frag_hashfn(f, q);

			/* The queue no longer belongs in this bucket. */
			if (hval != i) {
				struct inet_frag_bucket *hb_dest;

				/* Unlink from the current bucket. */
				hlist_del(&q->list);

				/* Relink to new hash chain. */
				hb_dest = &f->hash[hval];

				/* This is the only place where we take
				 * another chain_lock while already holding
				 * one.  As this will not run concurrently,
				 * we cannot deadlock on hb_dest lock below, if its
				 * already locked it will be released soon since
				 * other caller cannot be waiting for hb lock
				 * that we've taken above.
				 */
				spin_lock_nested(&hb_dest->chain_lock,
						 SINGLE_DEPTH_NESTING);
				/* Add the queue to the destination chain. */
				hlist_add_head(&q->list, &hb_dest->chain);
				spin_unlock(&hb_dest->chain_lock);
			}
		}
		spin_unlock(&hb->chain_lock);
	}

	/* Clear the rebuild request and record when the rebuild happened. */
	f->rebuild = false;
	f->last_rebuild_jiffies = jiffies;
out:
	write_sequnlock_bh(&f->rnd_seqlock);
}
內核會定時清除在規定時間內沒有完成重組的 ipq 及其所有的分片。
/*
 * Oops, a fragment queue timed out.  Kill it and send an ICMP reply.
 *
 * @arg: the struct inet_frag_queue pointer, cast to unsigned long
 *
 * Also invoked directly by the eviction path through f->frag_expire; in
 * that case (inet_frag_evicting() is true) no ICMP error is generated.
 */
static void ip_expire(unsigned long arg)
{
	struct ipq *qp;
	struct net *net;

	qp = container_of((struct inet_frag_queue *) arg, struct ipq, q);
	net = container_of(qp->q.net, struct net, ipv4.frags);

	spin_lock(&qp->q.lock);

	/* Already complete: nothing to do besides dropping our reference. */
	if (qp->q.flags & INET_FRAG_COMPLETE)
		goto out;

	/* NOTE(review): per the original note, removes the queue from the hash table. */
	ipq_kill(qp);
	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);	/* statistics */

	/* Only when the queue is NOT being evicted: count the timeout and consider ICMP. */
	if (!inet_frag_evicting(&qp->q)) {
		struct sk_buff *head = qp->q.fragments;
		const struct iphdr *iph;
		int err;

		__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);

		/* Without the first fragment there is no header to reply to. */
		if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments)
			goto out;

		rcu_read_lock();
		head->dev = dev_get_by_index_rcu(net, qp->iif);
		if (!head->dev)
			goto out_rcu_unlock;

		/* skb has no dst, perform route lookup again */
		iph = ip_hdr(head);
		err = ip_route_input_noref(head, iph->daddr, iph->saddr,
					   iph->tos, head->dev);
		if (err)
			goto out_rcu_unlock;

		/* Only an end host needs to send an ICMP
		 * "Fragment Reassembly Timeout" message, per RFC792.
		 */
		if (frag_expire_skip_icmp(qp->user) &&
		    (skb_rtable(head)->rt_type != RTN_LOCAL))
			goto out_rcu_unlock;

		/* Send an ICMP "Fragment Reassembly Timeout" message. */
		icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
out_rcu_unlock:
		rcu_read_unlock();
	}
out:
	spin_unlock(&qp->q.lock);
	ipq_put(qp);
}
爲了控制 IP 組裝所佔用的內存,設置了兩個閾值 low_thresh、high_thresh;當前 ipq 散列表所佔用的內存存儲在 mem 變量中。這些變量保存在如下結構(netns_frags)中:
/*
 * Per-network-namespace fragment accounting and tunables.
 */
struct netns_frags {
	/* The percpu_counter "mem" need to be cacheline aligned.
	 *  mem.count must not share cacheline with other writers
	 */
	struct percpu_counter mem ____cacheline_aligned_in_smp;

	/* sysctls */
	int timeout;		/* reassembly timeout; added to jiffies when arming the queue timer */
	int high_thresh;	/* above this, new queue allocations are refused */
	int low_thresh;		/* above this, the eviction worker is scheduled */
	int max_dist;		/* when zero, no inet_peer is looked up for new IPv4 queues */
};
當 mem 大於 high_thresh 時,需要對散列表進行清理,直到 mem 值下降到 low_thresh。這兩個值可以通過 proc 修改。
static unsigned int inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb) { struct inet_frag_queue *fq; struct hlist_node *n; unsigned int evicted = 0; HLIST_HEAD(expired); spin_lock(&hb->chain_lock); /* 遍歷桶下的鏈表 */ hlist_for_each_entry_safe(fq, n, &hb->chain, list) { if (!inet_fragq_should_evict(fq))/* 未超過限定,無需回收 */ continue; if (!del_timer(&fq->timer)) /* 定時器沒法刪除 */ continue; /* 可以回收的節點加入到臨時hash */ hlist_add_head(&fq->list_evictor, &expired); ++evicted; } spin_unlock(&hb->chain_lock); /* 依次調用回收函數進行回收 */ hlist_for_each_entry_safe(fq, n, &expired, list_evictor) f->frag_expire((unsigned long) fq); return evicted; } static void inet_frag_worker(struct work_struct *work) { /* 本次回收的桶節點數 */ unsigned int budget = INETFRAGS_EVICT_BUCKETS; unsigned int i, evicted = 0; struct inet_frags *f; f = container_of(work, struct inet_frags, frags_work); BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ); local_bh_disable(); /* 從上次回收完的下一個節點開始,進行回收 */ for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) { evicted += inet_evict_bucket(f, &f->hash[i]); /* 回收並統計回收數量 */ i = (i + 1) & (INETFRAGS_HASHSZ - 1); /* 回收節點數超過最大值,中止 */ if (evicted > INETFRAGS_EVICT_MAX) break; } f->next_bucket = i; /* 記錄下次須要開始回收的桶節點 */ local_bh_enable(); /* 若是須要重建,則重建 */ if (f->rebuild && inet_frag_may_rebuild(f)) inet_frag_secret_rebuild(f); }