netdev_rx_queue represents a receive queue. Many NICs already support multiple hardware queues, in which case there are several netdev_rx_queue instances; the structure hangs off the net_device. The receive queues are initialized by netif_alloc_rx_queues:
static int netif_alloc_rx_queues(struct net_device *dev)
{
	/* number of receive queues of this device */
	unsigned int i, count = netdev_extended(dev)->rps_data.num_rx_queues;
	struct netdev_rx_queue *rx;

	BUG_ON(count < 1);

	/* allocate the netdev_rx_queue array */
	rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
	if (!rx) {
		pr_err("netdev: Unable to allocate %u rx queues.\n", count);
		return -ENOMEM;
	}
	/* link the netdev_rx_queue array to the net_device */
	netdev_extended(dev)->rps_data._rx = rx;

	/* point each netdev_rx_queue back at its net_device */
	for (i = 0; i < count; i++)
		rx[i].dev = dev;
	return 0;
}

struct netdev_rx_queue {
	/* the RPS map of this queue */
	struct rps_map *rps_map;
	/* each device queue keeps one rps_dev_flow_table */
	struct rps_dev_flow_table *rps_flow_table;
	/* the corresponding kobject */
	struct kobject kobj;
	/* the net_device this queue belongs to */
	struct net_device *dev;
} ____cacheline_aligned_in_smp;

struct rps_map {
	/* number of entries in the cpus[] array */
	unsigned int len;
	struct rcu_head rcu;
	/* the CPU ids this queue may steer packets to */
	u16 cpus[0];
};
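The rps_map of a queue is filled in from user space: writing a CPU bitmask to the per-queue rps_cpus file in sysfs makes the kernel build the map shown above. Below is a minimal user-space sketch; the interface name eth0, the queue index rx-0 and the mask value 0xf are assumptions for illustration and need adjusting to the actual system.

/* Minimal user-space sketch: enable RPS on one queue by writing a CPU
 * bitmask to its rps_cpus file. Interface name, queue index and mask are
 * example assumptions; run as root. */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *path = "/sys/class/net/eth0/queues/rx-0/rps_cpus";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror("fopen");
		return EXIT_FAILURE;
	}
	/* 0xf: allow steering to CPUs 0-3 */
	fprintf(f, "f\n");
	fclose(f);
	return EXIT_SUCCESS;
}

The per-queue rps_map and rps_flow_table set up here are consulted on the receive path by get_rps_cpu, shown next.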
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
		       struct rps_dev_flow **rflowp)
{
	struct ipv6hdr *ip6;
	struct iphdr *ip;
	struct netdev_rx_queue *rxqueue;
	struct rps_map *map;
	struct rps_dev_flow_table *flow_table;
	struct rps_sock_flow_table *sock_flow_table;
	struct netdev_rps_info *rpinfo = &netdev_extended(dev)->rps_data;
	int cpu = -1;
	int tcpu;
	u8 ip_proto;
	u32 addr1, addr2, ports, ihl;

	rcu_read_lock();

	if (skb_rx_queue_recorded(skb)) {
		/* look up the device rx queue this skb was received on */
		u16 index = skb_get_rx_queue(skb);
		if (unlikely(index >= rpinfo->num_rx_queues)) {
			WARN_ONCE(rpinfo->num_rx_queues > 1, "%s received packet "
				"on queue %u, but number of RX queues is %u\n",
				dev->name, index, rpinfo->num_rx_queues);
			goto done;
		}
		rxqueue = rpinfo->_rx + index;
	} else
		rxqueue = rpinfo->_rx;

	if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
		goto done;

	if (skb->rxhash)
		/* the hardware already computed a hash, no need to redo it */
		goto got_hash;

	/* Skip hash computation on packet header */
	switch (skb->protocol) {
	/* extract source and destination addresses per IP version */
	case __constant_htons(ETH_P_IP):
		if (!pskb_may_pull(skb, sizeof(*ip)))
			goto done;

		ip = (struct iphdr *) skb->data;
		ip_proto = ip->protocol;
		addr1 = ip->saddr;
		addr2 = ip->daddr;
		ihl = ip->ihl;
		break;
	case __constant_htons(ETH_P_IPV6):
		if (!pskb_may_pull(skb, sizeof(*ip6)))
			goto done;

		ip6 = (struct ipv6hdr *) skb->data;
		ip_proto = ip6->nexthdr;
		addr1 = ip6->saddr.s6_addr32[3];
		addr2 = ip6->daddr.s6_addr32[3];
		ihl = (40 >> 2);
		break;
	default:
		goto done;
	}
	ports = 0;
	switch (ip_proto) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_DCCP:
	case IPPROTO_ESP:
	case IPPROTO_AH:
	case IPPROTO_SCTP:
	case IPPROTO_UDPLITE:
		if (pskb_may_pull(skb, (ihl * 4) + 4))
			/* grab the layer-4 ports: the first 4 bytes after the
			 * IP header are the source and destination ports */
			ports = *((u32 *) (skb->data + (ihl * 4)));
		break;

	default:
		break;
	}

	/* compute the flow hash from source/destination address and ports */
	skb->rxhash = jhash_3words(addr1, addr2, ports, hashrnd) >> 16;
	if (!skb->rxhash)
		skb->rxhash = 1;

got_hash:
	/*
	 * rps_sock_flow_table and rps_dev_flow_table are the two tables added
	 * for RFS. rps_sock_flow_table is a global hash table keyed by
	 * socket: it maps a flow to the CPU on which the application expects
	 * the softirq to run. rps_dev_flow_table is per device queue and
	 * records the CPU that processed the previous skb of the same flow;
	 * each entry holds a CPU id and a tail-queue counter.
	 */
	flow_table = rcu_dereference(rxqueue->rps_flow_table);
	sock_flow_table = rcu_dereference(rps_sock_flow_table);
	if (flow_table && sock_flow_table) {
		u16 next_cpu;
		struct rps_dev_flow *rflow;

		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
		tcpu = rflow->cpu;

		next_cpu = sock_flow_table->ents[skb->rxhash &
		    sock_flow_table->mask];

		/*
		 * Two CPUs are extracted from the two flow tables: tcpu, the
		 * CPU this flow's previous packet was steered to (device flow
		 * table), and next_cpu, the CPU the application expects
		 * (socket flow table). The skb is steered to next_cpu if:
		 *  1. tcpu is unset (equal to RPS_NO_CPU),
		 *  2. tcpu is offline, or
		 *  3. tcpu's input_queue_head has advanced past last_qtail in
		 *     rps_flow_table, meaning every packet this entry
		 *     enqueued has already been drained.
		 * Otherwise tcpu keeps being used to avoid reordering.
		 *
		 * If the desired CPU (where last recvmsg was done) is
		 * different from current CPU (one in the rx-queue flow
		 * table entry), switch if one of the following holds:
		 *   - Current CPU is unset (equal to RPS_NO_CPU).
		 *   - Current CPU is offline.
		 *   - The current CPU's queue tail has advanced beyond the
		 *     last packet that was enqueued using this table entry.
		 *     This guarantees that all previous packets for the flow
		 *     have been dequeued, thus preserving in order delivery.
		 */
		if (unlikely(tcpu != next_cpu) &&
		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
		      rflow->last_qtail)) >= 0)) {
			tcpu = rflow->cpu = next_cpu;
			if (tcpu != RPS_NO_CPU)
				rflow->last_qtail =
				    per_cpu(softnet_data, tcpu).input_queue_head;
		}
		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
			*rflowp = rflow;
			/* return the chosen CPU */
			cpu = tcpu;
			goto done;
		}
	}

	/*
	 * For the first packet of a flow tcpu is RPS_NO_CPU and next_cpu is
	 * RPS_NO_CPU as well, so the RFS path above is skipped and plain RPS
	 * is used instead.
	 */
	map = rcu_dereference(rxqueue->rps_map);
	if (map) {
		tcpu = map->cpus[((u32) (skb->rxhash * map->len)) >> 16];

		/* use the computed CPU only if it is online */
		if (cpu_online(tcpu)) {
			cpu = tcpu;
			goto done;
		}
	}

done:
	rcu_read_unlock();
	return cpu;
}
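The plain-RPS fallback at the end of get_rps_cpu turns the flow hash into an index into map->cpus[] with a fixed-point multiply and shift: since the rxhash in this code has already been shifted down to 16 bits, (u32)(rxhash * len) >> 16 always yields an index below len, so the hash space is spread evenly over the CPUs in the map. A small stand-alone sketch of that arithmetic (the CPU list and hash values are made-up example data):

/* User-space illustration of the RPS "hash -> CPU" selection used above:
 * index = (hash * len) >> 16, where hash is a 16-bit flow hash.
 * The CPU list and hash values are arbitrary example data. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint16_t cpus[] = { 0, 2, 4, 6 };	/* example rps_map contents */
	uint32_t len = sizeof(cpus) / sizeof(cpus[0]);
	uint32_t hashes[] = { 0x0123, 0x7fff, 0xcafe, 0xffff };

	for (unsigned i = 0; i < sizeof(hashes) / sizeof(hashes[0]); i++) {
		uint32_t idx = (uint32_t)(hashes[i] * len) >> 16;
		printf("hash 0x%04x -> map index %u -> CPU %u\n",
		       (unsigned)hashes[i], (unsigned)idx, (unsigned)cpus[idx]);
	}
	return 0;
}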
enqueue_to_backlog hangs the skb on the chosen CPU's input queue. It takes an skb and a CPU as parameters and uses the CPU to decide how the skb is handled: either append it to that CPU's input_pkt_queue, or schedule the backlog softirq.

static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
			      unsigned int *qtail)
{
	struct softnet_data *queue;
	unsigned long flags;

	/* get the softnet_data of the CPU passed in */
	queue = &per_cpu(softnet_data, cpu);

	local_irq_save(flags);
	__get_cpu_var(netdev_rx_stat).total++;

	spin_lock(&queue->input_pkt_queue.lock);
	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
		if (queue->input_pkt_queue.qlen) {
enqueue:
			/* append the packet to that CPU's input_pkt_queue */
			__skb_queue_tail(&queue->input_pkt_queue, skb);
			*qtail = queue->input_queue_head +
			    queue->input_pkt_queue.qlen;
			spin_unlock_irqrestore(&queue->input_pkt_queue.lock,
			    flags);
			return NET_RX_SUCCESS;
		}

		/* Schedule NAPI for backlog device (may raise the softirq) */
		if (napi_schedule_prep(&queue->backlog)) {
			if (cpu != smp_processor_id()) {
				/* the skb belongs to another CPU: record it
				 * in the remote-softirq mask and raise the
				 * softirq so that CPU gets kicked later */
				struct rps_remote_softirq_cpus *rcpus =
				    &__get_cpu_var(rps_remote_softirq_cpus);

				cpu_set(cpu, rcpus->mask[rcpus->select]);
				__raise_softirq_irqoff(NET_RX_SOFTIRQ);
			} else
				/* the skb belongs to the current CPU:
				 * schedule the backlog NAPI directly */
				____napi_schedule(queue, &queue->backlog);
		}
		goto enqueue;
	}

	spin_unlock(&queue->input_pkt_queue.lock);

	__get_cpu_var(netdev_rx_stat).dropped++;
	local_irq_restore(flags);

	kfree_skb(skb);
	return NET_RX_DROP;
}
On the RFS side, the socket flow table is updated whenever the application reads from a socket; inet_recvmsg records the current CPU for the flow before handing off to the protocol's recvmsg:

int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
		 size_t size, int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int err;

	/* record this flow in the global RFS socket flow table */
	inet_rps_record_flow(sk);

	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
				   flags & ~MSG_DONTWAIT, &addr_len);
	if (err >= 0)
		msg->msg_namelen = addr_len;
	return err;
}
inet_rps_record_flow obtains the global rps_sock_flow_table and calls rps_record_sock_flow to update it, passing the socket's sk_rxhash in as the hash index. sk_rxhash is simply the rxhash from the skb, i.e. the flow hash that RPS computed over the 4-tuple. Using it as the index means packets of the same socket always fall on the same entry, and it also makes the table cheap to consult from softirq context.
static inline void inet_rps_record_flow(struct sock *sk)
{
	struct rps_sock_flow_table *sock_flow_table;

	rcu_read_lock();
	sock_flow_table = rcu_dereference(rps_sock_flow_table);
	rps_record_sock_flow(sock_flow_table, inet_sk_rxhash(sk));
	rcu_read_unlock();
}
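For this lookup to work, sk_rxhash must hold the same flow hash that RPS stored in skb->rxhash, so the receive path copies the hash into the socket when packets are delivered to it. The stand-alone sketch below models that interplay with simplified stand-in structures (they are not the kernel definitions): the hash is saved on the receive side, then used, as rps_record_sock_flow does, to record the CPU that called recvmsg.

/* User-space model of how the socket remembers the RPS flow hash and how
 * recvmsg() records its CPU under that hash. Simplified stand-ins only. */
#include <stdint.h>
#include <stdio.h>

struct fake_skb  { uint32_t rxhash; };
struct fake_sock { uint32_t sk_rxhash; };

/* On packet delivery: remember the flow hash in the socket. */
static void save_rxhash(struct fake_sock *sk, const struct fake_skb *skb)
{
	if (sk->sk_rxhash != skb->rxhash)
		sk->sk_rxhash = skb->rxhash;
}

/* On recvmsg(): record the caller's CPU under hash & mask, as
 * rps_record_sock_flow() does. */
static void record_flow(uint16_t *ents, uint32_t mask,
			const struct fake_sock *sk, uint16_t cpu)
{
	if (sk->sk_rxhash)
		ents[sk->sk_rxhash & mask] = cpu;
}

int main(void)
{
	uint16_t ents[256] = { 0 };		/* table with mask = 255 */
	struct fake_skb  skb = { .rxhash = 0xbeef };
	struct fake_sock sk  = { 0 };

	save_rxhash(&sk, &skb);			/* receive path */
	record_flow(ents, 255, &sk, 2);		/* recvmsg() on CPU 2 */
	printf("entry %u -> CPU %u\n",
	       (unsigned)(sk.sk_rxhash & 255),
	       (unsigned)ents[sk.sk_rxhash & 255]);
	return 0;
}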
static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
					u32 hash)
{
	if (table && hash) {
		/* index into the table with the flow hash */
		unsigned int cpu, index = hash & table->mask;

		/* We only give a hint, preemption can change cpu under us */
		cpu = raw_smp_processor_id();

		/* record the current CPU; skip the write if it is already set */
		if (table->ents[index] != cpu)
			table->ents[index] = cpu;
	}
}
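None of the RFS paths above do anything until the two flow tables are sized from user space: the global rps_sock_flow_table via /proc/sys/net/core/rps_sock_flow_entries and each queue's rps_dev_flow_table via the per-queue rps_flow_cnt file in sysfs. A minimal sketch follows; the interface name eth0, queue rx-0 and the entry counts are assumptions for illustration.

/* Minimal user-space sketch: size the global RFS socket flow table and one
 * queue's device flow table. Interface, queue and counts are example
 * assumptions; run as root. */
#include <stdio.h>
#include <stdlib.h>

static int write_value(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fprintf(f, "%s\n", val);
	fclose(f);
	return 0;
}

int main(void)
{
	/* global rps_sock_flow_table size */
	if (write_value("/proc/sys/net/core/rps_sock_flow_entries", "32768"))
		return EXIT_FAILURE;
	/* per-queue rps_dev_flow_table size */
	if (write_value("/sys/class/net/eth0/queues/rx-0/rps_flow_cnt", "2048"))
		return EXIT_FAILURE;
	return EXIT_SUCCESS;
}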
Figure: kernel code flow.