RPS/RFS Code Analysis in the Linux Kernel

struct netdev_rx_queue represents a receive queue. Many NICs already support multiple hardware queues, in which case a device has multiple netdev_rx_queue instances. The structure hangs off the net_device, and the receive queues are allocated by netif_alloc_rx_queues:

netif_alloc_rx_queues

static int netif_alloc_rx_queues(struct net_device *dev)
{
    /* Number of receive queues on this device */
    unsigned int i, count = netdev_extended(dev)->rps_data.num_rx_queues;
    struct netdev_rx_queue *rx;

    BUG_ON(count < 1);

    /* Allocate the array of netdev_rx_queue structures */
    rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
    if (!rx) {
        pr_err("netdev: Unable to allocate %u rx queues.\n", count);
        return -ENOMEM;
    }

    /* Attach the netdev_rx_queue array to the net_device */
    netdev_extended(dev)->rps_data._rx = rx;

    /* Point each queue back at its owning net_device */
    for (i = 0; i < count; i++)
        rx[i].dev = dev;
    return 0;
}
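
At this point each queue's rps_map is still NULL; a map is only attached when a CPU mask is written to /sys/class/net/<dev>/queues/rx-<n>/rps_cpus. The snippet below is a minimal sketch of that build step (the helper name build_rps_map is invented for illustration; locking, RCU replacement of the old map and error handling are left out):

static struct rps_map *build_rps_map(const struct cpumask *mask)
{
    struct rps_map *map;
    int cpu, i = 0;

    /* One u16 slot per CPU that is set in the mask */
    map = kzalloc(sizeof(*map) + cpumask_weight(mask) * sizeof(u16),
                  GFP_KERNEL);
    if (!map)
        return NULL;

    for_each_cpu(cpu, mask)
        map->cpus[i++] = cpu;
    map->len = i;

    return map;   /* the caller publishes it with rcu_assign_pointer() */
}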

struct netdev_rx_queue {
    /* RPS map of this queue */
    struct rps_map *rps_map;
    /* Each device queue keeps its own rps_dev_flow_table (used by RFS) */
    struct rps_dev_flow_table *rps_flow_table;
    /* Associated kobject (exposed through sysfs) */
    struct kobject kobj;
    /* Owning net_device */
    struct net_device *dev;
} ____cacheline_aligned_in_smp;


struct rps_map {
    /* Number of entries in the cpus[] array below */
    unsigned int len;
    struct rcu_head rcu;
    /* IDs of the CPUs that packets of this queue may be steered to */
    u16 cpus[0];
};
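
get_rps_cpu() below also dereferences the per-queue rps_dev_flow_table and its rps_dev_flow entries. For reference, here they are simplified to just the fields this article uses (the real definitions carry additional RCU/cleanup members, and in mainline the per-queue table size is set through /sys/class/net/<dev>/queues/rx-<n>/rps_flow_cnt):

struct rps_dev_flow {
    u16 cpu;                  /* CPU that last handled this flow */
    unsigned int last_qtail;  /* input queue tail when it was last enqueued there */
};

struct rps_dev_flow_table {
    unsigned int mask;        /* number of entries - 1, used to mask the rxhash */
    struct rps_dev_flow flows[0];
};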

get_rps_cpu

static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
         struct rps_dev_flow **rflowp)
{
    struct ipv6hdr *ip6;
    struct iphdr *ip;
    struct netdev_rx_queue *rxqueue;
    struct rps_map *map;
    struct rps_dev_flow_table *flow_table;
    struct rps_sock_flow_table *sock_flow_table;
    struct netdev_rps_info *rpinfo = &netdev_extended(dev)->rps_data;
    int cpu = -1;
    int tcpu;
    u8 ip_proto;
    u32 addr1, addr2, ports, ihl;

    rcu_read_lock();

    if (skb_rx_queue_recorded(skb)) {
        /* Look up the rx queue this skb was received on */
        u16 index = skb_get_rx_queue(skb);
        if (unlikely(index >= rpinfo->num_rx_queues)) {
            WARN_ONCE(rpinfo->num_rx_queues > 1, "%s received packet "
                "on queue %u, but number of RX queues is %u\n",
                dev->name, index, rpinfo->num_rx_queues);
            goto done;
        }
        rxqueue = rpinfo->_rx + index;
    } else
        rxqueue = rpinfo->_rx;

    if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
        goto done;

    if (skb->rxhash) /* Hash already provided (e.g. by the NIC), no need to recompute it */
        goto got_hash; /* Skip hash computation on packet header */

    switch (skb->protocol) { /* Extract source and destination addresses according to the L3 protocol */
    case __constant_htons(ETH_P_IP):
        if (!pskb_may_pull(skb, sizeof(*ip)))
            goto done;

        ip = (struct iphdr *) skb->data;
        ip_proto = ip->protocol;
        addr1 = ip->saddr;
        addr2 = ip->daddr;
        ihl = ip->ihl;
        break;
    case __constant_htons(ETH_P_IPV6):
        if (!pskb_may_pull(skb, sizeof(*ip6)))
            goto done;

        ip6 = (struct ipv6hdr *) skb->data;
        ip_proto = ip6->nexthdr;
        addr1 = ip6->saddr.s6_addr32[3];
        addr2 = ip6->daddr.s6_addr32[3];
        ihl = (40 >> 2);
        break;
    default:
        goto done;
    }
    ports = 0;
    switch (ip_proto) {
    case IPPROTO_TCP:
    case IPPROTO_UDP:
    case IPPROTO_DCCP:
    case IPPROTO_ESP:
    case IPPROTO_AH:
    case IPPROTO_SCTP:
    case IPPROTO_UDPLITE:
        if (pskb_may_pull(skb, (ihl * 4) + 4))
            /* The first 4 bytes after the IP header are the L4 source and
             * destination ports (the start of the TCP/UDP header), so skip
             * the IP header and read them as a single u32 */
            ports = *((u32 *) (skb->data + (ihl * 4)));
        break;

    default:
        break;
    }
    /* Compute the flow hash from the source/destination addresses and the ports */
    skb->rxhash = jhash_3words(addr1, addr2, ports, hashrnd) >> 16;
    if (!skb->rxhash)
        skb->rxhash = 1;

got_hash:
/*
 * rps_sock_flow_table and rps_dev_flow_table are the two tables added for
 * RFS.  rps_sock_flow_table is a single global hash table keyed per socket
 * flow: it records the CPU the application expects the softirq to run on
 * (the CPU that last called recvmsg).  rps_dev_flow_table is per device
 * queue and remembers the CPU that last processed an skb of the same flow;
 * each entry holds a CPU id and a tail-queue counter (last_qtail).
 */
    flow_table = rcu_dereference(rxqueue->rps_flow_table);
    sock_flow_table = rcu_dereference(rps_sock_flow_table);
    if (flow_table && sock_flow_table) {
        u16 next_cpu;
        struct rps_dev_flow *rflow;

        rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
        tcpu = rflow->cpu;

        next_cpu = sock_flow_table->ents[skb->rxhash &
         sock_flow_table->mask];

        /*
         * Two flow tables come into play here: the global sock_flow_table
         * and this queue's rps_flow_table.  From them we read two CPUs:
         * tcpu, the CPU that last processed a packet of this flow (the
         * device flow table), and next_cpu, the CPU the application
         * expects, i.e. where the last recvmsg() was done (the socket
         * flow table).  The flow is switched over to next_cpu only if one
         * of the following holds:
         * - tcpu is unset (equal to RPS_NO_CPU).
         * - tcpu is offline.
         * - tcpu's input_queue_head has advanced beyond last_qtail, the
         *   last packet that was enqueued using this table entry, meaning
         *   everything previously queued for this flow has been drained.
         * This guarantees that all previous packets of the flow have been
         * dequeued, preserving in-order delivery; otherwise we keep using
         * tcpu to avoid reordering.
         */
        if (unlikely(tcpu != next_cpu) &&
         (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
         ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
         rflow->last_qtail)) >= 0)) {
            tcpu = rflow->cpu = next_cpu;
            if (tcpu != RPS_NO_CPU)
                rflow->last_qtail = per_cpu(softnet_data,
                 tcpu).input_queue_head;
        }
        if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
            *rflowp = rflow;
            /* This is the CPU that will be returned */
            cpu = tcpu;
            goto done;
        }
    }
    /* For the first packet of a flow both tcpu and next_cpu are RPS_NO_CPU,
     * so the RFS path above yields nothing and plain RPS is used instead */
    map = rcu_dereference(rxqueue->rps_map);
    if (map) {
        tcpu = map->cpus[((u32) (skb->rxhash * map->len)) >> 16];
        /* Return the computed CPU if it is online */
        if (cpu_online(tcpu)) {
            cpu = tcpu;
            goto done;
        }
    }

done:
    rcu_read_unlock();
    return cpu;
}
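
A side note on the rps_map lookup above: because skb->rxhash was shifted right by 16 bits, it fits in 16 bits, so rxhash * map->len is always below map->len << 16 and the final >> 16 yields an index in [0, map->len). A standalone user-space illustration (my own example, not kernel code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t rxhash = 0xBEEF;  /* example 16-bit hash value            */
    unsigned int len = 4;      /* example rps_map->len (4 target CPUs) */
    unsigned int idx = ((uint32_t)(rxhash * len)) >> 16;

    /* 0xBEEF * 4 = 0x2FBBC, >> 16 = 2, so this flow lands on cpus[2] */
    printf("hash 0x%x -> cpus[%u] of %u\n", rxhash, idx, len);
    return 0;
}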

enqueue_to_backlog hangs the skb on the input queue of the chosen CPU. It takes an skb and a cpu as parameters and, depending on the state of that CPU's backlog, either appends the skb to its input_pkt_queue or schedules the backlog NAPI (i.e. raises the softirq).

enqueue_to_backlog

static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
             unsigned int *qtail)
{
    struct softnet_data *queue;
    unsigned long flags;
    /* Get the softnet_data of the CPU chosen by get_rps_cpu() */
    queue = &per_cpu(softnet_data, cpu);

    local_irq_save(flags);
    __get_cpu_var(netdev_rx_stat).total++;

    spin_lock(&queue->input_pkt_queue.lock);
    if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
        if (queue->input_pkt_queue.qlen) {
enqueue:    /* Append the packet to that CPU's input_pkt_queue */
            __skb_queue_tail(&queue->input_pkt_queue, skb);
            *qtail = queue->input_queue_head +
                 queue->input_pkt_queue.qlen;

            spin_unlock_irqrestore(&queue->input_pkt_queue.lock,
             flags);
            return NET_RX_SUCCESS;
        }

        /* Schedule NAPI for the backlog device, i.e. raise the softirq */
        if (napi_schedule_prep(&queue->backlog)) {
            if (cpu != smp_processor_id()) { /* The skb belongs to another CPU */
                struct rps_remote_softirq_cpus *rcpus =
                 &__get_cpu_var(rps_remote_softirq_cpus);

                cpu_set(cpu, rcpus->mask[rcpus->select]);
                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
            } else
                /* The skb belongs to the current CPU: schedule the softirq
                 * directly; note that what gets scheduled is the backlog NAPI */
                ____napi_schedule(queue, &queue->backlog);
        }
        goto enqueue;
    }

    spin_unlock(&queue->input_pkt_queue.lock);

    __get_cpu_var(netdev_rx_stat).dropped++;
    local_irq_restore(flags);

    kfree_skb(skb);
    return NET_RX_DROP;
}
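
In the remote branch above the target CPU is only recorded in the rps_remote_softirq_cpus mask; the IPI that actually wakes that CPU is sent later, at the end of the NET_RX softirq. The sketch below shows that companion path as it appears in the original RPS patches; the exact names and the mask[select] double buffering may differ in this tree:

/* IPI handler (sketch): runs on the remote CPU and schedules that CPU's
 * backlog NAPI, so it then drains its own input_pkt_queue.  queue->csd
 * is set up at init time to point at this function. */
static void trigger_softirq(void *data)
{
    struct softnet_data *queue = data;

    __napi_schedule(&queue->backlog);
}

/* Called at the end of the NET_RX softirq with the mask of CPUs that
 * enqueue_to_backlog() collected; sends each of them an IPI. */
static void net_rps_action(cpumask_t *mask)
{
    int cpu;

    for_each_cpu_mask_nr(cpu, *mask) {
        struct softnet_data *queue = &per_cpu(softnet_data, cpu);

        if (cpu_online(cpu))
            __smp_call_function_single(cpu, &queue->csd, 0);
    }
    cpus_clear(*mask);
}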

inet_recvmsg

int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
         size_t size, int flags)
{
    struct sock *sk = sock->sk;
    int addr_len = 0;
    int err;

    inet_rps_record_flow(sk); /* Record this socket's current CPU in the global flow table */

    err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
                 flags & ~MSG_DONTWAIT, &addr_len);
    if (err >= 0)
        msg->msg_namelen = addr_len;
    return err;
}

This function obtains the global rps_sock_flow_table and then calls rps_record_sock_flow to update it. The socket's sk_rxhash is passed in as the hash index; sk_rxhash is really just the skb's rxhash, i.e. the 4-tuple hash computed for RPS. Using it as the index means that all packets of the same socket fall into the same slot, and the softirq context can later look up the same entry cheaply with the skb's rxhash.
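
For reference, the table written here is declared as follows in the RPS/RFS patches; its size is configured through the net.core.rps_sock_flow_entries sysctl:

struct rps_sock_flow_table {
    unsigned int mask;   /* number of entries - 1 */
    u16 ents[0];         /* desired CPU, indexed by rxhash & mask */
};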

inet_rps_record_flow


static inline void inet_rps_record_flow(struct sock *sk)
{
    struct rps_sock_flow_table *sock_flow_table;

    rcu_read_lock();
    sock_flow_table = rcu_dereference(rps_sock_flow_table);
    rps_record_sock_flow(sock_flow_table, inet_sk_rxhash(sk));
    rcu_read_unlock();
}

rps_record_sock_flow


static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
                    u32 hash)
{
    if (table && hash) {
        /* Index into the table */
        unsigned int cpu, index = hash & table->mask;

        /* We only give a hint, preemption can change the CPU under us */
        cpu = raw_smp_processor_id();

        /* Record the current CPU; skip the write if it is already set */
        if (table->ents[index] != cpu)
            table->ents[index] = cpu;
    }
}


 

Figure: kernel code flow
