NAT惟一五元組選取

使用iptables進行NAT設置時,可使用以下擴展選項:

# SNAT 源地址轉換,用在 POSTROUTING、INPUT 鏈
--to-source [<ipaddr>[-<ipaddr>]][:port[-port]]
--random        # 映射到隨機端口號
--random-fully  # 映射到隨機端口號(PRNG 完全隨機化)
--persistent    # 映射到固定地址

# DNAT 目的地址轉換,用在 PREROUTING、OUTPUT 鏈
--to-destination [<ipaddr>[-<ipaddr>]][:port[-port]]
--random        # 映射到隨機端口號
--persistent    # 映射到固定地址

在內核中有以下幾個標誌與上面的選項對應:

/* An IP address range was specified for the mapping. */
#define NF_NAT_RANGE_MAP_IPS            (1 << 0)
/* An explicit port (per-protocol) range was specified. */
#define NF_NAT_RANGE_PROTO_SPECIFIED        (1 << 1)
/* Randomised port selection: the starting port offset is obtained from the
 * L3 protocol's secure_port() hook. Corresponds to --random. */
#define NF_NAT_RANGE_PROTO_RANDOM        (1 << 2)
/* Persistent mapping: the same client keeps being mapped to the same
 * address (the destination is left out of the address hash).
 * Corresponds to --persistent. */
#define NF_NAT_RANGE_PERSISTENT            (1 << 3)
/* Fully random port selection: the starting port offset comes from
 * prandom_u32(). Corresponds to --random-fully. */
#define NF_NAT_RANGE_PROTO_RANDOM_FULLY        (1 << 4)

// Some of the flags above can be combined.

// Either of the two random port-selection flags.
#define NF_NAT_RANGE_PROTO_RANDOM_ALL        \
    (NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PROTO_RANDOM_FULLY)
// Mask covering every range flag defined above.
#define NF_NAT_RANGE_MASK                    \
    (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED |    \
     NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PERSISTENT |    \
     NF_NAT_RANGE_PROTO_RANDOM_FULLY)

構建nat信息

netfilter在兩個地方會構建nat信息:一個是在命中nat規則後構建nat信息,另一個是related連接(期望連接)在expect函數中構建nat信息。兩者都使用函數nf_nat_setup_info進行構建,差別在於range參數:前者由iptables規則設置,後者由helper函數確定。nat會修改連接跟蹤,但僅僅修改應答方向的五元組。

/*
 * nf_nat_setup_info - compute and install the NAT mapping for a conntrack.
 *
 * @ct:        connection tracking entry to set up; must not be confirmed yet.
 * @range:     address/port range the translated tuple has to fall into.
 * @maniptype: NF_NAT_MANIP_SRC or NF_NAT_MANIP_DST.
 *
 * Picks a unique translated tuple for the original direction and, if it
 * differs from the current one, rewrites the reply-direction tuple of @ct
 * accordingly (only the reply direction is altered).
 *
 * Returns NF_ACCEPT on success or when @ct is already confirmed, NF_DROP if
 * NAT of this type was already initialised for @ct or if the seq-adj
 * extension could not be added.
 */
unsigned int
nf_nat_setup_info(struct nf_conn *ct,
          const struct nf_nat_range *range,
          enum nf_nat_manip_type maniptype)
{
    struct net *net = nf_ct_net(ct);/* network namespace this conntrack belongs to */
    struct nf_conntrack_tuple curr_tuple, new_tuple;

    /* Can't setup nat info for confirmed ct. */
    /* An already confirmed connection is left untouched. */
    if (nf_ct_is_confirmed(ct))
        return NF_ACCEPT;

    WARN_ON(maniptype != NF_NAT_MANIP_SRC &&
        maniptype != NF_NAT_MANIP_DST);

    if (WARN_ON(nf_nat_initialized(ct, maniptype)))
        return NF_DROP;

    /* What we've got will look like inverse of reply. Normally
     * this is what is in the conntrack, except for prior
     * manipulations (future optimization: if num_manips == 0,
     * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
     * Obtain the current original-direction tuple by inverting the
     * reply tuple.
     */
    nf_ct_invert_tuplepr(&curr_tuple,
                 &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
    /* Derive the post-NAT original-direction tuple from the current one. */
    get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);
    /* The unique tuple, once inverted, becomes the reply-direction tuple
     * of the conntrack. */
    /* Only when the new tuple differs from the current one does the reply
     * direction have to be changed. */
    if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
        struct nf_conntrack_tuple reply;

        /* Alter conntrack table so will recognize replies. */
        /* Build the new reply-direction tuple from the new tuple. */
        nf_ct_invert_tuplepr(&reply, &new_tuple);
        /* Replace the reply-direction tuple of the conntrack. */
        nf_conntrack_alter_reply(ct, &reply);

        /* Non-atomic: we own this at the moment. */
        if (maniptype == NF_NAT_MANIP_SRC)
            ct->status |= IPS_SRC_NAT;
        else
            ct->status |= IPS_DST_NAT;
        /* If the connection has a helper but no seq-adj extension yet,
         * the seq-adj extension must be added; fail otherwise. */
        if (nfct_help(ct) && !nfct_seqadj(ct))
            if (!nfct_seqadj_ext_add(ct))
                return NF_DROP;
    }
    /* For source NAT, add the conntrack to the nf_nat_bysource hash table. */
    /* That table is consulted when choosing the SNAT source, so that the
     * same client keeps using the same translated source. */
    if (maniptype == NF_NAT_MANIP_SRC) {
        unsigned int srchash;
        spinlock_t *lock;

        srchash = hash_by_src(net,
                      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS];
        spin_lock_bh(lock);
        hlist_add_head_rcu(&ct->nat_bysource,
                   &nf_nat_bysource[srchash]);
        spin_unlock_bh(lock);
    }

    /* It's done. Mark NAT of this type as completed. */
    if (maniptype == NF_NAT_MANIP_DST)
        ct->status |= IPS_DST_NAT_DONE;
    else
        ct->status |= IPS_SRC_NAT_DONE;

    return NF_ACCEPT;
}

重點分析get_unique_tuple函數

語句 nf_ct_invert_tuplepr(&curr_tuple, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); 求出了curr_tuple。對於首包,或者連接尚未經過nat的情況,其值就是請求方向的五元組,沒有差別;對於已經經過nat的連接,則與原始請求方向的五元組不同。
/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING,
 * we change the source to map into the range. For NF_INET_PRE_ROUTING
 * and NF_INET_LOCAL_OUT, we change the destination to map into the
 * range. It might not be possible to get a unique tuple, but we try.
 * At worst (or if we race), we will end up with a final duplicate in
 * __ip_conntrack_confirm and drop the packet.
 *
 * @tuple:      output; the unique tuple that was selected.
 * @orig_tuple: current original-direction tuple.
 * @range:      range parameters (as configured by the rule).
 * @ct:         conntrack the tuple is being chosen for.
 * @maniptype:  NAT type (source or destination manipulation); per the note
 *              above it follows from the hook.
 */
static void
get_unique_tuple(struct nf_conntrack_tuple *tuple,
         const struct nf_conntrack_tuple *orig_tuple,
         const struct nf_nat_range *range,
         struct nf_conn *ct,
         enum nf_nat_manip_type maniptype)
{
    const struct nf_conntrack_zone *zone;
    const struct nf_nat_l3proto *l3proto;
    const struct nf_nat_l4proto *l4proto;
    struct net *net = nf_ct_net(ct);

    zone = nf_ct_zone(ct);

    rcu_read_lock();
    l3proto = __nf_nat_l3proto_find(orig_tuple->src.l3num);
    l4proto = __nf_nat_l4proto_find(orig_tuple->src.l3num,
                    orig_tuple->dst.protonum);

    /* 1) If this srcip/proto/src-proto-part is currently mapped,
     * and that same mapping gives a unique tuple within the given
     * range, use that.
     *
     * This is only required for source (ie. NAT/masq) mappings.
     * So far, we don't do local source mappings, so multiple
     * manips not an issue.
     */
    if (maniptype == NF_NAT_MANIP_SRC && // case 1: source NAT and no random flag set
        !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
        /* try the original tuple first */
        /* If the original tuple already satisfies the SNAT range and is
         * not in use, take it unchanged; no rewrite is needed then. */
        if (in_range(l3proto, l4proto, orig_tuple, range)) {
            if (!nf_nat_used_tuple(orig_tuple, ct)) {
                *tuple = *orig_tuple;
                goto out;
            }/* already in use: fall through to the further steps */

        /* Original tuple is not in range: look for an existing mapping
         * of the same source and reuse its translated source. */
        } else if (find_appropriate_src(net, zone, l3proto, l4proto,
                        orig_tuple, tuple, range)) {
            pr_debug("get_unique_tuple: Found current src map\n");
            /* If the tuple built from that mapping is unique, we are done. */
            if (!nf_nat_used_tuple(tuple, ct))
                goto out;
        }
    }

    /* 2) Select the least-used IP/proto combination in the given range */
    /* 2) Reached for DNAT, or when the SNAT shortcuts above produced no
     *    usable tuple: choose the IP address here. */
    *tuple = *orig_tuple;
    find_best_ips_proto(zone, tuple, range, ct, maniptype);

    /* 3) The per-protocol part of the manip is made to map into
     * the range to make a unique tuple.
     */

    /* Only bother mapping if it's not already in range and unique */
    /* No random flag set. */
    if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
        if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {// an explicit port range was given
            /* Skip port selection when the current port is inside the
             * given range and either the range is a single port or the
             * tuple is not in use. */
            if (l4proto->in_range(tuple, maniptype,
                          &range->min_proto,
                          &range->max_proto) &&
                (range->min_proto.all == range->max_proto.all ||
                 !nf_nat_used_tuple(tuple, ct)))
                goto out;
        } else if (!nf_nat_used_tuple(tuple, ct)) {// no port range given and the tuple is unused: take it as is
            goto out;
        }
    }

    /* Last change: get protocol to try to obtain unique tuple. */
    /* Finally let the L4 protocol pick a port that makes the tuple unique. */
    l4proto->unique_tuple(l3proto, tuple, range, maniptype, ct);
out:
    rcu_read_unlock();
}

find_appropriate_src

/* Only called for SRC manip.
 *
 * Walk the nf_nat_bysource bucket that @tuple hashes to and look for an
 * existing connection with the same source (as judged by same_src()), in the
 * same network namespace and the same zone. For each such connection, build
 * a candidate in @result from the inverse of its reply tuple, with the
 * destination part taken from @tuple.
 *
 * Returns 1 as soon as a candidate lies within @range (the candidate is left
 * in @result), 0 if the bucket holds no suitable connection. @result may have
 * been overwritten even when 0 is returned.
 */
static int
find_appropriate_src(struct net *net,
             const struct nf_conntrack_zone *zone,
             const struct nf_nat_l3proto *l3proto,
             const struct nf_nat_l4proto *l4proto,
             const struct nf_conntrack_tuple *tuple,
             struct nf_conntrack_tuple *result,
             const struct nf_nat_range *range)
{
    const unsigned int bucket = hash_by_src(net, tuple);
    const struct nf_conn *entry;

    hlist_for_each_entry_rcu(entry, &nf_nat_bysource[bucket], nat_bysource) {
        /* Skip entries that do not describe the same source ... */
        if (!same_src(entry, tuple))
            continue;
        /* ... or live in another network namespace ... */
        if (!net_eq(net, nf_ct_net(entry)))
            continue;
        /* ... or belong to a different conntrack zone. */
        if (!nf_ct_zone_equal(entry, zone, IP_CT_DIR_ORIGINAL))
            continue;

        /* Copy source part from reply tuple: inverting the reply
         * tuple yields the translated source of that connection. */
        nf_ct_invert_tuplepr(result,
                     &entry->tuplehash[IP_CT_DIR_REPLY].tuple);
        /* Keep the destination of the tuple we are translating. */
        result->dst = tuple->dst;

        /* Accept the candidate only if it fits the requested range;
         * otherwise keep scanning the bucket. */
        if (in_range(l3proto, l4proto, result, range))
            return 1;
    }

    return 0;
}

find_best_ips_proto

/* For [FUTURE] fragmentation handling, we want the least-used
 * src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
 * if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
 * 1-65535, we don't do pro-rata allocation based on ports; we choose
 * the ip with the lowest src-ip/dst-ip/proto usage.
 *
 * Chooses the IP address for the manipulated side of @tuple from @range.
 * Note that the code below does not track usage; the address is picked
 * directly from a hash (jhash2) of the tuple addresses.
 */
static void
find_best_ips_proto(const struct nf_conntrack_zone *zone,
            struct nf_conntrack_tuple *tuple,
            const struct nf_nat_range *range,
            const struct nf_conn *ct,
            enum nf_nat_manip_type maniptype)
{
    union nf_inet_addr *var_ipp;
    unsigned int i, max;
    /* Host order */
    u32 minip, maxip, j, dist;
    bool full_range;

    /* No IP mapping?  Do nothing. (NF_NAT_RANGE_MAP_IPS is not set.) */
    if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
        return;

    if (maniptype == NF_NAT_MANIP_SRC)/* point at the address to rewrite, depending on the NAT type */
        var_ipp = &tuple->src.u3;
    else
        var_ipp = &tuple->dst.u3;

    /* Fast path: only one choice. The range holds a single address, use it. */
    if (nf_inet_addr_cmp(&range->min_addr, &range->max_addr)) {
        *var_ipp = range->min_addr;
        return;
    }
    // Index of the last 32-bit word of the address inside the address array.
    if (nf_ct_l3num(ct) == NFPROTO_IPV4)
        max = sizeof(var_ipp->ip) / sizeof(u32) - 1;// presumably 0 (one 32-bit word)
    else
        max = sizeof(var_ipp->ip6) / sizeof(u32) - 1;// presumably 3 (four 32-bit words)

    /* Hashing source and destination IPs gives a fairly even
     * spread in practice (if there are a small number of IPs
     * involved, there usually aren't that many connections
     * anyway).  The consistency means that servers see the same
     * client coming from the same IP (some Internet Banking sites
     * like this), even across reboots.
     * With NF_NAT_RANGE_PERSISTENT set, the destination address (and
     * zone id) are left out of the hash, so only the source address is
     * hashed and the same client always gets the same hash value.
     */
    j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32),
           range->flags & NF_NAT_RANGE_PERSISTENT ?
            0 : (__force u32)tuple->dst.u3.all[max] ^ zone->id);
    // Map each 32-bit word of the address into the configured range using the hash.
    full_range = false;
    for (i = 0; i <= max; i++) {
        /* If first bytes of the address are at the maximum, use the
         * distance. Otherwise use the full range.
         */
        if (!full_range) {
            minip = ntohl((__force __be32)range->min_addr.all[i]);
            maxip = ntohl((__force __be32)range->max_addr.all[i]);
            dist  = maxip - minip + 1;
        } else {
            minip = 0;
            dist  = ~0;
        }

        var_ipp->all[i] = (__force __u32)
            htonl(minip + reciprocal_scale(j, dist));
        /* Once a word differs from the range maximum, the remaining
         * words are no longer limited by the range bounds. */
        if (var_ipp->all[i] != range->max_addr.all[i])
            full_range = true;

        /* Unless persistent, mix the matching destination word into the
         * hash before handling the next word. */
        if (!(range->flags & NF_NAT_RANGE_PERSISTENT))
            j ^= (__force u32)tuple->dst.u3.all[i];
    }
}

l4proto->unique_tuple

l4proto->unique_tuple由各個四層協議分別實現(例如TCP、UDP),這些實現最終都調用nf_nat_l4proto_unique_tuple,並額外傳入各協議自己的rover指針。

/*
 * nf_nat_l4proto_unique_tuple - choose a port that makes @tuple unique.
 *
 * The port being rewritten is the source port for NF_NAT_MANIP_SRC and the
 * destination port otherwise.
 *
 * Candidate range:
 *   - NF_NAT_RANGE_PROTO_SPECIFIED set: [min_proto, max_proto] from @range
 *     (the bounds are swapped if they were given in reverse order).
 *   - not set, destination NAT: the port is left unchanged.
 *   - not set, source NAT: derived from the current source port:
 *       below 512   -> 1..511
 *       512..1023   -> 600..1023
 *       1024 and up -> 1024..65535
 *
 * Starting offset into the range:
 *   - NF_NAT_RANGE_PROTO_RANDOM: value returned by l3proto->secure_port(),
 *     called with the tuple and the port of the opposite side.
 *   - NF_NAT_RANGE_PROTO_RANDOM_FULLY: prandom_u32().
 *   - otherwise: *@rover, which is updated with the offset finally used.
 *
 * Candidates are tried in increasing order (wrapping inside the range) until
 * nf_nat_used_tuple() reports a free one or range_size candidates have been
 * tried; in the latter case the last candidate is kept without being checked,
 * so the resulting tuple may still be a duplicate.
 */
void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto,
                 struct nf_conntrack_tuple *tuple,
                 const struct nf_nat_range *range,
                 enum nf_nat_manip_type maniptype,
                 const struct nf_conn *ct,
                 u16 *rover)
{
    __be16 *portptr = (maniptype == NF_NAT_MANIP_SRC) ?
        &tuple->src.u.all : &tuple->dst.u.all;
    unsigned int range_size, min, tried;
    u_int16_t off;

    if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
        /* Explicit port range from the rule. */
        unsigned int max = ntohs(range->max_proto.all);

        min = ntohs(range->min_proto.all);
        if (unlikely(max < min))
            swap(max, min);
        range_size = max - min + 1;
    } else if (maniptype == NF_NAT_MANIP_DST) {
        /* If it's dst rewrite, can't change port */
        return;
    } else {
        /* No range given: stay within the class of the current port. */
        const unsigned int cur_port = ntohs(*portptr);

        if (cur_port >= 1024) {
            /* Non-reserved port: 1024..65535 */
            min = 1024;
            range_size = 65535 - 1024 + 1;
        } else if (cur_port < 512) {
            /* Loose convention: >> 512 is credential passing */
            /* Below 512: 1..511 */
            min = 1;
            range_size = 511 - min + 1;
        } else {
            /* 512..1023: 600..1023 */
            min = 600;
            range_size = 1023 - min + 1;
        }
    }

    /* Choose the offset at which the search starts. */
    if (range->flags & NF_NAT_RANGE_PROTO_RANDOM) {
        const __be16 peer_port = (maniptype == NF_NAT_MANIP_SRC) ?
            tuple->dst.u.all : tuple->src.u.all;

        off = l3proto->secure_port(tuple, peer_port);
    } else if (range->flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY) {
        off = prandom_u32();
    } else {
        off = *rover;
    }

    /* Probe candidate ports one after another. The search ends when a
     * candidate is not in use, or when range_size candidates have been
     * written (the final one is not checked against the table). */
    tried = 0;
    for (;;) {
        *portptr = htons(min + off % range_size);
        tried++;
        if (tried == range_size || !nf_nat_used_tuple(tuple, ct))
            break;
        ++off;
    }

    /* Without any random flag, remember the offset that was used. */
    if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL))
        *rover = off;
}
相關文章
相關標籤/搜索