如下代碼取自 kernel 2.6.24. [數據結構] 該結構被基於路由表的classifier使用,用於跟蹤與一個標籤(tag)相關聯的路由流量的統計信息,該統計信息中包含字節數和報文數兩類信息。 這個結構包含一個counters數組,每一個處理器有256個元素。大小爲256是由於路由標籤的取值範圍爲0到255。IPv4中是由ip_rt_init接口爲該向量分配空間,IPv6中沒有爲該向量分配空間。ip_rt_acct結構中的四個字段是在ip_rcv_finish接口中更新。 struct ip_rt_acct { }; 路由表hash項 struct rt_hash_bucket { struct rtable *chain; }; 對每一個路由表實例建立一個fib_table結構,這個結構主要由一個路由表標識和管理該路由表的一組函數指針組成: struct fib_table { struct hlist_node tb_hlist; u32 tb_id; //路由表標識 unsigned tb_stamp; //未被使用 //函數被fib_lookup函數調用 int (*tb_lookup)(struct fib_table *tb, const struct flowi *flp, struct fib_result *res); //tb_insert被inet_rtm_newroute和ip_rt_ioctl調用,處理用戶空間的ip route add/change/replace/prepend/append/test 命令和 route add 命令。 //相似地,tb_delete被inet_rtm_delroute(對ip route del ... 命令做出的響應)和ip_rt_ioctl(對route del ... 命令做出的響應)調用, //用於從路由表中刪除一條路由。這兩個函數指針也被fib_magic調用。 int (*tb_insert)(struct fib_table *, struct fib_config *); int (*tb_delete)(struct fib_table *, struct fib_config *); //Dump出路由表的內容。在處理諸如「ip route get...」等用戶命令時被激活。 int (*tb_dump)(struct fib_table *table, struct sk_buff *skb, struct netlink_callback *cb); //將設置有RTNH_F_DEAD標誌的fib_info結構刪除 int (*tb_flush)(struct fib_table *table); //選擇一條缺省路由 void (*tb_select_default)(struct fib_table *table, const struct flowi *flp, struct fib_result *res); //指向該結構的尾部 unsigned char tb_data[0]; }; 一個zone是一組有着相同網絡掩碼長度的路由項。路由表中的路由項按照zone來組織. 
struct fn_zone { //將活動zones(active zones)(即至少有一條路由項的zones)連接在一塊兒的指針。 //該鏈表的頭部由fn_zone_list來跟蹤,fn_zone_list是fn_hash數據結構的一個字段。 struct fn_zone *fz_next; /* Next not empty zone */ //指向存儲該zone中路由項的哈希表 struct hlist_head *fz_hash; /* Hash table pointer */ //在該zone中路由項的數目(即在該zone的哈希表中fib_node實例的數目)。 //這個值能夠用於檢查是否須要改變該哈希表的容量 int fz_nent; /* Number of entries */ //哈希表fz_hash的容量(桶的數目) int fz_divisor; /* Hash divisor */ u32 fz_hashmask; /* (fz_divisor - 1) */ #define FZ_HASHMASK(fz) ((fz)->fz_hashmask) //在網絡掩碼fz_mask中(全部連續)的比特數目,在代碼中也用prefixlen來表示。 //例如,網絡掩碼255.255.255.0所對應的fz_order爲24。 int fz_order; /* Zone order */ //用fz_order構造的網絡掩碼。例如設fz_order值取3,則生成的fz_mask的二進制表示 //爲11100000.00000000.00000000.00000000,其十進制表示爲224.0.0.0。 __be32 fz_mask; #define FZ_MASK(fz) ((fz)->fz_mask) }; 內核路由項中每個惟一的目的網絡對應一個fib_node實例。目的網絡相同但其它配置參數不一樣的路由項共享同一個fib_node實例。 struct fib_node { //fib_node元素是用哈希表來組織的。這個指針用於將分佈在一張哈希表中的一個桶內全部的fib_node元素連接在一塊兒。 struct hlist_node fn_hash; //每一個fib_node結構與包含一個或多個fib_alias結構的鏈表相關聯。fn_alias指針指向該鏈表的頭部 struct list_head fn_alias; //這是路由項的前綴(或網絡地址,用路由項的netmask字段來表示)。該字段被用做查找路由表時的搜索key __be32 fn_key; }; fib_alias實例是用來區分目的網絡相同但其它配置參數(除了目的地址之外)不一樣的路由項 struct fib_alias { //將與同一個fib_node結構相關聯的全部fib_alias實例連接在一塊兒 struct list_head fa_list; struct rcu_head rcu; struct fib_info *fa_info; //該指針指向一個fib_info實例,該實例存儲着如何處理與該路由相匹配報文的信息 //路由的服務類型(TOS)比特位字段(bitfield)。當該值爲零時表示尚未配置TOS,因此在路由查找時任何值均可以匹配。 //不要將fa_tos字段與fib_rule結構中的r_tos字段相混淆。 //fa_tos字段是用戶對每一條路由表項配置的TOS,而fib_rule結構中的r_tos字段是用戶對策略規則配置的TOS。 u8 fa_tos; u8 fa_type; u8 fa_scope; //路由表項的做用範圍 u8 fa_state; //一些標誌的比特位圖,迄今只定義了下面這一個標誌: //FA_S_ACCESSED //當查找時訪問到fib_alias實例,就設置該標誌來標記。當一個fib_node數據結構改變時這個標誌頗有用: //它用於決定是否應當flush路由緩存。若是fib_node已經被訪問,那麼就可能意味着: //在該路由發生變化時須要清理(clear)路由緩存內的表項,從而觸發一次flush。 }; 下一跳網關等重要的路由信息則存儲在一個fib_info結構 struct fib_info { struct hlist_node fib_hash; struct hlist_node fib_lhash; //引用計數。fib_treeref是持有該fib_info實例引用的fib_node數據結構的數目, //fib_clntref是因爲路由查找成功而被持有的引用計數 int fib_treeref; atomic_t fib_clntref; 
//標記路由項正在被刪除的標誌。當該標誌被設置爲1時,警告該數據結構將被刪除而不能再使用 int fib_dead; //當前使用的惟一標誌是RTNH_F_DEAD。 //當與一條多路徑路由項相關聯的全部fib_nh結構都設置了RTNH_F_DEAD標誌時,設置該標誌 unsigned fib_flags; int fib_protocol; //設置路由的協議。使用該字段的一個例子是使路由守護進程(Daemon)在與內核通訊時, //操做只能侷限於它們本身生成的路由項。 __be32 fib_prefsrc; //首選源IP地址 u32 fib_priority;//路由優先級。值越小則優先級越高。當沒有明確設定時,內核將它的值初始化爲缺省值0。 //當配置路由時,ip route命令還能夠指定一組metrics。fib_metrics是存儲這一組metrics的一個向量。 //沒有明確設定的Metrics在初始化時被設置爲0。 u32 fib_metrics[RTAX_MAX]; #define fib_mtu fib_metrics[RTAX_MTU-1] #define fib_window fib_metrics[RTAX_WINDOW-1] #define fib_rtt fib_metrics[RTAX_RTT-1] #define fib_advmss fib_metrics[RTAX_ADVMSS-1] int fib_nhs; #ifdef CONFIG_IP_ROUTE_MULTIPATH int fib_power; //當內核編譯支持多路徑時,該字段纔是fib_info數據結構的一部分 #endif //fib_nh是結構類型爲fib_nh的一個可變長向量,fib_nhs是該向量的size。 //只有當內核支持多路徑特性時,fib_nhs纔可能大於1。 struct fib_nh fib_nh[0]; #define fib_dev fib_nh[0].nh_dev }; 對每個下一跳,內核須要跟蹤的信息要比IP地址信息更爲豐富 struct fib_nh { //這是與設備標識nh_oif相關聯的net_device數據結構。由於設備標識和指向net_device結構的指針 //都須要利用(在不一樣的上下文內),因此這兩項都存在於fib_nh結構中,雖然利用其中任何一項就能夠獲得另外一項 struct net_device *nh_dev; struct hlist_node nh_hash; struct fib_info *nh_parent; //該指針指向包含該fib_nh實例的fib_info結構 unsigned nh_flags; //用於獲取下一跳的路由scope。在大多數狀況下爲RT_SCOPE_LINK。該字段由fib_check_nh來初始化。 unsigned char nh_scope; #ifdef CONFIG_IP_ROUTE_MULTIPATH //nh_power是由內核初始化,nh_weight是由用戶利用關鍵字weight來設置。 int nh_weight; int nh_power; #endif #ifdef CONFIG_NET_CLS_ROUTE __u32 nh_tclassid; //它的值是利用realms關鍵字來設置 #endif int nh_oif; //egress設備標識 __be32 nh_gw; //下一跳網關的IP地址,它是利用關鍵字via來設置的。 //注意在NAT狀況下,它表示NAT路由器向外部世界通告的地址, //以及在NAT路由器向內部網中的主機發送回應以前而向外部世界迴應的地址。 //例如,命令ip route add nat 10.1.1.253/32 via 151.41.196.1將設置nh_gw爲151.41.196.1。 //注意在2.6內核中路由代碼已經再也不支持NAT,即原來衆所周知的FastNAT。 }; 策略被存儲在fib_rule結構內 struct fib_rule { struct list_head list; //將這些fib_rule結構連接到一個包含全部fib_rule實例的全局鏈表內 //引用計數。該引用計數的遞增是在fib_lookup函數(只在策略路由版的函數中)中進行的, //這解釋了爲何在每次路由查找成功後老是須要調用fib_res_put(遞減該引用計數)。 atomic_t refcnt; int ifindex; //內核能夠獲得相關的net_device實例,將該實例的ifindex字段拷貝到ifindex中。ifindex值取-1表示禁止該規則. 
char ifname[IFNAMSIZ]; //策略應用的設備的名稱 //當內核編譯支持「使用Netfilter MARK值做爲路由key「特性時,能夠根據防火牆標籤來定義規則。 //該字段是管理員定義一條策略規則時利用mark關鍵字指定的標籤。 u32 mark; u32 mark_mask; //路由規則的優先級。當管理員利用IPROUTE2軟件包添加一個策略時,可使用關鍵字priority,preference和order來配置。 //優先級0,0x7FFE和0x7FFF是預留給由內核添加的特定規則使用. u32 pref; u32 flags; //路由表標識,範圍從0到255。當用戶沒有指定路由表標識時,IPROUTE2按照如下方法來選擇路由表: //當用戶命令是添加一條規則時使用RT_TABLE_MAIN,其它狀況使用RT_TABLE_UNSPEC(例如刪除一條規則)。 u32 table; u8 action; //路由動做類型 u32 target; struct fib_rule * ctarget; struct rcu_head rcu; }; fib_result結構被fib_semantic_match初始化爲路由查找結果 struct fib_result { unsigned char prefixlen; //匹配路由的前綴長度 unsigned char nh_sel; //標識已經被選中的下一跳 //這兩個字段被初始化爲相匹配的fib_alias實例的fa_type和fa_scope字段的取 unsigned char type; unsigned char scope; struct fib_info *fi; //與匹配的fib_alias實例相關聯的fib_info實例 #ifdef CONFIG_IP_MULTIPLE_TABLES struct fib_rule *r; //字段由fib_lookup來初始化 #endif }; 用rtable數據結構來存儲緩存內的路由表項 struct rtable { union //這個聯合用來將一個dst_entry結構嵌入到rtable結構中 { struct dst_entry dst; } u; /* Cache lookup keys */ struct flowi fl; //用於緩存查找的搜索key //該指針指向egress設備的IP配置塊。注意對送往本地的ingress報文的路由,設置的egress設備爲loopback設備 struct in_device *idev; unsigned rt_flags; __u16 rt_type; //路由類型 __be32 rt_dst; /* Path destination */ __be32 rt_src; /* Path source */ //Ingress設備標識。這個值是從ingress設備的net_device數據結構中獲得。 //對本地生成的流量(所以不是從任何接口上接收到的),該字段被設置爲出設備的ifindex字段。 int rt_iif; //當目的主機爲直連時(即在同一鏈路上),rt_gateway表示目的地址。 //當須要經過一個網關到達目的地時,rt_gateway被設置爲由路由項中的下一跳網關。 __be32 rt_gateway; /* Miscellaneous cached information */ __be32 rt_spec_dst; //RFC 1122中指定的目的地址 //與本地主機在最近一段時間通訊的每一個遠端IP地址都有一個inet_peer結 struct inet_peer *peer; /* long-living peer info */ }; 數據結構dst_entry被用於存儲緩存路由項中獨立於協議的信息 struct dst_entry { struct rcu_head rcu_head; //用於將分佈在同一個哈希桶內的dst_entry實例連接在一塊兒 struct dst_entry *child; //這字段被IPsec代碼使用 struct net_device *dev; //Egress設備(即將報文送達目的地的發送設備) //當fib_lookup API(只被IPv4使用)失敗時,錯誤值被保存在error(用一個正值)中, //在後面的ip_error中使用該值來決定如何處理本次路由查找失敗(即決定生成哪一類ICMP消息)。 short error; //用於定義該dst_entry實例的可用狀態:0(缺省值)表示該結構有效並且能夠被使用, 
//2表示該結構將被刪除於是不能被使用,-1被IPsec和IPv6使用但不被IPv4使用。 short obsolete; //標誌集合(Set of flags)。DST_HOST被TCP使用,表示主機路由(即它不是到網絡或到一個廣播/多播地址的路由)。 //DST_NOXFRM,DST_NOPOLICY和DST_NOHASH只用於IPsec。 int flags; #define DST_HOST 1 #define DST_NOXFRM 2 #define DST_NOPOLICY 4 #define DST_NOHASH 8 unsigned long expires; //表示該表項將過時的時間戳 //這些字段被IPsec代碼使用 unsigned short header_len; /* more space at head required */ unsigned short nfheader_len; /* more non-fragment space at head required */ unsigned short trailer_len; /* space to reserve at tail */ //主要被TCP使用。該向量是用fib_info->fib_metrics向量的一份拷貝來初始化(若是fib_metrics向量被定義),當須要時使用缺省值。 u32 metrics[RTAX_MAX]; struct dst_entry *path; //這字段被IPsec代碼使用 //這兩個字段被用於對兩種類型的ICMP消息限速 unsigned long rate_last; /* rate limiting for ICMP */ unsigned long rate_tokens; //neighbour是包含下一跳三層地址到二層地址映射的結構,hh是緩存的二層頭 struct neighbour *neighbour; struct hh_cache *hh; struct xfrm_state *xfrm; //這字段被IPsec代碼使用 //分別表示處理ingress報文和處理egress報文的函數 int (*input)(struct sk_buff*); int (*output)(struct sk_buff*); #ifdef CONFIG_NET_CLS_ROUTE __u32 tclassid;//基於路由表的classifier的標籤 #endif //該結構內的虛函數表(VFT)用於處理dst_entry結構。 struct dst_ops *ops; //用於記錄該表項上次被使用的時間戳。當緩存查找成功時更新該時間戳, //垃圾回收程序使用該時間戳來選擇最合適的應當被釋放的結構。 unsigned long lastuse; atomic_t __refcnt; //引用計數 /* client references */ int __use; //該表項已經被使用的次數(即緩存查找返回該表項的次數) union { struct dst_entry *next; struct rtable *rt_next; struct rt6_info *rt6_next; struct dn_route *dn_next; }; char info[0]; }; 利用flowi數據結構,就能夠根據諸如ingress設備和egress設備、L3和L4協議報頭中的參數等字段的組合對流量進行分類。 它一般被用做路由查找的搜索key,IPsec策略的流量選擇器以及其它高級用途。 struct flowi { Egress設備ID和ingress設備ID int oif; int iif; __u32 mark; union { //該聯合的各個字段是可用於指定L3參數取值的結構。目前支持的協議爲IPv4,IPv6和DECnet。 struct { __be32 daddr; __be32 saddr; __u8 tos; __u8 scope; } ip4_u; struct { struct in6_addr daddr; struct in6_addr saddr; __be32 flowlabel; } ip6_u; struct { __le16 daddr; __le16 saddr; __u8 scope; } dn_u; } nl_u; #define fld_dst nl_u.dn_u.daddr #define fld_src nl_u.dn_u.saddr #define fld_scope nl_u.dn_u.scope #define fl6_dst 
nl_u.ip6_u.daddr #define fl6_src nl_u.ip6_u.saddr #define fl6_flowlabel nl_u.ip6_u.flowlabel #define fl4_dst nl_u.ip4_u.daddr #define fl4_src nl_u.ip4_u.saddr #define fl4_tos nl_u.ip4_u.tos #define fl4_scope nl_u.ip4_u.scope __u8 proto; //L4協議 __u8 flags; //該變量只定義了一個標誌,FLOWI_FLAG_MULTIPATHOLDROUTE,它最初用於多路徑代碼,但已再也不被使用 #define FLOWI_FLAG_MULTIPATHOLDROUTE 0x01 union { //該聯合的各個字段是可用於指定L4參數取值的主要結構。目前支持的協議爲TCP, UDP,ICMP,DECnet和IPsec協議套件(suite)。 struct { __be16 sport; __be16 dport; } ports; struct { __u8 type; __u8 code; } icmpt; struct { __le16 sport; __le16 dport; } dnports; __be32 spi; struct { __u8 type; } mht; } uli_u; #define fl_ip_sport uli_u.ports.sport #define fl_ip_dport uli_u.ports.dport #define fl_icmp_type uli_u.icmpt.type #define fl_icmp_code uli_u.icmpt.code #define fl_ipsec_spi uli_u.spi #define fl_mh_type uli_u.mht.type __u32 secid; /* used by xfrm; see secid.txt */ } __attribute__((__aligned__(BITS_PER_LONG/8))); #define LOOPBACK(x) (((x) & htonl(0xff000000)) == htonl(0x7f000000)) IP地址高8位爲127的爲回送地址類 #define MULTICAST(x) (((x) & htonl(0xf0000000)) == htonl(0xe0000000)) IP地址高4位爲14的爲多目地址類 #define BADCLASS(x) (((x) & htonl(0xf0000000)) == htonl(0xf0000000)) IP地址高4位爲15的爲非法地址類 #define ZERONET(x) (((x) & htonl(0xff000000)) == htonl(0x00000000)) IP地址高8位爲0的爲零網地址類 #define LOCAL_MCAST(x) (((x) & htonl(0xFFFFFF00)) == htonl(0xE0000000)) IP地址高24位爲0xE00000的爲局域組播地址類 [/數據結構] [初始化] fs_initcall(inet_init); static int __init inet_init(void) //net/ipv4/af_inet.c { ...... ip_init(); //ip路由相關部分初始化 ...... } void __init ip_init(void) { ip_rt_init(); //路由初始化 inet_initpeers(); //初始化一個 AVL 樹,用於追蹤IP對端的信息,遠程主機到這主機最新交換的數據包. 
#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS) igmp_mc_proc_init(); #endif } 初始化高速路由緩存和FIB(Forwarding Information Base) int __init ip_rt_init(void) { int rc = 0; //hash隨機因子初始化 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^ (jiffies ^ (jiffies >> 7))); #ifdef CONFIG_NET_CLS_ROUTE { int order; //肯定須要多少內存頁 for (order = 0; (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++) /* NOTHING */; //路由參數統計,即進出的包數和字節數,每一個cpu有256個 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order); if (!ip_rt_acct) panic("IP: failed to allocate ip_rt_acct\n"); memset(ip_rt_acct, 0, PAGE_SIZE << order); } #endif //路由操做高速緩存分配初始化,參看下面路由緩存操做實現 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep; // ??????????????????? rt_hash_table = (struct rt_hash_bucket *) alloc_large_system_hash("IP route cache", //名字 sizeof(struct rt_hash_bucket), //每一個元素的尺寸 rhash_entries, //元素的個數,默認0,由系統來肯定,即便你給了值,函數也會把它變爲最接近的2的冪 (num_physpages >= 128 * 1024) ? 15 : 17, 0,//可取HASH_EARLY或0,分配內存的地方根據這個有不一樣 &rt_hash_log,//用於返回元素個數的以2爲底的對數,也就是表示元素個數這個數值所用的比特數 &rt_hash_mask,//用於返回上面那個比特數所能表示的最大數 -1 0);//哈希表表元數上限,不是分配內存的總尺寸,不要弄混了。 //若是給個0值,那麼系統使用1/16內存所能容納的元素數做爲哈希表表元數. memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket)); //清零 rt_hash_lock_init();//鎖初始化 //路由高速緩存垃圾回收閾值 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); ip_rt_max_size = (rt_hash_mask + 1) * 16; //路由緩存項總數上限,即平均每一個hash桶16個元素 devinet_init(); ip_fib_init(); //fib初始化 init_timer(&rt_flush_timer); //flush路由緩存時的定時器 rt_flush_timer.function = rt_run_flush; init_timer(&rt_secret_timer); //週期性更換hash隨機因子(secret)並刷新路由緩存的定時器,不是垃圾回收定時器 rt_secret_timer.function = rt_secret_rebuild; /* All the timers, started at system startup tend to synchronize. Perturb it a bit. 
*/ schedule_delayed_work(&expires_work, net_random() % ip_rt_gc_interval + ip_rt_gc_interval); rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval + ip_rt_secret_interval; add_timer(&rt_secret_timer); #ifdef CONFIG_PROC_FS //proc初始化 { struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */ if (!proc_net_fops_create(&init_net, "rt_cache", S_IRUGO, &rt_cache_seq_fops) || !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO, init_net.proc_net_stat))) { return -ENOMEM; } rtstat_pde->proc_fops = &rt_cpu_seq_fops; } #ifdef CONFIG_NET_CLS_ROUTE create_proc_read_entry("rt_acct", 0, init_net.proc_net, ip_rt_acct_read, NULL); #endif #endif #ifdef CONFIG_XFRM //IPSEC初始化,參考IPSEC實現文章 ??爲何放在這 xfrm_init(); xfrm4_init(); #endif rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL); return rc; } 看上面參數說明 void *__init alloc_large_system_hash(const char *tablename, unsigned long bucketsize, unsigned long numentries, int scale, int flags, unsigned int *_hash_shift, unsigned int *_hash_mask, unsigned long limit) { unsigned long long max = limit; unsigned long log2qty, size; void *table = NULL; /* allow the kernel cmdline to have a say */ if (!numentries) { //決定元素個數 /* round applicable memory size up to nearest megabyte */ numentries = nr_kernel_pages; numentries += (1UL << (20 - PAGE_SHIFT)) - 1; //加上一個255 (i386) //清除低8位,變爲256倍數 numentries >>= 20 - PAGE_SHIFT; numentries <<= 20 - PAGE_SHIFT; /* limit to 1 bucket per 2^scale bytes of low memory */ if (scale > PAGE_SHIFT) numentries >>= (scale - PAGE_SHIFT); else numentries <<= (PAGE_SHIFT - scale); /* Make sure we've got at least a 0-order allocation.. 
*/ if (unlikely((numentries * bucketsize) < PAGE_SIZE)) numentries = PAGE_SIZE / bucketsize; } numentries = roundup_pow_of_two(numentries); //向上到2的冪 /* limit allocation size to 1/16 total memory by default */ if (max == 0) { //使用 1/16 的內存 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; do_div(max, bucketsize); //max = max / bucketsize } if (numentries > max) //不能超過這個數量 numentries = max; log2qty = ilog2(numentries);//元素個數這個數值所用的比特數 do { size = bucketsize << log2qty; //肯定內存使用大小(字節) bucket * 2 ^ log2qty if (flags & HASH_EARLY) table = alloc_bootmem(size); else if (hashdist) //默認0,啓動時能夠添加參數修改 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); else { unsigned long order; for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)//肯定須要使用的頁數 ; table = (void*) __get_free_pages(GFP_ATOMIC, order); /* If bucketsize is not a power-of-two, we may free some pages at the end of hash table. */ if (table) { unsigned long alloc_end = (unsigned long)table + (PAGE_SIZE << order); //內存結束位置 unsigned long used = (unsigned long)table + PAGE_ALIGN(size); //使用結尾位置,以頁對齊 split_page(virt_to_page(table), order); //頁是連續的,把它打散成一頁一頁的 //由於在肯定須要使用的頁數的時候頁的數量是以2的冪增加的. while (used < alloc_end) { //釋放一些不使用的頁 free_page(used); used += PAGE_SIZE; //size -= PAGE_SIZE; 應該加上,不然下面統計信息有一些誤差 } } } } while (!table && size > PAGE_SIZE && --log2qty); //內存分配失敗,減小比特位數也就是除2 if (!table) panic("Failed to allocate %s hash table\n", tablename); printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n", tablename, (1U << log2qty), //表項個數 ilog2(size) - PAGE_SHIFT, //佔2的幾回方頁 size); //內存使用 if (_hash_shift) *_hash_shift = log2qty; if (_hash_mask) *_hash_mask = (1 << log2qty) - 1; return table; } 爲通知鏈netdev_chain註冊另外一個處理鉤子,註冊Netlink套接字上地址和路由命令(即ip addr ... 與ip route ..命令)的處理鉤子函數, 並建立/proc/sys/net/ipv4/conf和/proc/sys/net/ipv4/conf/default目錄. 
void __init devinet_init(void) { register_gifconf(PF_INET, inet_gifconf); register_netdevice_notifier(&ip_netdev_notifier); //rtnetlink和用戶交互使用 rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL); rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL); rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr); #ifdef CONFIG_SYSCTL devinet_sysctl.sysctl_header = register_sysctl_table(devinet_sysctl.devinet_root_dir); //建立/proc/sys/net/ipv4/conf/all/* devinet_sysctl_register(NULL, &ipv4_devconf_dflt); //建立/proc/sys/net/ipv4/conf/default/* #endif } FIB (Forwarding Information Base)初始化 void __init ip_fib_init(void) { unsigned int i; for (i = 0; i < FIB_TABLE_HASHSZ; i++) INIT_HLIST_HEAD(&fib_table_hash[i]); fib4_rules_init(); register_netdevice_notifier(&fib_netdev_notifier); //網絡設備狀態的變化 register_inetaddr_notifier(&fib_inetaddr_notifier); //網絡設備上IP配置的變化 nl_fib_lookup_init(); //netlink fib 查詢初始化 使用 ?? rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL); rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL); rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib); } 當內核配置了策略路由 CONFIG_IP_MULTIPLE_TABLES,系統可以使用多達256張轉發表, fib_lookup()經過規則表(fib_rule)匹配相應的轉發表,再進一步匹配目標地址的路由.咱們瀏覽使用策略路由的代碼. 
void __init fib4_rules_init(void) { BUG_ON(fib_default_rules_init()); fib_rules_register(&fib4_rules_ops);//註冊這個路由規則操做,就是添加到 static LIST_HEAD(rules_ops); 鏈表中 } 默認規則初始化,初始化三個路由規則,參看下面路由規則操做實現 這三條規則以鏈表的形式組織在一塊兒,系統管理員能夠經過系統命令操做向這個鏈表中再添加其它優先級的路由規則。 三條內置規則對數據包的源和目的地址,以及服務類型都沒有做任何特殊限制,因此任何一個數據包均可以按順序查找這三個路由表,直到找到相應的路由爲止。 static int __init fib_default_rules_init(void) { int err; //RT_TABLE_LOCAL優先級最高,爲0,RT_TABLE_MAIN爲0x7FFE, RT_TABLE_DEFAULT爲0x7FFF err = fib_default_rule_add(&fib4_rules_ops, 0, RT_TABLE_LOCAL, FIB_RULE_PERMANENT); if (err < 0) return err; err = fib_default_rule_add(&fib4_rules_ops, 0x7FFE, RT_TABLE_MAIN, 0); if (err < 0) return err; err = fib_default_rule_add(&fib4_rules_ops, 0x7FFF, RT_TABLE_DEFAULT, 0); if (err < 0) return err; return 0; } int fib_default_rule_add(struct fib_rules_ops *ops, u32 pref, u32 table, u32 flags) { struct fib_rule *r; r = kzalloc(ops->rule_size, GFP_KERNEL); if (r == NULL) return -ENOMEM; atomic_set(&r->refcnt, 1); r->action = FR_ACT_TO_TBL; r->pref = pref; //優先級 r->table = table; //表類型 r->flags = flags; /* The lock is not required here, the list in unreacheable at the moment this function is called */ list_add_tail(&r->list, &ops->rules_list); //添加到規則鏈表中 return 0; } [/初始化] [概念解釋] 看代碼以前,咱們先解釋一些概念。 scope 路由和IP地址都有scope(做用範圍或做用域),這告訴內核它們在哪些狀況下是有意義的,是能夠被使用的。 在Linux中,路由的scope表示到目的網絡的距離。IP地址的scope表示該IP地址距離本地主機有多遠,某種程度上也告訴你該地址的owner距離本地主機有多遠。 路由的scope被保存在fib_alias數據結構內的fa_scope字段。下面按照scope遞減順序給出IPv4路由代碼中使用的主要scope,值越小表示的範圍越大: RT_SCOPE_NOWHERE //255 它被代碼視爲非法scope。它的字面含義是路由項不通往任何地方,這基本上就意味着沒有到達目的地的路由。 RT_SCOPE_HOST //254 scope爲RT_SCOPE_HOST的路由項的例子是,爲本地接口配置IP地址時自動建立的路由表項。 RT_SCOPE_LINK //253 爲本地接口配置地址時,派生的目的地爲本地網絡地址(由網絡掩碼定義)和子網廣播地址的路由表項的scope就是RT_SCOPE_LINK。 RT_SCOPE_UNIVERSE //0 該scope被用於全部的通往遠程非直連目的地的路由表項(也就是須要一個下一跳網關的路由項)。 地址的scope被保存在in_ifaddr結構內的ifa_scope字段。對設備上配置的每個IP地址對應一個in_ifaddr實例。 當一個地址只用於主機自身內部通訊時scope爲host,該地址在主機之外不知道而且不能被使用。例如迴環地址127.0.0.1。 當一個地址只在一個局域網(即每一臺計算機經過鏈路層互聯的一個網絡)內有意義且只在局域網內使用時,該地址的scope爲link。 
例如子網的廣播地址。子網內一臺主機發送到子網廣播地址的報文被送給同一子網內的其它主機。 當一個地址能夠在任何地方使用時scope爲universe,這是大多數地址的缺省scope。 路由表項中的下一跳網關是另外一個具備scope的對象類型。每一條路由項能夠有零個、一個或多個下一跳,每一個下一跳是由一個fib_nh結構表示。 fib_nh結構中有nh_gw和nh_scope兩個字段:nh_gw是下一跳網關的IP地址,nh_scope是該地址的scope (這兩個字段表示了從本地主機到這個下一跳網關的路由表項的scope)。 [/概念解釋] [輸入流程] IP接收函數ip_rcv->ip_rcv_finish->ip_route_input對一個輸入的包進行路由查找,完畢後調用dst_input->skb->dst->input(skb); int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev) { struct rtable * rth; unsigned hash; int iif = dev->ifindex; //設備索引 tos &= IPTOS_RT_MASK; hash = rt_hash(daddr, saddr, iif); //源地址,目的地址,設備索引 rcu_read_lock(); for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; rth = rcu_dereference(rth->u.dst.rt_next)) { if (rth->fl.fl4_dst == daddr && rth->fl.fl4_src == saddr && rth->fl.iif == iif && rth->fl.oif == 0 && rth->fl.mark == skb->mark && rth->fl.fl4_tos == tos) { //在路由高速緩存中找到了匹配 dst_use(&rth->u.dst, jiffies); RT_CACHE_STAT_INC(in_hit); rcu_read_unlock(); skb->dst = (struct dst_entry*)rth; return 0; } RT_CACHE_STAT_INC(in_hlist_search); } rcu_read_unlock(); if (MULTICAST(daddr)) { //目的地址是多播地址 struct in_device *in_dev; rcu_read_lock(); if ((in_dev = __in_dev_get_rcu(dev)) != NULL) { int our = ip_check_mc(in_dev, daddr, saddr, ip_hdr(skb)->protocol);//檢查目的地址是本地配置的多播地址 if (our #ifdef CONFIG_IP_MROUTE || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev)) //內核編譯時支持多播路由,且轉發開啓 #endif ) { rcu_read_unlock(); return ip_route_input_mc(skb, daddr, saddr, tos, dev, our); //參考下面多播路由處理 } } rcu_read_unlock(); return -EINVAL; } return ip_route_input_slow(skb, daddr, saddr, tos, dev); } 非多播路由處理 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev) { struct fib_result res; struct in_device *in_dev = in_dev_get(dev); //dev->ip_ptr指向這個結構 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = daddr, .saddr = saddr, .tos = tos, .scope = RT_SCOPE_UNIVERSE, } }, .mark = skb->mark, .iif = dev->ifindex }; unsigned flags = 
0; u32 itag = 0; struct rtable * rth; unsigned hash; __be32 spec_dst; int err = -EINVAL; int free_res = 0; /* IP on this device is disabled. */ if (!in_dev) goto out; /* Check for the most weird martians, which can be not detected by fib_lookup. */ //源地址是多播,非法或迴環 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr)) goto martian_source; //目的地址爲受限廣播地址(255.255.255.255),或源地址和目的地址均爲0,按廣播輸入處理 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0)) goto brd_input; /* Accept zero addresses only to limited broadcast; * I even do not know to fix it or not. Waiting for complains :-) */ if (ZERONET(saddr)) //源地址屬於零網(高8位爲0) goto martian_source; //目的地址爲非法,零類或迴環地址 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr)) goto martian_destination; /* Now we are ready to route packet. */ //在fib中查詢路由信息,將路由查詢結果保存在fib_result結構的res中 if ((err = fib_lookup(&fl, &res)) != 0) { if (!IN_DEV_FORWARD(in_dev))//沒有找到並且設備不能轉發 goto e_hostunreach; goto no_route; //沒找到但能夠轉發 } free_res = 1; //已經找到 RT_CACHE_STAT_INC(in_slow_tot); if (res.type == RTN_BROADCAST) //路由類型爲廣播 goto brd_input; if (res.type == RTN_LOCAL) {//路由類型爲本地,也就是到本地的包 int result; //驗證源地址有效 result = fib_validate_source(saddr, daddr, tos, init_net.loopback_dev->ifindex, dev, &spec_dst, &itag); if (result < 0) goto martian_source; if (result) flags |= RTCF_DIRECTSRC; spec_dst = daddr; goto local_input; } //須要轉發的包,查看是否容許轉發 if (!IN_DEV_FORWARD(in_dev)) goto e_hostunreach; if (res.type != RTN_UNICAST) //路由類型不是單播 goto martian_destination; //須要轉發的包,建立路由項 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos); done: in_dev_put(in_dev); if (free_res) fib_res_put(&res); out: return err; brd_input: //目的地址爲廣播地址或源和目的地址爲0或路由結果顯示爲廣播,會跳到這 if (skb->protocol != htons(ETH_P_IP)) //非ip協議 goto e_inval; if (ZERONET(saddr)) //源地址屬於零網(高8位爲0),選擇一個地址 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); else { //驗證源地址 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag); if (err < 0) //源地址不正確 goto martian_source; if (err) flags |= RTCF_DIRECTSRC; //告訴ICMP代碼,不該當對地址掩碼請求消息做出迴應. 
} flags |= RTCF_BROADCAST; //路由的目的地址是一個廣播地址 res.type = RTN_BROADCAST; //目的地址是一個廣播地址。匹配的ingress報文以廣播方式送往本地,匹配的egress報文以廣播方式發送出去. RT_CACHE_STAT_INC(in_brd); local_input: rth = dst_alloc(&ipv4_dst_ops); //分配路由緩衝項 if (!rth) goto e_nobufs; rth->u.dst.output= ip_rt_bug; //到本地的包,應該沒有發送調用 atomic_set(&rth->u.dst.__refcnt, 1); //DST_HOST被TCP使用,表示主機路由(即它不是到網絡或到一個廣播/多播地址的路由) rth->u.dst.flags= DST_HOST; if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) rth->u.dst.flags |= DST_NOPOLICY; //IPSEC使用 rth->fl.fl4_dst = daddr; rth->rt_dst = daddr; rth->fl.fl4_tos = tos; rth->fl.mark = skb->mark; rth->fl.fl4_src = saddr; rth->rt_src = saddr; #ifdef CONFIG_NET_CLS_ROUTE rth->u.dst.tclassid = itag; #endif rth->rt_iif = rth->fl.iif = dev->ifindex; rth->u.dst.dev = init_net.loopback_dev; //指向迴環設備 dev_hold(rth->u.dst.dev); rth->idev = in_dev_get(rth->u.dst.dev); rth->rt_gateway = daddr; rth->rt_spec_dst= spec_dst; rth->u.dst.input= ip_local_deliver; rth->rt_flags = flags|RTCF_LOCAL; //路由目的地址爲本地地址 if (res.type == RTN_UNREACHABLE) { //路由類型爲不可達 rth->u.dst.input= ip_error; rth->u.dst.error= -err; //保存錯誤類型,ip_error會根據這個類型發送相應的icmp包 rth->rt_flags &= ~RTCF_LOCAL; } rth->rt_type = res.type; hash = rt_hash(daddr, saddr, fl.iif); err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst); //添加到hash表中 goto done; no_route: //路由信息庫查找失敗時 RT_CACHE_STAT_INC(in_no_route); spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); res.type = RTN_UNREACHABLE; //結果路由類型爲不可達 if (err == -ESRCH) err = -ENETUNREACH; goto local_input; /* Do not cache martian addresses: they should be logged (RFC1812)*/ martian_destination: //目的地址出錯 RT_CACHE_STAT_INC(in_martian_dst); #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) printk(KERN_WARNING "martian destination %u.%u.%u.%u from %u.%u.%u.%u, dev %s\n", NIPQUAD(daddr), NIPQUAD(saddr), dev->name); #endif e_hostunreach: err = -EHOSTUNREACH; //函數返回後,根據這個值記錄統計信息,而後丟棄這個包 goto done; e_inval: err = -EINVAL; //返回後直接丟棄 goto done; e_nobufs: err = 
-ENOBUFS;//返回後直接丟棄 goto done; martian_source: //驗證源地址,源地址出錯 //處理出錯源地址,除了RT_CACHE_STAT_INC(in_martian_src); 若是開啓CONFIG_IP_ROUTE_VERBOSE打印一些信息,其餘什麼也不作. ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); //處理出錯源地址,RT_CACHE_STAT_INC(in_martian_src); goto e_inval; } fib查詢函數 int fib_lookup(struct flowi *flp, struct fib_result *res) { struct fib_lookup_arg arg = { .result = res, }; int err; err = fib_rules_lookup(&fib4_rules_ops, flp, 0, &arg); res->r = arg.rule; //找到的具體路由規則 return err; } 實際查找函數 int fib_rules_lookup(struct fib_rules_ops *ops, struct flowi *fl, int flags, struct fib_lookup_arg *arg) { struct fib_rule *rule; int err; rcu_read_lock(); list_for_each_entry_rcu(rule, &ops->rules_list, list) { //連表中搜尋全部規則 jumped: if (!fib_rule_match(rule, ops, fl, flags)) //查看規則是否匹配 continue; if (rule->action == FR_ACT_GOTO) { //活動類型爲跳轉到其餘規則 struct fib_rule *target; target = rcu_dereference(rule->ctarget); if (target == NULL) { continue; } else { rule = target; goto jumped; } } else if (rule->action == FR_ACT_NOP) continue; else //根據規則查找路由表中內容是否匹配,參考下面路由規則操做實現 err = ops->action(rule, fl, flags, arg); if (err != -EAGAIN) { fib_rule_get(rule); arg->rule = rule; goto out; } } err = -ESRCH; out: rcu_read_unlock(); return err; } 匹配函數 static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, struct flowi *fl, int flags) { int ret = 0; //規則中記錄了設備索引,檢查是否匹配索引 if (rule->ifindex && (rule->ifindex != fl->iif)) goto out; if ((rule->mark ^ fl->mark) & rule->mark_mask) goto out; ret = ops->match(rule, fl, flags); //調用操做匹配函數 out: return (rule->flags & FIB_RULE_INVERT) ? 
!ret : ret; } 驗證源地址 int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, struct net_device *dev, __be32 *spec_dst, u32 *itag) { struct in_device *in_dev; struct flowi fl = { .nl_u = { .ip4_u = //反向查詢 { .daddr = src, .saddr = dst, .tos = tos } }, .iif = oif }; //這個索引值爲迴環設備的索引值 struct fib_result res; int no_addr, rpf; int ret; no_addr = rpf = 0; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (in_dev) { no_addr = in_dev->ifa_list == NULL; //設備是否有配置ip地址 rpf = IN_DEV_RPFILTER(in_dev); } rcu_read_unlock(); if (in_dev == NULL) goto e_inval; if (fib_lookup(&fl, &res)) goto last_resort; if (res.type != RTN_UNICAST) //反向查找後路由類型指出目的地址不是一個單播地址 goto e_inval_res; *spec_dst = FIB_RES_PREFSRC(res); //進行反向路徑查找時來輔助查找realms fib_combine_itag(itag, &res); #ifdef CONFIG_IP_ROUTE_MULTIPATH if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1) //適合的下一跳不止一個 #else if (FIB_RES_DEV(res) == dev) #endif { ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; fib_res_put(&res); return ret; } fib_res_put(&res); if (no_addr) //沒有ip地址 goto last_resort; if (rpf) goto e_inval; fl.oif = dev->ifindex; //外出接口是進入接口 ret = 0; if (fib_lookup(&fl, &res) == 0) { if (res.type == RTN_UNICAST) { *spec_dst = FIB_RES_PREFSRC(res); ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; } fib_res_put(&res); } return ret; last_resort: if (rpf) goto e_inval; *spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); *itag = 0; return 0; e_inval_res: fib_res_put(&res); e_inval: return -EINVAL; } 建立路由項 static inline int ip_mkroute_input(struct sk_buff *skb, struct fib_result* res, const struct flowi *fl, struct in_device *in_dev, __be32 daddr, __be32 saddr, u32 tos) { struct rtable* rth = NULL; int err; unsigned hash; #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0) //當查找返回的路由是一條多路徑路由項時,須要選出下一跳 fib_select_multipath(fl, res); #endif //建立一個路由緩存項,由rth返回 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth); if (err) return err; /* put it into the cache */ 
//根據目的,源地址和包進入設備索引計算hash值 hash = rt_hash(daddr, saddr, fl->iif); return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst); //將結果存入hash表和 skb->dst指針. } 當一條路由項有多個下一跳時可用時,選擇出下一跳 void fib_select_multipath(const struct flowi *flp, struct fib_result *res) { struct fib_info *fi = res->fi; int w; spin_lock_bh(&fib_multipath_lock); //該字段被初始化爲fib_info實例的全部下一跳權值(fib_nh->nh_weight)的總和, //但不包含因爲某些緣由而不能使用的下一跳(帶有RTNH_F_DEAD標誌)。每當調用fib_select_multipath來選擇一個下一跳時, //fib_power的值遞減。當該值遞減爲小於或等於零時被從新初始化。 if (fi->fib_power <= 0) { int power = 0; change_nexthops(fi) { if (!(nh->nh_flags&RTNH_F_DEAD)) { power += nh->nh_weight; //下一跳的權值,默認爲 1 //nh->nh_power是使該下一跳被選中的tokens。這個值是在初始化fib_info->fib_power時, //首先被初始化爲fib_nh->nh_weight。每當fib_select_multipath選中該下一跳時就遞減該值。 //當這個值遞減爲零時,再也不選中該下一跳,直到nh_power被從新初始化爲fib_nh->nh_weight。 nh->nh_power = nh->nh_weight; } } endfor_nexthops(fi); fi->fib_power = power; if (power <= 0) { spin_unlock_bh(&fib_multipath_lock); /* Race condition: route has just become dead. */ res->nh_sel = 0; return; } } /* w should be random number [0..fi->fib_power-1], it is pretty bad approximation. */ w = jiffies % fi->fib_power; change_nexthops(fi) { if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) { if ((w -= nh->nh_power) <= 0) { //遞減 w, 當fib_power爲1時,可能選不到那個惟一的nh_power爲1的下一跳 nh->nh_power--; fi->fib_power--; res->nh_sel = nhsel; //記錄下一跳的位置 spin_unlock_bh(&fib_multipath_lock); return; } } } endfor_nexthops(fi); /* Race condition: route has just become dead. */ res->nh_sel = 0; spin_unlock_bh(&fib_multipath_lock); } 具體建立路由緩存函數 static inline int __mkroute_input(struct sk_buff *skb, struct fib_result* res, struct in_device *in_dev, __be32 daddr, __be32 saddr, u32 tos, struct rtable **result) { struct rtable *rth; int err; struct in_device *out_dev; unsigned flags = 0; __be32 spec_dst; u32 itag; //獲取外出接口,增長引用計數 out_dev = in_dev_get(FIB_RES_DEV(*res)); if (out_dev == NULL) { //沒有外出接口,出錯 if (net_ratelimit()) printk(KERN_CRIT "Bug in ip_route_input_slow(). 
Please, report\n"); return -EINVAL; } //驗證源地址是否正確 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res), in_dev->dev, &spec_dst, &itag); if (err < 0) { ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); err = -EINVAL; goto cleanup; } //該標誌主要用於告訴ICMP代碼,不該當對地址掩碼請求消息做出迴應。 //每當調用fib_validate_source檢查到接收報文的源地址經過一個本地做用範圍(RT_SCOPE_HOST)的下一跳是可達時,就設置該標誌 if (err) flags |= RTCF_DIRECTSRC; if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) && (IN_DEV_SHARED_MEDIA(out_dev) || inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) flags |= RTCF_DOREDIRECT; //當必須向源站送回ICMP_REDIRECT消息時,設置 if (skb->protocol != htons(ETH_P_IP)) { //非IP協議 /* Not IP (i.e. ARP). Do not create route, if it is invalid for proxy arp. DNAT routes are always valid. */ if (out_dev == in_dev && !(flags & RTCF_DNAT)) { err = -EINVAL; goto cleanup; } } rth = dst_alloc(&ipv4_dst_ops); //分配路由表項 if (!rth) { err = -ENOBUFS; goto cleanup; } atomic_set(&rth->u.dst.__refcnt, 1); rth->u.dst.flags = DST_HOST; if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) rth->u.dst.flags |= DST_NOPOLICY; if (IN_DEV_CONF_GET(out_dev, NOXFRM)) rth->u.dst.flags |= DST_NOXFRM; rth->fl.fl4_dst = daddr; rth->rt_dst = daddr; rth->fl.fl4_tos = tos; rth->fl.mark = skb->mark; rth->fl.fl4_src = saddr; rth->rt_src = saddr; rth->rt_gateway = daddr; rth->rt_iif = rth->fl.iif = in_dev->dev->ifindex; //包進入時設備的索引 rth->u.dst.dev = (out_dev)->dev; //包輸出設備結構 dev_hold(rth->u.dst.dev); rth->idev = in_dev_get(rth->u.dst.dev); rth->fl.oif = 0; //外出接口索引設置爲0 ??? why not is out_dev->dev->ifindex ? 
rth->rt_spec_dst= spec_dst; rth->u.dst.input = ip_forward; //ip 轉發函數,路由查找結束後會調用 rth->u.dst.output = ip_output; rt_set_nexthop(rth, res, itag); //設置下一條網關 rth->rt_flags = flags; *result = rth; //保存結果 err = 0; cleanup: /* release the working reference to the output device */ in_dev_put(out_dev); return err; } 分配路由高速緩存項 void * dst_alloc(struct dst_ops * ops) { struct dst_entry * dst; //有垃圾收集器並且項數已經超過最大限制 if (ops->gc && atomic_read(&ops->entries) > ops->gc_thresh) { if (ops->gc()) //啓動垃圾回收 return NULL; } //雖然這寫的是分配dst_entry,可是在初始化是內存大小是sizeof(struct rtable) dst = kmem_cache_zalloc(ops->kmem_cachep, GFP_ATOMIC); if (!dst) return NULL; atomic_set(&dst->__refcnt, 0); dst->ops = ops; //操做 dst->lastuse = jiffies; dst->path = dst; dst->input = dst->output = dst_discard; //初始化一個默認函數,調用kfree_skb(skb); #if RT_CACHE_DEBUG >= 2 atomic_inc(&dst_total); #endif atomic_inc(&ops->entries); //增長項數 return dst; } 路由緩存中保存下一跳地址 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) { struct fib_info *fi = res->fi; if (fi) { //找到路由信息 //#define FIB_RES_NH(res) ((res).fi->fib_nh[(res).nh_sel]) //#define FIB_RES_GW(res) (FIB_RES_NH(res).nh_gw) if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) //有下一跳網關 rt->rt_gateway = FIB_RES_GW(*res); //記錄下一跳網關地址 memcpy(rt->u.dst.metrics, fi->fib_metrics, sizeof(rt->u.dst.metrics)); if (fi->fib_mtu == 0) { rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu; if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) && rt->rt_gateway != rt->rt_dst && rt->u.dst.dev->mtu > 576) rt->u.dst.metrics[RTAX_MTU-1] = 576; } #ifdef CONFIG_NET_CLS_ROUTE rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid; #endif } else rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu; if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0) rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl; if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU) rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU; if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0) rt->u.dst.metrics[RTAX_ADVMSS-1] = 
max_t(unsigned int, rt->u.dst.dev->mtu - 40, ip_rt_min_advmss); if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40) rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40; #ifdef CONFIG_NET_CLS_ROUTE #ifdef CONFIG_IP_MULTIPLE_TABLES set_class_tag(rt, fib_rules_tclass(res)); #endif set_class_tag(rt, itag); #endif rt->rt_type = res->type; } 放入hash表 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp) { struct rtable *rth, **rthp; unsigned long now; struct rtable *cand, **candp; u32 min_score; int chain_length; int attempts = !in_softirq(); restart: chain_length = 0; min_score = ~(u32)0; cand = NULL; candp = NULL; now = jiffies; rthp = &rt_hash_table[hash].chain; //指向頭 spin_lock_bh(rt_hash_lock_addr(hash)); while ((rth = *rthp) != NULL) { //循環直到空 if (compare_keys(&rth->fl, &rt->fl)) { //找到相同的項 *rthp = rth->u.dst.rt_next; rcu_assign_pointer(rth->u.dst.rt_next, rt_hash_table[hash].chain); rcu_assign_pointer(rt_hash_table[hash].chain, rth); dst_use(&rth->u.dst, now); //增長引用和使用計數,更新時間 (dst->__refcnt ->__use ->lastuse) spin_unlock_bh(rt_hash_lock_addr(hash)); rt_drop(rt); //釋放這個分配的路由項 *rp = rth; //skb->dst 保存這個路由項 return 0; } //不匹配,這個路由緩存項引用計數爲 0 if (!atomic_read(&rth->u.dst.__refcnt)) { u32 score = rt_score(rth); if (score <= min_score) { //價值最小 cand = rth; candp = rthp; min_score = score; } } chain_length++; rthp = &rth->u.dst.rt_next; //移動到下一個 } if (cand) { /* ip_rt_gc_elasticity used to be average length of chain length, when exceeded gc becomes really aggressive. * The second limit is less certain. At the moment it allows only 2 entries per bucket. We will see. */ if (chain_length > ip_rt_gc_elasticity) { //超過hash表桶長度 *candp = cand->u.dst.rt_next; rt_free(cand); } } /* Try to bind route to arp only if it is output route or unicast forwarding path. 
*/ //單播轉發路由和本地生成數據包輸出路由(iif = 0) if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { //須要ARP來解析下一跳的L2地址。而轉發目的地爲廣播地址, //多播地址和本機地址則不須要ARP解析,由於使用其它方法能夠解析獲得這個地址 int err = arp_bind_neighbour(&rt->u.dst); //將路由項和arp綁定 if (err) { spin_unlock_bh(rt_hash_lock_addr(hash)); if (err != -ENOBUFS) { rt_drop(rt); return err; } /* Neighbour tables are full and nothing can be released. Try to shrink route cache, * it is most likely it holds some neighbour records. */ if (attempts-- > 0) { //再也不軟中斷中 int saved_elasticity = ip_rt_gc_elasticity; int saved_int = ip_rt_gc_min_interval; ip_rt_gc_elasticity = 1; ip_rt_gc_min_interval = 0; rt_garbage_collect(); //進行路由緩存的回收 ip_rt_gc_min_interval = saved_int; ip_rt_gc_elasticity = saved_elasticity; goto restart; } if (net_ratelimit()) printk(KERN_WARNING "Neighbour table overflow.\n"); rt_drop(rt); return -ENOBUFS; } } //插入到頭 rt->u.dst.rt_next = rt_hash_table[hash].chain; rt_hash_table[hash].chain = rt; spin_unlock_bh(rt_hash_lock_addr(hash)); *rp = rt; //skb->dst指向這個路由緩存 return 0; } 給定一個設備dev,一個IP地址dst,和一個做用範圍scope,返回做用範圍爲scope的第一個主地址,在經過出設備dev向地址dst發送報文時使用. 每一個設備可能配置有多個地址,並且每一個地址有各自的scope. 提供dst參數的緣由在於,若是在設備dev上配置的不一樣IP地址屬於不一樣子網,程序就能夠返回與dst在同一子網的IP地址. __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) { __be32 addr = 0; struct in_device *in_dev; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (!in_dev) goto no_in_dev; for_primary_ifa(in_dev) { if (ifa->ifa_scope > scope) //地址範圍要更廣, > scope表示更窄 continue; if (!dst || inet_ifa_match(dst, ifa)) { addr = ifa->ifa_local; break; } if (!addr) addr = ifa->ifa_local; } endfor_ifa(in_dev); no_in_dev: rcu_read_unlock(); if (addr) goto out; //若是在dev上配置的地址都不知足由scope和dst限定的條件,程序則嘗試其餘設備,檢驗是否存在一個IP地址,配置有所要求的scope。 //由於loopback_dev是dev_baselist鏈中所插入的第一個設備,因此首先檢查的就是它. 
read_lock(&dev_base_lock); rcu_read_lock(); for_each_netdev(&init_net, dev) {//循環全部設備 if ((in_dev = __in_dev_get_rcu(dev)) == NULL) continue; for_primary_ifa(in_dev) { if (ifa->ifa_scope != RT_SCOPE_LINK && ifa->ifa_scope <= scope) { addr = ifa->ifa_local; goto out_unlock_both; } } endfor_ifa(in_dev); } out_unlock_both: read_unlock(&dev_base_lock); rcu_read_unlock(); out: return addr; } [/輸入流程] [輸出流程] 用於出流量的路由查找,這些流量是由本地生成,可能被送往本地或被髮送出去 int ip_route_output_key(struct rtable **rp, struct flowi *flp) { return ip_route_output_flow(rp, flp, NULL, 0); } int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags) { int err; if ((err = __ip_route_output_key(rp, flp)) != 0) return err; if (flp->proto) { if (!flp->fl4_src) flp->fl4_src = (*rp)->rt_src; if (!flp->fl4_dst) flp->fl4_dst = (*rp)->rt_dst; err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags); //IPSEC安全路由 if (err == -EREMOTE) err = ipv4_dst_blackhole(rp, flp, sk); return err; } } 查找路由,當緩存查找路由失敗時調用ip_route_output_slow int __ip_route_output_key(struct rtable **rp, const struct flowi *flp) { unsigned hash; struct rtable *rth; //hash目的/源和外出接口索引 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif); rcu_read_lock_bh(); //在路由高速緩存中查找 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; rth = rcu_dereference(rth->u.dst.rt_next)) { if (rth->fl.fl4_dst == flp->fl4_dst && rth->fl.fl4_src == flp->fl4_src && rth->fl.iif == 0 && rth->fl.oif == flp->oif && rth->fl.mark == flp->mark && !((rth->fl.fl4_tos ^ flp->fl4_tos) & (IPTOS_RT_MASK | RTO_ONLINK))) { dst_use(&rth->u.dst, jiffies); RT_CACHE_STAT_INC(out_hit); rcu_read_unlock_bh(); *rp = rth; //找到 return 0; } RT_CACHE_STAT_INC(out_hlist_search); } rcu_read_unlock_bh(); return ip_route_output_slow(rp, flp); } 主要的路由解析函數 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) { //#define RT_FL_TOS(oldflp) ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK))) //調用方能夠將fl4_tos字段的兩個最低位(two least significant 
bits)用於存儲flags, //ip_route_output_slow可使用該flags來肯定待搜索路由項的scope。由於TOS字段不須要佔用整個八位,因此這種方法是可行的。 u32 tos = RT_FL_TOS(oldflp); //源IP地址、目的IP地址和防火牆標記是直接從函數的輸入參數拷貝而來 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = oldflp->fl4_dst, .saddr = oldflp->fl4_src, .tos = tos & IPTOS_RT_MASK, .scope = ((tos & RTO_ONLINK) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE), } }, .mark = oldflp->mark, .iif = init_net.loopback_dev->ifindex, //由於調用ip_route_output_slow只是爲了路由本地生成的流量, //因此搜索key fl中的源設備被初始化爲迴環設備 .oif = oldflp->oif }; struct fib_result res; unsigned flags = 0; struct net_device *dev_out = NULL; int free_res = 0; int err; res.fi = NULL; #ifdef CONFIG_IP_MULTIPLE_TABLES res.r = NULL; #endif if (oldflp->fl4_src) { //有源地址 err = -EINVAL; //源地址爲,多播,非法或 if (MULTICAST(oldflp->fl4_src) || BADCLASS(oldflp->fl4_src) || ZERONET(oldflp->fl4_src)) goto out; /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ dev_out = ip_dev_find(oldflp->fl4_src); //找出地址所在設備 if (dev_out == NULL) goto out; //沒有指定外出設備且目的地址爲多播或廣播 if (oldflp->oif == 0 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) { fl.oif = dev_out->ifindex; //指定外出設備爲源地址所在設備 goto make_route; } if (dev_out) dev_put(dev_out); dev_out = NULL; } if (oldflp->oif) { //指定了外出設備 dev_out = dev_get_by_index(&init_net, oldflp->oif); //根據索引號找出設備 err = -ENODEV; if (dev_out == NULL) goto out; /* RACE: Check return value of inet_select_addr instead. 
*/ if (__in_dev_get_rtnl(dev_out) == NULL) { dev_put(dev_out); goto out; /* Wrong error code */ } //目的地址爲多播或廣播 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) { if (!fl.fl4_src) //沒有指定源,在外出設備上選擇一個 fl.fl4_src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); goto make_route; } if (!fl.fl4_src) { if (MULTICAST(oldflp->fl4_dst)) fl.fl4_src = inet_select_addr(dev_out, 0, fl.fl4_scope); else if (!oldflp->fl4_dst) //當報文被送往本地 fl.fl4_src = inet_select_addr(dev_out, 0, RT_SCOPE_HOST); } } if (!fl.fl4_dst) { //沒有指定目的 fl.fl4_dst = fl.fl4_src; if (!fl.fl4_dst) //沒有目的,說明源和目的都爲0,這些報文被送往本地,而不是被髮送出去 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); //設置爲127.0.0.1 if (dev_out) dev_put(dev_out); dev_out = init_net.loopback_dev; //迴環設備 dev_hold(dev_out); fl.oif = init_net.loopback_dev->ifindex; res.type = RTN_LOCAL; //到本地的 flags |= RTCF_LOCAL; goto make_route; } if (fib_lookup(&fl, &res)) { //查找 res.fi = NULL; //查找失敗 if (oldflp->oif) { //有外出設備 if (fl.fl4_src == 0) fl.fl4_src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); res.type = RTN_UNICAST; //單播 goto make_route; } if (dev_out) dev_put(dev_out); err = -ENETUNREACH; //網絡不可達 goto out; } free_res = 1; //查找成功 if (res.type == RTN_LOCAL) { //路由結果爲到本地 if (!fl.fl4_src) fl.fl4_src = fl.fl4_dst; //沒有源,就把源和目的相同 if (dev_out) dev_put(dev_out); dev_out = init_net.loopback_dev; //迴環設備 dev_hold(dev_out); fl.oif = dev_out->ifindex; if (res.fi) fib_info_put(res.fi); res.fi = NULL; flags |= RTCF_LOCAL; goto make_route; } #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res.fi->fib_nhs > 1 && fl.oif == 0) //當查找返回的路由是一條多路徑路由項時,須要選出下一跳 fib_select_multipath(&fl, &res); else #endif //當res.prefixlen字段爲0時表示是缺省路由,這表示「前綴長度」,即與該地址相關的網絡掩碼長度爲0 //注意當搜索key指定了要使用的egress設備(fl.oif)時,不須要調用這兩個程序。這時res已經包含了最終的轉發決策。 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif) //當查找返回的路由是缺省路由時,須要選擇使用的缺省網關 fib_select_default(&fl, &res); if (!fl.fl4_src) fl.fl4_src = FIB_RES_PREFSRC(res); if (dev_out) dev_put(dev_out); dev_out = FIB_RES_DEV(res); dev_hold(dev_out); 
fl.oif = dev_out->ifindex; make_route: //分配輸出路由高速緩存,插入到hash表 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); if (free_res) fib_res_put(&res); if (dev_out) dev_put(dev_out); out: return err; } 選擇缺省的網關 void fib_select_default(const struct flowi *flp, struct fib_result *res) { if (res->r && res->r->action == FR_ACT_TO_TBL && FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) { struct fib_table *tb; if ((tb = fib_get_table(res->r->table)) != NULL) tb->tb_select_default(tb, flp, res); //參考下面路由表函數實現 } } 路由高速緩存分配,插入hash表 static inline int ip_mkroute_output(struct rtable **rp, struct fib_result* res, const struct flowi *fl, const struct flowi *oldflp, struct net_device *dev_out, unsigned flags) { struct rtable *rth = NULL; int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags); unsigned hash; if (err == 0) { hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif); //計算hash值 err = rt_intern_hash(hash, rth, rp); //插入hash表 } return err; } 建立路由緩存 static inline int __mkroute_output(struct rtable **result, struct fib_result* res, const struct flowi *fl, const struct flowi *oldflp, struct net_device *dev_out, unsigned flags) { struct rtable *rth; struct in_device *in_dev; u32 tos = RT_FL_TOS(oldflp); int err = 0; //源地址爲127.0.0.1且外出設備不是迴環,出錯 if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK)) return -EINVAL; if (fl->fl4_dst == htonl(0xFFFFFFFF)) //目的是廣播 res->type = RTN_BROADCAST; else if (MULTICAST(fl->fl4_dst)) //多播 res->type = RTN_MULTICAST; else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst)) //錯誤地址 return -EINVAL; if (dev_out->flags & IFF_LOOPBACK) //迴環設備,到本地 flags |= RTCF_LOCAL; /* get work reference to inet device */ in_dev = in_dev_get(dev_out); if (!in_dev) return -EINVAL; if (res->type == RTN_BROADCAST) { //路由結果類型爲廣播 flags |= RTCF_BROADCAST | RTCF_LOCAL; //加上本地 if (res->fi) { fib_info_put(res->fi); res->fi = NULL; } } else if (res->type == RTN_MULTICAST) { //類型爲多播 flags |= RTCF_MULTICAST|RTCF_LOCAL; if 
(!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, oldflp->proto)) flags &= ~RTCF_LOCAL; /* If multicast route do not exist use default one, but do not gateway in this case. Yes, it is hack. */ if (res->fi && res->prefixlen < 4) { fib_info_put(res->fi); res->fi = NULL; } } //分配高速緩存項 rth = dst_alloc(&ipv4_dst_ops); if (!rth) { err = -ENOBUFS; goto cleanup; } atomic_set(&rth->u.dst.__refcnt, 1); rth->u.dst.flags= DST_HOST; if (IN_DEV_CONF_GET(in_dev, NOXFRM)) rth->u.dst.flags |= DST_NOXFRM; if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) rth->u.dst.flags |= DST_NOPOLICY; rth->fl.fl4_dst = oldflp->fl4_dst; rth->fl.fl4_tos = tos; rth->fl.fl4_src = oldflp->fl4_src; rth->fl.oif = oldflp->oif; rth->fl.mark = oldflp->mark; rth->rt_dst = fl->fl4_dst; rth->rt_src = fl->fl4_src; rth->rt_iif = oldflp->oif ? : dev_out->ifindex; /* get references to the devices that are to be hold by the routing cache entry */ rth->u.dst.dev = dev_out; dev_hold(dev_out); rth->idev = in_dev_get(dev_out); rth->rt_gateway = fl->fl4_dst; rth->rt_spec_dst= fl->fl4_src; rth->u.dst.output = ip_output; RT_CACHE_STAT_INC(out_slow_tot); if (flags & RTCF_LOCAL) { rth->u.dst.input = ip_local_deliver; rth->rt_spec_dst = fl->fl4_dst; } if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { rth->rt_spec_dst = fl->fl4_src; if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) { rth->u.dst.output = ip_mc_output; RT_CACHE_STAT_INC(out_slow_mc); } #ifdef CONFIG_IP_MROUTE if (res->type == RTN_MULTICAST) { if (IN_DEV_MFORWARD(in_dev) && !LOCAL_MCAST(oldflp->fl4_dst)) { rth->u.dst.input = ip_mr_input; rth->u.dst.output = ip_mc_output; } } #endif } //給定一個路由緩存項rtable和一個路由表查找結果res,完成rtable內各字段的初始化, //諸如rt_gateway、所嵌入的dst_entry結構的metrics向量和路由標籤初始化等等. 
rt_set_nexthop(rth, res, 0); rth->rt_flags = flags; *result = rth; cleanup: /* release work reference to inet device */ in_dev_put(in_dev); return err; } [/輸出流程] [多播路由處理] 目的地址是本地多播的處理函數 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev, int our) { unsigned hash; struct rtable *rth; __be32 spec_dst; struct in_device *in_dev = in_dev_get(dev); u32 itag = 0; /* Primary sanity checks. */ if (in_dev == NULL) return -EINVAL; //源是多播或粗物或迴環地址,協議不是ip都是出錯 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) || skb->protocol != htons(ETH_P_IP)) goto e_inval; if (ZERONET(saddr)) { if (!LOCAL_MCAST(daddr)) goto e_inval; spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); } else if (fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag) < 0) goto e_inval; rth = dst_alloc(&ipv4_dst_ops); if (!rth) goto e_nobufs; rth->u.dst.output= ip_rt_bug; atomic_set(&rth->u.dst.__refcnt, 1); rth->u.dst.flags= DST_HOST; if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) rth->u.dst.flags |= DST_NOPOLICY; //緩存中記錄內容 rth->fl.fl4_dst = daddr; rth->rt_dst = daddr; rth->fl.fl4_tos = tos; rth->fl.mark = skb->mark; rth->fl.fl4_src = saddr; rth->rt_src = saddr; #ifdef CONFIG_NET_CLS_ROUTE rth->u.dst.tclassid = itag; #endif rth->rt_iif = rth->fl.iif = dev->ifindex; rth->u.dst.dev = init_net.loopback_dev; dev_hold(rth->u.dst.dev); rth->idev = in_dev_get(rth->u.dst.dev); rth->fl.oif = 0; rth->rt_gateway = daddr; rth->rt_spec_dst= spec_dst; rth->rt_type = RTN_MULTICAST; rth->rt_flags = RTCF_MULTICAST; if (our) { //表示目的地址是本地配置的多播地址 rth->u.dst.input= ip_local_deliver; rth->rt_flags |= RTCF_LOCAL; } #ifdef CONFIG_IP_MROUTE if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev)) //須要轉發的多播包 rth->u.dst.input = ip_mr_input; #endif RT_CACHE_STAT_INC(in_slow_mc); in_dev_put(in_dev); hash = rt_hash(daddr, saddr, dev->ifindex); return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst); //添加到高速緩存表 e_nobufs: in_dev_put(in_dev); return -ENOBUFS; e_inval: 
in_dev_put(in_dev); return -EINVAL; } [/多播路由處理] [路由緩存操做實現] static struct dst_ops ipv4_dst_ops = { .family = AF_INET, .protocol = __constant_htons(ETH_P_IP), .gc = rt_garbage_collect, .check = ipv4_dst_check, .destroy = ipv4_dst_destroy, .ifdown = ipv4_dst_ifdown, .negative_advice = ipv4_negative_advice, .link_failure = ipv4_link_failure, .update_pmtu = ip_rt_update_pmtu, .entry_size = sizeof(struct rtable), }; [/路由緩存操做實現] [路由規則操做實現] static struct fib_rules_ops fib4_rules_ops = { .family = AF_INET, .rule_size = sizeof(struct fib4_rule), //這結構中包含struct fib_rule結構 .addr_size = sizeof(u32), .action = fib4_rule_action, .match = fib4_rule_match, .configure = fib4_rule_configure, .compare = fib4_rule_compare, .fill = fib4_rule_fill, .default_pref = fib4_rule_default_pref, .nlmsg_payload = fib4_rule_nlmsg_payload, .flush_cache = fib4_rule_flush_cache, .nlgroup = RTNLGRP_IPV4_RULE, .policy = fib4_rule_policy, .rules_list = LIST_HEAD_INIT(fib4_rules_ops.rules_list), .owner = THIS_MODULE, }; 路由規則匹配函數 static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) { struct fib4_rule *r = (struct fib4_rule *) rule; __be32 daddr = fl->fl4_dst; __be32 saddr = fl->fl4_src; //源/目的地址匹配 if (((saddr ^ r->src) & r->srcmask) || ((daddr ^ r->dst) & r->dstmask)) return 0; //top匹配 if (r->tos && (r->tos != fl->fl4_tos)) return 0; return 1; //匹配 } 規則匹配後,調用這個動做函數 static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg) { int err = -EAGAIN; struct fib_table *tbl; //表示一張路由表。不要將它與路由表緩存混淆。 switch (rule->action) { //規則動做類型 case FR_ACT_TO_TBL: //正常 break; case FR_ACT_UNREACHABLE: //丟棄發送不可達信息 err = -ENETUNREACH; goto errout; case FR_ACT_PROHIBIT: //丟棄給出EACCES錯誤 err = -EACCES; goto errout; case FR_ACT_BLACKHOLE: //丟棄包,沒有通知 default: err = -EINVAL; goto errout; } //根據路由規則中的標識查找,路由表 if ((tbl = fib_get_table(rule->table)) == NULL) goto errout; //路由表的查詢函數,參考下面路由表函數實現 err = tbl->tb_lookup(tbl, flp, (struct fib_result *) arg->result); if 
(err > 0) err = -EAGAIN; errout: return err; } struct fib_table *fib_get_table(u32 id) { struct fib_table *tb; struct hlist_node *node; unsigned int h; if (id == 0) id = RT_TABLE_MAIN; h = id & (FIB_TABLE_HASHSZ - 1); //hash值根據id rcu_read_lock(); hlist_for_each_entry_rcu(tb, node, &fib_table_hash[h], tb_hlist) { if (tb->tb_id == id) { //標識相同 rcu_read_unlock(); return tb; } } rcu_read_unlock(); return NULL; } [/路由規則操做實現] [路由表函數實現] fib_new_table->fib_hash_init初始化一個路由表,置於誰添加的和怎樣添加的參考用戶命令配置實現 struct fib_table * fib_hash_init(u32 id) { struct fib_table *tb; if (fn_hash_kmem == NULL) fn_hash_kmem = kmem_cache_create("ip_fib_hash", sizeof(struct fib_node), 0, SLAB_HWCACHE_ALIGN, NULL); if (fn_alias_kmem == NULL) fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias), 0, SLAB_HWCACHE_ALIGN, NULL); tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash), GFP_KERNEL); if (tb == NULL) return NULL; tb->tb_id = id; tb->tb_lookup = fn_hash_lookup; //查詢是否匹配 tb->tb_insert = fn_hash_insert; //向路由表添加一條新路由表項 tb->tb_delete = fn_hash_delete; //從路由表刪除一條路由表項 tb->tb_flush = fn_hash_flush; tb->tb_select_default = fn_hash_select_default; //選擇默認路由 tb->tb_dump = fn_hash_dump; memset(tb->tb_data, 0, sizeof(struct fn_hash)); return tb; } static int fn_hash_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) { int err; struct fn_zone *fz; struct fn_hash *t = (struct fn_hash*)tb->tb_data; //路由表後面的hash結構 read_lock(&fib_hash_lock); for (fz = t->fn_zone_list; fz; fz = fz->fz_next) { //循環全部的路由域 struct hlist_head *head; struct hlist_node *node; struct fib_node *f; __be32 k = fz_key(flp->fl4_dst, fz); //fh_hash函數把這留下的子網部分地址換算成hash項 head = &fz->fz_hash[fn_hash(k, fz)]; //路由項頭指針 hlist_for_each_entry(f, node, head, fn_hash) { if (f->fn_key != k) //目的要匹配 continue; //詳細檢察匹配,返回0匹配成功 err = fib_semantic_match(&f->fn_alias, flp, res, f->fn_key, fz->fz_mask, fz->fz_order); if (err <= 0) goto out; } } err = 1; out: read_unlock(&fib_hash_lock); return err; } 
用目的地址和這域中的子網掩碼相與,留下掩碼部分做爲key 例如,若是正在檢查/24 zone,目的地址flp->fl4_dst爲10.0.1.2,則搜索key k爲10.0.1.2 & 255.255.255.0,結果爲10.0.1.0。 這意味着接下來的代碼要搜索到子網10.0.1.0/24的路由: static inline __be32 fz_key(__be32 dst, struct fn_zone *fz) { return dst & FZ_MASK(fz); //((fz)->fz_mask) } 根據這個key計算出hash值 static inline u32 fn_hash(__be32 key, struct fn_zone *fz) { u32 h = ntohl(key)>>(32 - fz->fz_order); h ^= (h>>20); h ^= (h>>10); h ^= (h>>5); h &= FZ_HASHMASK(fz); //((fz)->fz_hashmask) return h; } 一個fib_node覆蓋了同一子網內的全部路由項,但這些路由項在諸如TOS等其餘字段上可能不一樣. 查找與匹配的fib_node相關聯的fib_alias實例。若是找到相應的fib_alias實例,在配置多路徑狀況下fib_semantic_match還須要選擇出正確的下一跳. 詳細檢察由這函數完成,用查找結果來初始化輸入參數res. int fib_semantic_match(struct list_head *head, const struct flowi *flp, struct fib_result *res, __be32 zone, __be32 mask, int prefixlen) { struct fib_alias *fa; int nh_sel = 0; //輪詢全部的別名 list_for_each_entry_rcu(fa, head, fa_list) { int err; if (fa->fa_tos && fa->fa_tos != flp->fl4_tos) //tox若是有,必需匹配 continue; //scope比搜索key更窄的路由項是不能夠的。 //例如,若是路由子系統查找scope爲RT_SCOPE_UNIVERSE的路由,則不能使用scope爲RT_SCOPE_LINK的路由項。 if (fa->fa_scope < flp->fl4_scope) continue; //該標誌的設置與fib_alias是否被選中無關。當fib_alias實例被刪除時,根據該標誌來決定是否應當flush緩存。 fa->fa_state |= FA_S_ACCESSED; //該數組的每個元素針對一種路由類型,每一個元素包含一個相關的錯誤碼和一個路由做用範圍RT_SCOPE_XXX。 //以fa->fa_type爲索引,就能夠從數組fib_props得出該路由類型對應的錯誤碼和路由做用範圍(scope)。 err = fib_props[fa->fa_type].error; //路由類型是否正確 if (err == 0) { struct fib_info *fi = fa->fa_info; if (fi->fib_flags & RTNH_F_DEAD) //路由子系統已經設置RTNH_F_DEAD標誌來標記該路由項應當被刪除 continue; switch (fa->fa_type) { case RTN_UNICAST: case RTN_LOCAL: case RTN_BROADCAST: case RTN_ANYCAST: case RTN_MULTICAST: for_nexthops(fi) { if (nh->nh_flags & RTNH_F_DEAD) //下一跳不可用 continue; if (!flp->oif || flp->oif == nh->nh_oif) //搜索key指定的egress設備與下一跳配置的不匹配 break; } #ifdef CONFIG_IP_ROUTE_MULTIPATH if (nhsel < fi->fib_nhs) { //找到匹配的下一跳 nh_sel = nhsel; goto out_fill_res; } #else if (nhsel < 1) { //不支持多路徑,則只能有一個下一跳 goto out_fill_res; } #endif endfor_nexthops(fi); continue; default: printk(KERN_DEBUG "impossible 
102\n"); return -EINVAL; } } return err; } return 1; //沒有匹配的路由 out_fill_res: //初始化結果 res->prefixlen = prefixlen; //掩碼長度 res->nh_sel = nh_sel; res->type = fa->fa_type; res->scope = fa->fa_scope; res->fi = fa->fa_info; atomic_inc(&res->fi->fib_clntref); return 0; //成功返回 } fn_hash_select_default接收一個結構爲fib_result的res做爲輸入參數,該參數是前面調用fib_lookup而返回的路由查找結果。 fn_hash_select_default使用該結構做爲缺省路由搜索的起點。 static void fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) { int order, last_idx; struct hlist_node *node; struct fib_node *f; struct fib_info *fi = NULL; struct fib_info *last_resort; struct fn_hash *t = (struct fn_hash*)tb->tb_data; struct fn_zone *fz = t->fn_zones[0]; //缺省路由域 if (fz == NULL) return; last_idx = -1; last_resort = NULL; order = -1; read_lock(&fib_hash_lock); hlist_for_each_entry(f, node, &fz->fz_hash[0], fn_hash) { //遍歷全部路由項 struct fib_alias *fa; list_for_each_entry(fa, &f->fn_alias, fa_list) { //該項的全部別名 struct fib_info *next_fi = fa->fa_info; //scope必須相同,類型必須爲單播 if (fa->fa_scope != res->scope || fa->fa_type != RTN_UNICAST) continue; //權限要小於等於res指定的權限 if (next_fi->fib_priority > res->fi->fib_priority) break; //下一跳的scope爲RT_SCOPE_LINK(即必須直連) if (!next_fi->fib_nh[0].nh_gw || next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) continue; fa->fa_state |= FA_S_ACCESSED; if (fi == NULL) { if (next_fi != res->fi) break; } else if (!fib_detect_death(fi, order, &last_resort, &last_idx, &fn_hash_last_dflt)) { if (res->fi) fib_info_put(res->fi); res->fi = fi; atomic_inc(&fi->fib_clntref); fn_hash_last_dflt = order; goto out; } fi = next_fi; order++; } } if (order <= 0 || fi == NULL) { fn_hash_last_dflt = -1; goto out; } //選擇路由項也要考慮下一跳的狀態是否可達。fib_detect_death函數將路由項中L3地址已經被解析爲L2地址的 //下一跳(即狀態爲NUD_REACHABLE)給予更高的優先級。該檢查能夠確保若是當前所用的缺省路由的下一跳網關 //不可達而使該路由項不可用,則須要選擇新的路由項。 if (!fib_detect_death(fi, order, &last_resort, &last_idx, &fn_hash_last_dflt)) { if (res->fi) fib_info_put(res->fi); res->fi = fi; atomic_inc(&fi->fib_clntref); 
fn_hash_last_dflt = order; goto out; } if (last_idx >= 0) { if (res->fi) fib_info_put(res->fi); res->fi = last_resort; if (last_resort) atomic_inc(&last_resort->fib_clntref); } fn_hash_last_dflt = last_idx; out: read_unlock(&fib_hash_lock); } [/路由表函數實現]