tcp/ip 协议栈Linux内核源码分析11 邻居子系统分析二 arp协议的实现处理
內核版本:3.4.39
內核鄰居子系統定義了一個基本的框架,使得不同的鄰居協議可以共用一套代碼。比起其它的內核模塊,鄰居子系統框架代碼還是比較簡單易懂的。鄰居子系統位于網絡層和流量控制子系統中間,它提供給L3向下發送的接口。看下網絡層發送函數的部分代碼:
static inline int ip_finish_output2(struct sk_buff *skb) {/** ... 省略部分代碼*/rcu_read_lock();neigh = dst_get_neighbour_noref(dst);//如果不存在鄰居表項的話,返回發送失敗if (neigh) {//調用鄰居層提供的發送接口int res = neigh_output(neigh, skb);rcu_read_unlock();return res;}rcu_read_unlock(); }可以看到網絡層是直接將報文傳遞給neigh_output接口,這個接口是一個包裹函數,它內部又調用了鄰居項的發送函數:
static inline int neigh_output(struct neighbour *n, struct sk_buff *skb) {struct hh_cache *hh = &n->hh;//如果存在L2幀頭緩存的話,直接填充MAC地址,然后調用dev_queue_xmit發送//否則就只能調用鄰居項的默認發送接口output,這是個函數指針,會隨著鄰居//項狀態的變化而變更if ((n->nud_state & NUD_CONNECTED) && hh->hh_len)return neigh_hh_output(hh, skb);elsereturn n->output(n, skb); }上述就是鄰居子系統提供給網絡層的發送接口。具體的實現和鄰居協議有關,接下來主要看下IPv4的鄰居協議處理,即ARP
如果理解了鄰居子系統的基礎框架,就不難猜測鄰居協議的實現內容了。ARP協議初始化工作包括向內核注冊arp報文接收處理函數,初始化鄰居表,建立proc、sys文件以及向內核注冊設備發生變化時的回調處理函數。
//arp模塊初始化 void __init arp_init(void) {int i;//注冊一個虛函數表和ARP協議使用的其他常用參數neigh_table_init(&arp_tbl);//注冊arp報文處理函數dev_add_pack(&arp_packet_type);//注冊proc文件arp_proc_init(); #ifdef CONFIG_SYSCTL//注冊sys文件neigh_sysctl_register(NULL, &arp_tbl.parms, "ipv4", NULL); #endif//向內核注冊一個回調函數,用于接受設備狀態和配置變化的通知register_netdevice_notifier(&arp_netdev_notifier); }dev_add_pack就是注冊arp協議報文 接收處理函數,注冊方式和IPv4、IPv6協議類似。
static struct packet_type arp_packet_type __read_mostly = {.type = cpu_to_be16(ETH_P_ARP),.func = arp_rcv, //ARP協議報文接受處理函數 };?鄰居表的初始化,主要工作時將arp_tbl插入到全局鄰居表結構體中neigh_tables,此外就是按照arp_tbl的配置初始化鄰居緩存、gc定時器等等。
//初始化鄰居表,IPv4時arp,IPv6是nd_tbl void neigh_table_init(struct neigh_table *tbl) {struct neigh_table *tmp;//緩存表的初始化,主要工作都在這里neigh_table_init_no_netlink(tbl);//添加到全局的鄰居表鏈表中,每個鄰居協議都要添加自己的鄰居表write_lock(&neigh_tbl_lock);for (tmp = neigh_tables; tmp; tmp = tmp->next) {if (tmp->family == tbl->family)break;}tbl->next = neigh_tables;neigh_tables = tbl;write_unlock(&neigh_tbl_lock);//同一個協議只能添加一次,重復添加這里會報錯if (unlikely(tmp)) {printk(KERN_ERR "NEIGH: Registering multiple tables for ""family %d\n", tbl->family);dump_stack();} } EXPORT_SYMBOL(neigh_table_init);arp_tbl的配置如下:
struct neigh_table arp_tbl = {.family = AF_INET,.key_len = 4,.hash = arp_hash, //計算hash值的一個函數.constructor = arp_constructor, //鄰居項初始化函數.proxy_redo = parp_redo, //處理arp代理的函數.id = "arp_cache", //鄰居項緩存池名.parms = {.tbl = &arp_tbl,.base_reachable_time = 30 * HZ, //只有在30秒內收到可到達性確認才承認reachable狀態.retrans_time = 1 * HZ, //solicit請求重傳時間.gc_staletime = 60 * HZ, //stale狀態的最長持續時間.reachable_time = 30 * HZ, //reachable狀態的最長時間.delay_probe_time = 5 * HZ, //delay狀態的最長時間.queue_len_bytes = 64*1024,.ucast_probes = 3, //單播地址探測次數.mcast_probes = 3, //多播地址探測次數.anycast_delay = 1 * HZ,.proxy_delay = (8 * HZ) / 10,.proxy_qlen = 64,.locktime = 1 * HZ,},.gc_interval = 30 * HZ, //垃圾回收定時器.gc_thresh1 = 128, //保留.gc_thresh2 = 512, //鄰居項閾值.gc_thresh3 = 1024, //鄰居項閾值 }; EXPORT_SYMBOL(arp_tbl);?主要的初始化函數:
void neigh_table_init_no_netlink(struct neigh_table *tbl) {unsigned long now = jiffies;unsigned long phsize;write_pnet(&tbl->parms.net, &init_net);atomic_set(&tbl->parms.refcnt, 1);tbl->parms.reachable_time =neigh_rand_reach_time(tbl->parms.base_reachable_time);//初始化一個統計結構體tbl->stats = alloc_percpu(struct neigh_statistics);if (!tbl->stats)panic("cannot create neighbour cache statistics");#ifdef CONFIG_PROC_FSif (!proc_create_data(tbl->id, 0, init_net.proc_net_stat,&neigh_stat_seq_fops, tbl))panic("cannot create neighbour proc dir entry"); #endif//初始化鄰居hash桶RCU_INIT_POINTER(tbl->nht, neigh_hash_alloc(3));//獲取arp代理表項大小phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *);//分配代理緩存tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL);if (!tbl->nht || !tbl->phash_buckets)panic("cannot allocate neighbour cache hashes");//初始化讀寫鎖rwlock_init(&tbl->lock);//添加一個定時任務,做些清理工作以及更新隨機定時器時間INIT_DELAYED_WORK_DEFERRABLE(&tbl->gc_work, neigh_periodic_work);schedule_delayed_work(&tbl->gc_work, tbl->parms.reachable_time);//起一個定時器處理arp代理功能setup_timer(&tbl->proxy_timer, neigh_proxy_process, (unsigned long)tbl);//初始化代理報文隊列skb_queue_head_init_class(&tbl->proxy_queue,&neigh_table_proxy_queue_class);tbl->last_flush = now;tbl->last_rand = now + tbl->parms.reachable_time * 20; } EXPORT_SYMBOL(neigh_table_init_no_netlink);?ARP協議的初始化大概就是上述的內容,其它要關注的點包括鄰居項的創建、更新、查找以及提供給L3發送接口的變化。
創建鄰居表項的原因大概有如下幾種:
1. L3層要發送報文。
2. 應用層使用ip neigh命令或者arp命令手動添加
3. 收到arp報文被動學習一個鄰居表項
針對第一種情況看下流程,當內核發送報文的時候首先需要查找路由,出口路由是綁定鄰居緩存的,如果沒有鄰居緩存會新建一個。
鄰居表項創建函數:neigh_create,這個函數比較長,我說下主要工作,不趕時間的同學可以慢慢看。
這個函數新建一個鄰居項緩存neighbour結構體,然后初始化變量,包括一個定時器處理函數,初始化的時候要結合鄰居協議提供的配置函數以及網絡設備的配置參數,最終是添加到鄰居表中。需要注意的是,它除了自身的初始化之外,還會調用鄰居協議提供的初始化函數,類似于c++里面的構造函數,arp協議初始化的時候提供了這個構造函數,放到了鄰居表里面,arp_constructor(),這個函數主要初始化鄰居表項的虛擬函數集,這些操作函數提供了L3傳輸接口,即剛開始說到的neigh->output函數。
//創建鄰居緩存 struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,struct net_device *dev) {u32 hash_val;int key_len = tbl->key_len;int error;struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev);struct neigh_hash_table *nht;//創建鄰居失敗了,:-(if (!n) {rc = ERR_PTR(-ENOBUFS);goto out;}//設置key的長度,IPv4是4字節,IPv6是16字節memcpy(n->primary_key, pkey, key_len);n->dev = dev;//引用計數增加dev_hold(dev);/* Protocol specific setup. *///協議自定義的初始化函數,arp的初始化函數是arp_constructorif (tbl->constructor && (error = tbl->constructor(n)) < 0) {rc = ERR_PTR(error);goto out_neigh_release;}//如果設備驅動提供了初始化函數的話,這里也要調用一遍if (dev->netdev_ops->ndo_neigh_construct) {error = dev->netdev_ops->ndo_neigh_construct(n);if (error < 0) {rc = ERR_PTR(error);goto out_neigh_release;}}/* Device specific setup. *///設備特殊的初始化函數,如果存在則調用if (n->parms->neigh_setup &&(error = n->parms->neigh_setup(n)) < 0) {rc = ERR_PTR(error);goto out_neigh_release;}//這個字段由可到達性證明來更新//從新建的角度來說,這里設置一個過期值是使得鄰居狀態能比平常和要求有//可到達性證據時,稍快點轉移到stale狀態n->confirmed = jiffies - (n->parms->base_reachable_time << 1);write_lock_bh(&tbl->lock);//獲取鄰居hash表nht = rcu_dereference_protected(tbl->nht,lockdep_is_held(&tbl->lock));//如果hash表不夠大則擴增if (atomic_read(&tbl->entries) > (1 << nht->hash_shift))nht = neigh_hash_grow(tbl, nht->hash_shift + 1);//hash值由目的地址,dev和一個隨機值取得hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift);//如果鄰居被廢棄了,則返回錯誤if (n->parms->dead) {rc = ERR_PTR(-EINVAL);goto out_tbl_unlock;}for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val],lockdep_is_held(&tbl->lock));n1 != NULL;n1 = rcu_dereference_protected(n1->next,lockdep_is_held(&tbl->lock))) {//遍歷hash桶,查找是否已經存在鄰居表項,如果已經存在的話//增加統計計數并釋放新建的if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) {neigh_hold(n1);rc = n1;goto out_tbl_unlock;}}//更新標志位,當dead為1時表示鄰居表被廢棄了,會被gc回收掉n->dead = 
0;//增加引用計數neigh_hold(n);//添加到hash表首部rcu_assign_pointer(n->next,rcu_dereference_protected(nht->hash_buckets[hash_val],lockdep_is_held(&tbl->lock)));rcu_assign_pointer(nht->hash_buckets[hash_val], n);write_unlock_bh(&tbl->lock);NEIGH_PRINTK2("neigh %p is created.\n", n);rc = n; out:return rc; out_tbl_unlock:write_unlock_bh(&tbl->lock); out_neigh_release:neigh_release(n);goto out; } EXPORT_SYMBOL(neigh_create);arp協議提供的構造函數,這里我們重點關注neigh->ops的設置,這是提供給L3的接口,內核定義了四個可選的操作集,選擇哪一個需要根據驅動的能力來。
//鄰居初始化函數 static int arp_constructor(struct neighbour *neigh) {__be32 addr = *(__be32 *)neigh->primary_key;struct net_device *dev = neigh->dev;struct in_device *in_dev;struct neigh_parms *parms;rcu_read_lock();//獲取該鄰居項使用的設備,失敗則返回in_dev = __in_dev_get_rcu(dev);if (in_dev == NULL) {rcu_read_unlock();return -EINVAL;}//獲取地址類型,比如單播、多播或者廣播neigh->type = inet_addr_type(dev_net(dev), addr);//將配置參數改成設備的配置參數//先釋放鄰居表默認的參數引用//然后增加dev的參數引用parms = in_dev->arp_parms;__neigh_parms_put(neigh->parms);neigh->parms = neigh_parms_clone(parms);rcu_read_unlock();//根據設備能力設置操作函數集if (!dev->header_ops) {//如果設備不需要ARP的話,走這里neigh->nud_state = NUD_NOARP;neigh->ops = &arp_direct_ops;neigh->output = neigh_direct_output;} else {/* Good devices (checked by reading texts, but only Ethernet istested)ARPHRD_ETHER: (ethernet, apfddi)ARPHRD_FDDI: (fddi)ARPHRD_IEEE802: (tr)ARPHRD_METRICOM: (strip)ARPHRD_ARCNET:etc. etc. etc.ARPHRD_IPDDP will also work, if author repairs it.I did not it, because this driver does not work evenin old paradigm.*/#if 1/* So... 
these "amateur" devices are hopeless.The only thing, that I can say now:It is very sad that we need to keep ugly obsoletecode to make them happy.They should be moved to more reasonable state, nowthey use rebuild_header INSTEAD OF hard_start_xmit!!!Besides that, they are sort of out of date(a lot of redundant clones/copies, useless in 2.1),I wonder why people believe that they work.*/switch (dev->type) {default:break;case ARPHRD_ROSE: #if IS_ENABLED(CONFIG_AX25)case ARPHRD_AX25: #if IS_ENABLED(CONFIG_NETROM)case ARPHRD_NETROM: #endifneigh->ops = &arp_broken_ops;neigh->output = neigh->ops->output;return 0; #elsebreak; #endif} #endif//多播mac地址可以計算出來,不需要ARPif (neigh->type == RTN_MULTICAST) {neigh->nud_state = NUD_NOARP;arp_mc_map(addr, neigh->ha, dev, 1);} else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) {//環回接口也是不需要ARPneigh->nud_state = NUD_NOARP;memcpy(neigh->ha, dev->dev_addr, dev->addr_len);} else if (neigh->type == RTN_BROADCAST ||(dev->flags & IFF_POINTOPOINT)) {//點對點或者廣播的mac地址也是已知的 neigh->nud_state = NUD_NOARP;memcpy(neigh->ha, dev->broadcast, dev->addr_len);}//根據設備能力選擇函數集,這些函數包括鄰居項操作函數以及于L3層接口//如果設備提供L2幀頭緩存則選擇arp_hh_ops,否則選擇一個通用的arp_generic_opsif (dev->header_ops->cache)neigh->ops = &arp_hh_ops;elseneigh->ops = &arp_generic_ops;//根據鄰居狀態配置輸出接口if (neigh->nud_state & NUD_VALID)neigh->output = neigh->ops->connected_output;elseneigh->output = neigh->ops->output;}return 0; }?默認的操作函數集有如下四組:
static const struct neigh_ops arp_generic_ops = {.family = AF_INET,.solicit = arp_solicit,.error_report = arp_error_report,.output = neigh_resolve_output,.connected_output = neigh_connected_output, };static const struct neigh_ops arp_hh_ops = {.family = AF_INET,.solicit = arp_solicit,.error_report = arp_error_report,.output = neigh_resolve_output,.connected_output = neigh_resolve_output, };//設備不需要L2幀頭 static const struct neigh_ops arp_direct_ops = {.family = AF_INET,.output = neigh_direct_output,.connected_output = neigh_direct_output, };static const struct neigh_ops arp_broken_ops = {.family = AF_INET,.solicit = arp_solicit,.error_report = arp_error_report,.output = neigh_compat_output,.connected_output = neigh_compat_output, };?通常ethernet初始化使用的是通用的arp_generic_ops,結合文章最開始處講到的L3層的發送函數ip_finish_output2,它會根據鄰居的狀態來選擇合適的發送接口,初始化的鄰居項狀態是NONE,因此當調用neigh->output是,最終調用neigh_resolve_output這個接口,這個函數會先將報文放到鄰居項的緩存隊列里面,然后發送solicit探測報文,這樣整個發送流程就結束了。
當系統收到arp報文的時候調用arp_rcv,這個函數首先進行報文的合理性檢查,然后根據報文的內容查找鄰居表,假設這是一個arp響應報文,這時候需要更新鄰居表項狀態為可到達的(reachable)同時檢查緩存隊列,如果存在報文的話就及時發送出去。
arp協議大概的內容就是這樣,協議的初始化,鄰居表創建以及與L3之間發送接口的交互。具體細節部分代碼量還是蠻多的,但是只要掌握了基本內容,代碼不難理解。
參考目錄:
1. 《深入理解Linux網絡技術內幕》
總結
以上是生活随笔為你收集整理的tcp/ip 协议栈Linux内核源码分析11 邻居子系统分析二 arp协议的实现处理的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 坡向对李子种植的影响?
- 下一篇: linux 其他常用命令