生活随笔
收集整理的這篇文章主要介紹了
TCP/IP学习(30)——L2数据链路层的数据包处理详细流程
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
原文地址:TCP/IP學習(30)——L2數據鏈路層的數據包處理詳細流程 作者:GFree_Wind
本文的copyleft歸gfree.wind@gmail.com所有,使用GPL發布,可以自由拷貝,轉載。但轉載請保持文檔的完整性,注明原作者及原鏈接,嚴禁用于任何商業用途。
作者:gfree.wind@gmail.com
博客:linuxfocus.blog.chinaunix.net
在前面的博文中,我學習了數據包從L2到L5的流程,但是當時因為時間和水平的限制,整個兒流程并沒有涉及太多的細節。前兩天大致又過了這個流程,發現有不少細節還是需要注意的。所以決定,將之前略過的一些內容,詳細的學習一遍。
今天主要是學習L2數據鏈路層的數據包的處理機制。在Linux kernel中,由網卡驅動完成L1物理層和L2數據鏈路層的工作。
首先看函數net_dev_init
/*
 * net_dev_init - one-time initialization of the net-device environment.
 * Called at boot, before any NIC is registered.
 */
static int __init net_dev_init(void)
{
    int i, rc = -ENOMEM;

    BUG_ON(!dev_boot_phase);

    /* Create the corresponding /proc files, e.g. /proc/net/dev, /proc/net/softnet_stat */
    if (dev_proc_init())
        goto out;

    /* Initialize the kobject support for netdev */
    if (netdev_kobject_init())
        goto out;

    /*
     * Initialize the L2 tables of upper-layer protocol handlers.
     * Recall from the earlier post "TCP/IP study (28) - the complete packet
     * receive flow" that inet_init registers the IP packet type in this table.
     */
    INIT_LIST_HEAD(&ptype_all);
    for (i = 0; i < PTYPE_HASH_SIZE; i++)
        INIT_LIST_HEAD(&ptype_base[i]);

    /* Register the netdev_net_ops subsystem */
    if (register_pernet_subsys(&netdev_net_ops))
        goto out;

    /*
     *    Initialise the packet receive queues.
     */
    /* For each CPU, initialize the per-CPU variable softnet_data, which serves as that CPU's receive cache */
    for_each_possible_cpu(i) {
        struct softnet_data *sd = &per_cpu(softnet_data, i);
        ...... ......
    }

    dev_boot_phase = 0;

    /* The loopback device is special if any other network devices
     * is present in a network namespace the loopback device must
     * be present. Since we now dynamically allocate and free the
     * loopback device ensure this invariant is maintained by
     * keeping the loopback device as the first device on the
     * list of network devices. Ensuring the loopback devices
     * is the first device that appears and the last network device
     * that disappears.
     */
    if (register_pernet_device(&loopback_net_ops))
        goto out;

    if (register_pernet_device(&default_device_ops))
        goto out;

    /* Enable the TX/RX softirqs */
    open_softirq(NET_TX_SOFTIRQ, net_tx_action);
    open_softirq(NET_RX_SOFTIRQ, net_rx_action);

    hotcpu_notifier(dev_cpu_callback, 0);
    dst_init();
    dev_mcast_init();
    rc = 0;
out:
    return rc;
}
net_dev_init在系統啟動時,在注冊網卡之前調用,主要就是初始化net device所需要的一些環境。
下面仍然以Intel PRO/1000的網卡驅動為例,e1000_init_module為該驅動的入口。通過e1000_init_module->pci_register_driver->e1000_probe進入初始化函數。 在e1000_probe中,通過下面這條語句綁定了操作函數。
netdev->netdev_ops = &e1000_netdev_ops;

static const struct net_device_ops e1000_netdev_ops = {
    .ndo_open        = e1000_open,
    ...... ......
};
對于今天的主題來說,只需關心e1000_open即可。因為該函數是在激活該網卡時被調用,完成資源的申請,中斷的注冊,即e1000_intr。
/*
 * e1000_intr - the NIC's hardware interrupt handler (registered by e1000_open).
 * Its only real job is to request a NAPI poll; the packet work happens later
 * in softirq context.
 */
static irqreturn_t e1000_intr(int irq, void *data)
{
    ...... ......
    /*
     * Check whether NAPI can be scheduled:
     * when NAPI is not disabled and no NAPI instance for this NIC is already
     * running (guaranteeing at most one running NAPI instance per NIC), a new
     * NAPI run may be scheduled.
     * NAPI is a newer NIC packet-processing scheme, essentially
     * interrupt + poll. See the literature/google for details.
     */
    if (likely(napi_schedule_prep(&adapter->napi))) {
        /*
         * Clear the per-run statistics.
         * At first glance it seems odd that "total" counters are zeroed here.
         * In fact these counters only cover one NAPI run, not the NIC's
         * overall statistics. The NIC totals live in netdev->stats; when the
         * NAPI run finishes, the values below are added onto them.
         */
        adapter->total_tx_bytes = 0;
        adapter->total_tx_packets = 0;
        adapter->total_rx_bytes = 0;
        adapter->total_rx_packets = 0;

        /* Request that the corresponding NAPI instance be scheduled */
        __napi_schedule(&adapter->napi);
    } else {
        /* this really should not if it does it is basically a
         * bug, but not a hard error, so enable ints and continue */
        if (!test_bit(__E1000_DOWN, &adapter->flags))
            e1000_irq_enable(adapter);
    }

    return IRQ_HANDLED;
}
上面為中斷的關鍵流程,其中要求調度對應的NAPI實例時,實際上是引發一個軟中斷。 __raise_softirq_irqoff(NET_RX_SOFTIRQ)。這個中斷函數的主要功能就是要求調度一個NAPI——這里跟以前理解的中斷函數不太一樣。按照教科書式的概念,網卡的中斷函數,應該將數據包從網卡的緩沖中取出放到一個系統緩沖中,然后在引發軟中斷去做剩下的工作。
下面看NET_RX_SOFTIRQ軟中斷對應的處理函數net_rx_action。
/*
 * net_rx_action - NET_RX_SOFTIRQ handler; drains this CPU's list of NAPI
 * instances, calling each driver's ->poll() under a global budget and a
 * 2-jiffy time limit.
 */
static void net_rx_action(struct softirq_action *h)
{
    struct softnet_data *sd = &__get_cpu_var(softnet_data);
    unsigned long time_limit = jiffies + 2;
    int budget = netdev_budget;
    void *have;

    local_irq_disable();

    /* Poll, in order, every NIC that needs polling */
    while (!list_empty(&sd->poll_list)) {
        struct napi_struct *n;
        int work, weight;

        /* If softirq window is exhuasted then punt.
         * Allow this to run for 2 jiffies since which will allow
         * an average latency of 1.5/HZ.
         */
        if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
            goto softnet_break;

        local_irq_enable();

        /* Even though interrupts have been re-enabled, this
         * access is safe because interrupts can only add new
         * entries to the tail of this list, and only ->poll()
         * calls can remove this head entry from the list.
         */
        /* Take one NIC's NAPI instance from the head of the list */
        n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);

        /* Lock this NAPI instance */
        have = netpoll_poll_lock(n);

        weight = n->weight;

        /* This NAPI_STATE_SCHED test is for avoiding a race
         * with netpoll's poll_napi(). Only the entity which
         * obtains the lock and sees NAPI_STATE_SCHED set will
         * actually make the ->poll() call. Therefore we avoid
         * accidently calling ->poll() when NAPI is not scheduled.
         */
        work = 0;
        if (test_bit(NAPI_STATE_SCHED, &n->state)) {
            /* Poll this NIC */
            work = n->poll(n, weight);
            trace_napi_poll(n);
        }

        WARN_ON_ONCE(work > weight);

        budget -= work;

        local_irq_disable();

        /* Drivers must not modify the NAPI state if they
         * consume the entire weight. In such cases this code
         * still "owns" the NAPI instance and therefore can
         * move the instance around on the list at-will.
         */
        if (unlikely(work == weight)) {
            /* This NAPI's weight is used up; move on to the next one */
            if (unlikely(napi_disable_pending(n))) {
                local_irq_enable();
                napi_complete(n);
                local_irq_disable();
            } else
                list_move_tail(&n->poll_list, &sd->poll_list);
        }

        netpoll_poll_unlock(have);
    }
out:
    net_rps_action_and_irq_enable(sd);

#ifdef CONFIG_NET_DMA
    /*
     * There may not be any more sk_buffs coming right now, so push
     * any pending DMA copies to hardware
     */
    dma_issue_pending_all();
#endif

    return;

softnet_break:
    sd->time_squeeze++;
    __raise_softirq_irqoff(NET_RX_SOFTIRQ);
    goto out;
}
通過上面這個軟中斷處理函數,對應每個網卡來說,又需要跳回驅動,去學習對應的poll函數。對于本文的這個驅動來說,poll函數就是e1000_clean->e1000_clean_rx_irq。這個函數是真正用于處理網卡接收數據包的工作。
static bool e1000_clean_rx_irq(struct e1000_adapter *adapter,
               struct e1000_rx_ring *rx_ring,
               int *work_done, int work_to_do)
{
    ...... ......
    /* Locate the buffer that needs to be processed next */
    i = rx_ring->next_to_clean;
    rx_desc = E1000_RX_DESC(*rx_ring, i);
    buffer_info = &rx_ring->buffer_info[i];

    while (rx_desc->status & E1000_RXD_STAT_DD) {
        struct sk_buff *skb;
        u8 status;

        if (*work_done >= work_to_do) // enough packets have been polled; break out and return
            break;
        (*work_done)++;

        rmb(); /* read descriptor and rx_buffer_info after status DD */

        /* Get the address of the skb structure backing this packet buffer */
        status = rx_desc->status;
        skb = buffer_info->skb;
        buffer_info->skb = NULL;

        /*
         * Then some NIC-hardware-specific work and a few sanity checks
         */
        ...... ......

        /*
         * Set skb->pkt_type (PACKET_BROADCAST etc.),
         * i.e. the data-link-layer protocol type
         */
        skb->protocol = eth_type_trans(skb, netdev);

        /* Hand the packet up to the upper layer, with some generic link-layer processing */
        e1000_receive_skb(adapter, status, rx_desc->special, skb);

next_desc:
        /* Process the next packet */
        ...... ......
    }

    /* Update statistics, etc. */
    ...... ......

    return cleaned;
}
在這個函數中,真正的從網卡buffer中取出數據包,然后根據硬件的特性做一些特定處理,并簡單的設置了數據包的一些field,完成L1的操作,設置好L2的報頭。這時,數據包已經為TCP/IP協議棧所需要的skb_buff結構。 然后調用e1000_receive_skb->netif_receive_skb->__netif_receive_skb
static int __netif_receive_skb(struct sk_buff *skb)
{
    struct packet_type *ptype, *pt_prev;
    rx_handler_func_t *rx_handler;
    struct net_device *orig_dev;
    struct net_device *master;
    struct net_device *null_or_orig;
    struct net_device *orig_or_bond;
    int ret = NET_RX_DROP;
    __be16 type;

    /* Timestamp the skb */
    if (!netdev_tstamp_prequeue)
        net_timestamp_check(skb);

    /* Hardware-accelerated VLAN handling */
    if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
        return NET_RX_SUCCESS;

    /* if we've gotten here through NAPI, check netpoll */
    if (netpoll_receive_skb(skb))
        return NET_RX_DROP;

    /* Set skb->skb_iif to the interface index of the receiving device */
    if (!skb->skb_iif)
        skb->skb_iif = skb->dev->ifindex;

    /*
     * bonding note: skbs received on inactive slaves should only
     * be delivered to pkt handlers that are exact matches. Also
     * the deliver_no_wcard flag will be set. If packet handlers
     * are sensitive to duplicate packets these skbs will need to
     * be dropped at the handler. The vlan accel path may have
     * already set the deliver_no_wcard flag.
     */
    /* NIC-bonding handling; skipped here (the author only knows this feature in passing) */
    null_or_orig = NULL;
    orig_dev = skb->dev;
    master = ACCESS_ONCE(orig_dev->master);
    if (skb->deliver_no_wcard)
        null_or_orig = orig_dev;
    else if (master) {
        if (skb_bond_should_drop(skb, master)) {
            skb->deliver_no_wcard = 1;
            null_or_orig = orig_dev; /* deliver only exact match */
        } else
            skb->dev = master;
    }

    __this_cpu_inc(softnet_data.processed);

    /* Initialize the L3 header and L4 header pointers */
    skb_reset_network_header(skb);
    skb_reset_transport_header(skb);

    /* Compute the MAC address length — more precisely, the length of the L2 header */
    skb->mac_len = skb->network_header - skb->mac_header;

    pt_prev = NULL;

    rcu_read_lock();

    /*
     * Some less relevant code omitted here
     */
    ...... ......

    /*
     * Use the L2 protocol type as the key to find the matching handler list.
     */
    type = skb->protocol;
    list_for_each_entry_rcu(ptype,
            &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
        if (ptype->type == type && (ptype->dev == null_or_orig ||
         ptype->dev == skb->dev || ptype->dev == orig_dev ||
         ptype->dev == orig_or_bond)) {
            if (pt_prev) // a matching protocol type was found; pass the skb up to L3
                ret = deliver_skb(skb, pt_prev, orig_dev);
            pt_prev = ptype;
        }
    }

    if (pt_prev) {
        ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
    } else {
        kfree_skb(skb);
        /* Jamal, now you will not able to escape explaining
         * me how you were going to use this. :-)
         */
        ret = NET_RX_DROP;
    }

out:
    rcu_read_unlock();
    return ret;
}
現在基本上已經比較詳細的學習了L2層的數據包處理流程。當然,還有很多很多的細節沒有涉及,道路還很漫長啊。
總結
以上是生活随笔為你收集整理的TCP/IP学习(30)——L2数据链路层的数据包处理详细流程的全部內容,希望文章能夠幫你解決所遇到的問題。
如果覺得生活随笔網站內容還不錯,歡迎將生活随笔推薦給好友。