生活随笔
收集整理的這篇文章主要介紹了
TCP/IP学习(30)——L2数据链路层的数据包处理详细流程
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
原文地址:TCP/IP學習(30)——L2數據鏈路層的數據包處理詳細流程 作者:GFree_Wind
本文的copyleft歸gfree.wind@gmail.com所有,使用GPL發布,可以自由拷貝,轉載。但轉載請保持文檔的完整性,注明原作者及原鏈接,嚴禁用于任何商業用途。
作者:gfree.wind@gmail.com
博客:linuxfocus.blog.chinaunix.net
在前面的博文中,我學習了數據包從L2到L5的流程,但是當時因為時間和水平的限制,整個兒流程并沒有涉及太多的細節。前兩天大致又過了這個流程,發現有不少細節還是需要注意的。所以決定,將之前略過的一些內容,詳細的學習一遍。
今天主要是學習L2數據鏈路層的數據包的處理機制。在Linux kernel中,由網卡驅動完成L1物理層和L2數據鏈路層的工作。
首先看函數net_dev_init
/*
 * net_dev_init - one-time initialization of the net-device environment.
 * Called at boot, before any NIC is registered.
 */
static int __init net_dev_init(void)
{
    int i, rc = -ENOMEM;

    BUG_ON(!dev_boot_phase);

    /* Create the corresponding /proc files, e.g. /proc/net/dev, /proc/net/softnet_stat */
    if (dev_proc_init())
        goto out;

    /* Initialize the kobject support for netdev */
    if (netdev_kobject_init())
        goto out;

    /*
     * Initialize the L2 tables of upper-layer protocol handlers.
     * Recall from the earlier post "TCP/IP study (28) - the complete packet
     * receive flow" that inet_init registers the IP packet type in this table.
     */
    INIT_LIST_HEAD(&ptype_all);
    for (i = 0; i < PTYPE_HASH_SIZE; i++)
        INIT_LIST_HEAD(&ptype_base[i]);

    /* Register the netdev_net_ops subsystem */
    if (register_pernet_subsys(&netdev_net_ops))
        goto out;

    /*
     *    Initialise the packet receive queues.
     */
    /* For each CPU, initialize the per-CPU variable softnet_data, which serves as that CPU's receive cache */
    for_each_possible_cpu(i) {
        struct softnet_data *sd = &per_cpu(softnet_data, i);
        ...... ......
    }

    dev_boot_phase = 0;

    /* The loopback device is special if any other network devices
     * is present in a network namespace the loopback device must
     * be present. Since we now dynamically allocate and free the
     * loopback device ensure this invariant is maintained by
     * keeping the loopback device as the first device on the
     * list of network devices. Ensuring the loopback devices
     * is the first device that appears and the last network device
     * that disappears.
     */
    if (register_pernet_device(&loopback_net_ops))
        goto out;

    if (register_pernet_device(&default_device_ops))
        goto out;

    /* Enable the TX/RX softirqs */
    open_softirq(NET_TX_SOFTIRQ, net_tx_action);
    open_softirq(NET_RX_SOFTIRQ, net_rx_action);

    hotcpu_notifier(dev_cpu_callback, 0);
    dst_init();
    dev_mcast_init();
    rc = 0;
out:
    return rc;
}
net_dev_init在系統啟動時,在注冊網卡之前調用,主要就是初始化net device所需要的一些環境。
下面仍然以Intel PRO/1000的網卡驅動為例,e1000_init_module為該驅動的入口。通過e1000_init_module->pci_register_driver->e1000_probe進入初始化函數。 在e1000_probe中,通過下面這條語句綁定了操作函數。
netdev->netdev_ops = &e1000_netdev_ops;

static const struct net_device_ops e1000_netdev_ops = {
    .ndo_open        = e1000_open,
    ...... ......
};
對于今天的主題來說,只需關心e1000_open即可。因為該函數是在激活該網卡時被調用,完成資源的申請,中斷的注冊,即e1000_intr。
/*
 * e1000_intr - the NIC's hardware interrupt handler (registered by e1000_open).
 * Its only real job is to request a NAPI poll; the packet work happens later
 * in softirq context.
 */
static irqreturn_t e1000_intr(int irq, void *data)
{
    ...... ......
    /*
     * Check whether NAPI can be scheduled:
     * when NAPI is not disabled and no NAPI instance for this NIC is already
     * running (guaranteeing at most one running NAPI instance per NIC), a new
     * NAPI run may be scheduled.
     * NAPI is a newer NIC packet-processing scheme, essentially
     * interrupt + poll. See the literature/google for details.
     */
    if (likely(napi_schedule_prep(&adapter->napi))) {
        /*
         * Clear the per-run statistics.
         * At first glance it seems odd that "total" counters are zeroed here.
         * In fact these counters only cover one NAPI run, not the NIC's
         * overall statistics. The NIC totals live in netdev->stats; when the
         * NAPI run finishes, the values below are added onto them.
         */
        adapter->total_tx_bytes = 0;
        adapter->total_tx_packets = 0;
        adapter->total_rx_bytes = 0;
        adapter->total_rx_packets = 0;

        /* Request that the corresponding NAPI instance be scheduled */
        __napi_schedule(&adapter->napi);
    } else {
        /* this really should not if it does it is basically a
         * bug, but not a hard error, so enable ints and continue */
        if (!test_bit(__E1000_DOWN, &adapter->flags))
            e1000_irq_enable(adapter);
    }

    return IRQ_HANDLED;
}
上面為中斷的關鍵流程,其中要求調度對應的NAPI實例時,實際上是引發一個軟中斷。 __raise_softirq_irqoff(NET_RX_SOFTIRQ)。這個中斷函數的主要功能就是要求調度一個NAPI——這里跟以前理解的中斷函數不太一樣。按照教科書式的概念,網卡的中斷函數,應該將數據包從網卡的緩沖中取出放到一個系統緩沖中,然后在引發軟中斷去做剩下的工作。
下面看NET_RX_SOFTIRQ軟中斷對應的處理函數net_rx_action。
/*
 * net_rx_action - NET_RX_SOFTIRQ handler; drains this CPU's list of NAPI
 * instances, calling each driver's ->poll() under a global budget and a
 * 2-jiffy time limit.
 */
static void net_rx_action(struct softirq_action *h)
{
    struct softnet_data *sd = &__get_cpu_var(softnet_data);
    unsigned long time_limit = jiffies + 2;
    int budget = netdev_budget;
    void *have;

    local_irq_disable();

    /* Poll, in order, every NIC that needs polling */
    while (!list_empty(&sd->poll_list)) {
        struct napi_struct *n;
        int work, weight;

        /* If softirq window is exhuasted then punt.
         * Allow this to run for 2 jiffies since which will allow
         * an average latency of 1.5/HZ.
         */
        if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
            goto softnet_break;

        local_irq_enable();

        /* Even though interrupts have been re-enabled, this
         * access is safe because interrupts can only add new
         * entries to the tail of this list, and only ->poll()
         * calls can remove this head entry from the list.
         */
        /* Take one NIC's NAPI instance from the head of the list */
        n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);

        /* Lock this NAPI instance */
        have = netpoll_poll_lock(n);

        weight = n->weight;

        /* This NAPI_STATE_SCHED test is for avoiding a race
         * with netpoll's poll_napi(). Only the entity which
         * obtains the lock and sees NAPI_STATE_SCHED set will
         * actually make the ->poll() call. Therefore we avoid
         * accidently calling ->poll() when NAPI is not scheduled.
         */
        work = 0;
        if (test_bit(NAPI_STATE_SCHED, &n->state)) {
            /* Poll this NIC */
            work = n->poll(n, weight);
            trace_napi_poll(n);
        }

        WARN_ON_ONCE(work > weight);

        budget -= work;

        local_irq_disable();

        /* Drivers must not modify the NAPI state if they
         * consume the entire weight. In such cases this code
         * still "owns" the NAPI instance and therefore can
         * move the instance around on the list at-will.
         */
        if (unlikely(work == weight)) {
            /* This NAPI's weight is used up; move on to the next one */
            if (unlikely(napi_disable_pending(n))) {
                local_irq_enable();
                napi_complete(n);
                local_irq_disable();
            } else
                list_move_tail(&n->poll_list, &sd->poll_list);
        }

        netpoll_poll_unlock(have);
    }
out:
    net_rps_action_and_irq_enable(sd);

#ifdef CONFIG_NET_DMA
    /*
     * There may not be any more sk_buffs coming right now, so push
     * any pending DMA copies to hardware
     */
    dma_issue_pending_all();
#endif

    return;

softnet_break:
    sd->time_squeeze++;
    __raise_softirq_irqoff(NET_RX_SOFTIRQ);
    goto out;
}
通過上面這個軟中斷處理函數,對應每個網卡來說,又需要跳回驅動,去學習對應的poll函數。對于本文的這個驅動來說,poll函數就是e1000_clean->e1000_clean_rx_irq。這個函數是真正用于處理網卡接收數據包的工作。
static bool e1000_clean_rx_irq(struct e1000_adapter *adapter,
               struct e1000_rx_ring *rx_ring,
               int *work_done, int work_to_do)
{
    ...... ......
    /* Locate the buffer that needs to be processed next */
    i = rx_ring->next_to_clean;
    rx_desc = E1000_RX_DESC(*rx_ring, i);
    buffer_info = &rx_ring->buffer_info[i];

    while (rx_desc->status & E1000_RXD_STAT_DD) {
        struct sk_buff *skb;
        u8 status;

        if (*work_done >= work_to_do) // enough packets have been polled; break out and return
            break;
        (*work_done)++;

        rmb(); /* read descriptor and rx_buffer_info after status DD */

        /* Get the address of the skb structure backing this packet buffer */
        status = rx_desc->status;
        skb = buffer_info->skb;
        buffer_info->skb = NULL;

        /*
         * Then some NIC-hardware-specific work and a few sanity checks
         */
        ...... ......

        /*
         * Set skb->pkt_type (PACKET_BROADCAST etc.),
         * i.e. the data-link-layer protocol type
         */
        skb->protocol = eth_type_trans(skb, netdev);

        /* Hand the packet up to the upper layer, with some generic link-layer processing */
        e1000_receive_skb(adapter, status, rx_desc->special, skb);

next_desc:
        /* Process the next packet */
        ...... ......
    }

    /* Update statistics, etc. */
    ...... ......

    return cleaned;
}
在這個函數中,真正的從網卡buffer中取出數據包,然后根據硬件的特性做一些特定處理,并簡單的設置了數據包的一些field,完成L1的操作,設置好L2的報頭。這時,數據包已經為TCP/IP協議棧所需要的skb_buff結構。 然后調用e1000_receive_skb->netif_receive_skb->__netif_receive_skb
static int __netif_receive_skb(struct sk_buff *skb)
{
    struct packet_type *ptype, *pt_prev;
    rx_handler_func_t *rx_handler;
    struct net_device *orig_dev;
    struct net_device *master;
    struct net_device *null_or_orig;
    struct net_device *orig_or_bond;
    int ret = NET_RX_DROP;
    __be16 type;

    /* Timestamp the skb */
    if (!netdev_tstamp_prequeue)
        net_timestamp_check(skb);

    /* Hardware-accelerated VLAN handling */
    if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
        return NET_RX_SUCCESS;

    /* if we've gotten here through NAPI, check netpoll */
    if (netpoll_receive_skb(skb))
        return NET_RX_DROP;

    /* Set skb->skb_iif to the interface index of the receiving device */
    if (!skb->skb_iif)
        skb->skb_iif = skb->dev->ifindex;

    /*
     * bonding note: skbs received on inactive slaves should only
     * be delivered to pkt handlers that are exact matches. Also
     * the deliver_no_wcard flag will be set. If packet handlers
     * are sensitive to duplicate packets these skbs will need to
     * be dropped at the handler. The vlan accel path may have
     * already set the deliver_no_wcard flag.
     */
    /* NIC-bonding handling; skipped here (the author only knows this feature in passing) */
    null_or_orig = NULL;
    orig_dev = skb->dev;
    master = ACCESS_ONCE(orig_dev->master);
    if (skb->deliver_no_wcard)
        null_or_orig = orig_dev;
    else if (master) {
        if (skb_bond_should_drop(skb, master)) {
            skb->deliver_no_wcard = 1;
            null_or_orig = orig_dev; /* deliver only exact match */
        } else
            skb->dev = master;
    }

    __this_cpu_inc(softnet_data.processed);

    /* Initialize the L3 header and L4 header pointers */
    skb_reset_network_header(skb);
    skb_reset_transport_header(skb);

    /* Compute the MAC address length — more precisely, the length of the L2 header */
    skb->mac_len = skb->network_header - skb->mac_header;

    pt_prev = NULL;

    rcu_read_lock();

    /*
     * Some less relevant code omitted here
     */
    ...... ......

    /*
     * Use the L2 protocol type as the key to find the matching handler list.
     */
    type = skb->protocol;
    list_for_each_entry_rcu(ptype,
            &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
        if (ptype->type == type && (ptype->dev == null_or_orig ||
         ptype->dev == skb->dev || ptype->dev == orig_dev ||
         ptype->dev == orig_or_bond)) {
            if (pt_prev) // a matching protocol type was found; pass the skb up to L3
                ret = deliver_skb(skb, pt_prev, orig_dev);
            pt_prev = ptype;
        }
    }

    if (pt_prev) {
        ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
    } else {
        kfree_skb(skb);
        /* Jamal, now you will not able to escape explaining
         * me how you were going to use this. :-)
         */
        ret = NET_RX_DROP;
    }

out:
    rcu_read_unlock();
    return ret;
}
現在基本上已經比較詳細的學習了L2層的數據包處理流程。當然,還有很多很多的細節沒有涉及,道路還很漫長啊。
總結
以上是生活随笔為你收集整理的TCP/IP学习(30)——L2数据链路层的数据包处理详细流程的全部內容,希望文章能夠幫你解決所遇到的問題。
如果覺得生活随笔網站內容還不錯,歡迎將生活随笔推薦給好友。