tcp/ip 协议栈Linux内核源码分析13 udp套接字发送流程二
内核版本:3.4.39
继续UDP套接字发送,上一篇讲到了sock_sendmsg,这里继续,下面是sock_sendmsg的相关代码
int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) {/* kiocb為內(nèi)核通用的IO請求結(jié)構(gòu) */struct kiocb iocb;struct sock_iocb siocb;int ret;/* 初始化同步的內(nèi)核IO請求結(jié)構(gòu) */init_sync_kiocb(&iocb, NULL);iocb.private = &siocb;/* 發(fā)送消息 */ret = __sock_sendmsg(&iocb, sock, msg, size);/* 返回結(jié)果表明該消息已經(jīng)加入隊(duì)列,要等待完成事件 */if (-EIOCBQUEUED == ret)ret = wait_on_sync_kiocb(&iocb);return ret; } EXPORT_SYMBOL(sock_sendmsg)這里__sock_sendmsg只是做了安全性檢查,然后就調(diào)用了__sock_sendmsg_nosec函數(shù)。
static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock,struct msghdr *msg, size_t size) {int err = security_socket_sendmsg(sock, msg, size);return err ?: __sock_sendmsg_nosec(iocb, sock, msg, size); }再繼續(xù)看__sock_sendmsg_nosec,代碼如下:
static inline int __sock_sendmsg_nosec(struct kiocb *iocb, struct socket *sock,struct msghdr *msg, size_t size) {/* 獲得套接字在sock_sendmsg中設(shè)置的IO請求, */struct sock_iocb *si = kiocb_to_siocb(iocb);sock_update_classid(sock->sk);sock_update_netprioidx(sock->sk);/* 初始化套接字的IO請求字段 */si->sock = sock;si->scm = NULL;si->msg = msg;si->size = size;/* 根據(jù)不同的套接字類型,調(diào)用其發(fā)送數(shù)據(jù)函數(shù) */return sock->ops->sendmsg(iocb, sock, msg, size); }?到此,我們完成了數(shù)據(jù)包從用戶空間到內(nèi)核空間的流程跟蹤。接下來的數(shù)據(jù)包發(fā)送過程,將根據(jù)不同的協(xié)議,走不同的流程。
我们分析UDP的发送,UDP的sendmsg操作函数为udp_sendmsg,代码如下:
int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,size_t len) {/* 從inet通用套接字得到inet套接字 */struct inet_sock *inet = inet_sk(sk);/* 從inet通用套接字得到UDP套接字 */struct udp_sock *up = udp_sk(sk);struct flowi4 fl4_stack;struct flowi4 *fl4;int ulen = len;struct ipcm_cookie ipc;struct rtable *rt = NULL;int free = 0;int connected = 0;__be32 daddr, faddr, saddr;__be16 dport;u8 tos;int err, is_udplite = IS_UDPLITE(sk);/* 是否有數(shù)據(jù)包聚合:或者UDP套接字設(shè)置了聚合選項(xiàng),或者數(shù)據(jù)包消息指明了還有更多數(shù)據(jù)UDP_CORK 或者 MSG_MORE,表示使用單個(gè)數(shù)據(jù)包發(fā)送多個(gè)數(shù)據(jù)*/int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);struct sk_buff *skb;struct ip_options_data opt_copy;/* 數(shù)據(jù)包長度檢查 */ if (len > 0xFFFF)return -EMSGSIZE;/** Check the flags.*//* 檢查消息標(biāo)志,UDP不支持帶外數(shù)據(jù) *if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */return -EOPNOTSUPP;ipc.opt = NULL;ipc.tx_flags = 0;/* 設(shè)置正確的分片函數(shù) */getfrag = is_udplite ? 
udplite_getfrag : ip_generic_getfrag;fl4 = &inet->cork.fl.u.ip4;if (up->pending) {/** There are pending frames.* The socket lock must be held while it's corked.*//* 該UDP套接字還有待發(fā)的數(shù)據(jù)包 */ lock_sock(sk);/* 常見的上鎖雙重檢查機(jī)制 */if (likely(up->pending)) {/* 若待發(fā)的數(shù)據(jù)不是INET數(shù)據(jù),則報(bào)錯(cuò)返回 */if (unlikely(up->pending != AF_INET)) {release_sock(sk);return -EINVAL;}/* 調(diào)到追加數(shù)據(jù)處 */goto do_append_data;}release_sock(sk);}ulen += sizeof(struct udphdr);/** Get and verify the address.*/if (msg->msg_name) {/* 若指定了目標(biāo)地址,則對其進(jìn)行校驗(yàn) */struct sockaddr_in * usin = (struct sockaddr_in *)msg->msg_name;/* 檢查長度 */if (msg->msg_namelen < sizeof(*usin))return -EINVAL;/* 檢查協(xié)議族。目前只支持AF_INET和AF_UNSPEC協(xié)議族 */if (usin->sin_family != AF_INET) {if (usin->sin_family != AF_UNSPEC)return -EAFNOSUPPORT;}/* 若通過了檢查,則設(shè)置目的地址與目的端口 */daddr = usin->sin_addr.s_addr;dport = usin->sin_port;/* 目的端口不能為0 */if (dport == 0)return -EINVAL;} else {/* 如果沒有指定目的地址和目的端口,則當(dāng)前套接字的狀態(tài)必須是已連接,即已經(jīng)調(diào)用過connect設(shè)置了目的地址 */if (sk->sk_state != TCP_ESTABLISHED)return -EDESTADDRREQ;/* 使用之前設(shè)置的目的地址和目的端口 */daddr = inet->inet_daddr;dport = inet->inet_dport;/* Open fast path for connected socket.Route will not be used, if at least one option is set.*/connected = 1;}ipc.addr = inet->inet_saddr;ipc.oif = sk->sk_bound_dev_if;/* 設(shè)置時(shí)間戳標(biāo)志 */err = sock_tx_timestamp(sk, &ipc.tx_flags);if (err)return err;/* 發(fā)送的消息包含控制數(shù)據(jù) */if (msg->msg_controllen) {/* 雖然這個(gè)函數(shù)的名字叫作send,其實(shí)并沒有任何發(fā)送動作,而只是將控制消息設(shè)置到ipc中 */err = ip_cmsg_send(sock_net(sk), msg, &ipc);if (err)return err;/* 設(shè)置釋放ipc.opt的標(biāo)志 */if (ipc.opt)free = 1;connected = 0;}if (!ipc.opt) {/* 如果沒有使用控制消息指定IP選項(xiàng),則檢查套接字的IP選項(xiàng)設(shè)置。如果有,則使用套接字的IP選項(xiàng) */struct ip_options_rcu *inet_opt;rcu_read_lock();inet_opt = rcu_dereference(inet->inet_opt);if (inet_opt) {memcpy(&opt_copy, inet_opt,sizeof(*inet_opt) + inet_opt->opt.optlen);ipc.opt = 
&opt_copy.opt;}rcu_read_unlock();}saddr = ipc.addr;ipc.addr = faddr = daddr;if (ipc.opt && ipc.opt->opt.srr) {/* 設(shè)置了嚴(yán)格路由 */if (!daddr)return -EINVAL;faddr = ipc.opt->opt.faddr;connected = 0;}/*若有下列情況之一的:1)套接字設(shè)置了本地路由標(biāo)志。2)發(fā)送消息時(shí),指明了不做路由。3)設(shè)置了IP嚴(yán)格路由選項(xiàng)。則設(shè)置不查找路由標(biāo)志*/tos = RT_TOS(inet->tos);if (sock_flag(sk, SOCK_LOCALROUTE) ||(msg->msg_flags & MSG_DONTROUTE) ||(ipc.opt && ipc.opt->opt.is_strictroute)) {tos |= RTO_ONLINK;connected = 0;}/* 如果目的地址是多播地址 */if (ipv4_is_multicast(daddr)) {/* 若未指定出口接口,則使用套接字的多播接口索引 */if (!ipc.oif)ipc.oif = inet->mc_index;/* 若源地址為0,則使用套接字的多播地址 */ if (!saddr)saddr = inet->mc_addr;connected = 0;} else if (!ipc.oif)ipc.oif = inet->uc_index;/* 連接標(biāo)志為真,即此次發(fā)送的數(shù)據(jù)包與上次的地址相同,則判斷保存的路由緩存是否還可用。*/if (connected)/* 從套接字檢查并獲得保存的路由緩存 */rt = (struct rtable *)sk_dst_check(sk, 0);/* 若目前路由緩存為空,則需要查找路由 */if (rt == NULL) {struct net *net = sock_net(sk);fl4 = &fl4_stack;/* 根據(jù)套接字和數(shù)據(jù)包的信息,初始化flowi4—這是查找路由的key */flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,RT_SCOPE_UNIVERSE, sk->sk_protocol,inet_sk_flowi_flags(sk)|FLOWI_FLAG_CAN_SLEEP,faddr, saddr, dport, inet->inet_sport,sock_i_uid(sk));/* 查找出口路由 */security_sk_classify_flow(sk, flowi4_to_flowi(fl4));rt = ip_route_output_flow(net, fl4, sk);if (IS_ERR(rt)) {/* 查找路由失敗 */err = PTR_ERR(rt);rt = NULL;if (err == -ENETUNREACH)IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);goto out;}err = -EACCES;/* 若路由是廣播路由,并且套接字非廣播套接字 */if ((rt->rt_flags & RTCF_BROADCAST) &&!sock_flag(sk, SOCK_BROADCAST))goto out;/* 若該UDP為已連接狀態(tài),則保存這個(gè)路由緩存 */ if (connected)sk_dst_set(sk, dst_clone(&rt->dst));}/* 如果數(shù)據(jù)包設(shè)置了MSG_CONFIRM標(biāo)志,則是要告訴鏈路層,對端是可達(dá)的。調(diào)到do_confrim處,可以發(fā)現(xiàn)其實(shí)現(xiàn)方法是在有neibour信息的情況下,直接更新neibour確認(rèn)時(shí)間戳為當(dāng)前時(shí)間。 */if (msg->msg_flags&MSG_CONFIRM)goto do_confirm; back_from_confirm:saddr = fl4->saddr;if (!ipc.addr)daddr = ipc.addr = fl4->daddr;/* Lockless fast path for the non-corking case. 
*//* 沒有使用cork選項(xiàng)或MSG_MORE標(biāo)志。這也是最常見的情況。 */if (!corkreq) {/* 每次都生成一個(gè)UDP數(shù)據(jù)包 */skb = ip_make_skb(sk, fl4, getfrag, msg->msg_iov, ulen,sizeof(struct udphdr), &ipc, &rt,msg->msg_flags);err = PTR_ERR(skb);/* 成功生成了數(shù)據(jù)包 */if (skb && !IS_ERR(skb))/* 發(fā)送UDP數(shù)據(jù)包 */err = udp_send_skb(skb, fl4);goto out;}lock_sock(sk);if (unlikely(up->pending)) {/* The socket is already corked while preparing it. *//* ... which is an evident application bug. --ANK *//*現(xiàn)在馬上要做cork處理,但發(fā)現(xiàn)套接字已經(jīng)cork了。因此這是一個(gè)應(yīng)用程序bug。釋放套接字鎖,并返回錯(cuò)誤。*/release_sock(sk);LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("cork app bug 2\n"));err = -EINVAL;goto out;}/** Now cork the socket to pend data.*//* 設(shè)置cork中的流信息 */ fl4 = &inet->cork.fl.u.ip4;fl4->daddr = daddr;fl4->saddr = saddr;fl4->fl4_dport = dport;fl4->fl4_sport = inet->inet_sport;up->pending = AF_INET;do_append_data:/* 增加UDP數(shù)據(jù)長度 */up->len += ulen;/* 向IP數(shù)據(jù)包中追加新的數(shù)據(jù) */err = ip_append_data(sk, fl4, getfrag, msg->msg_iov, ulen,sizeof(struct udphdr), &ipc, &rt,corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);if (err)// 若發(fā)生錯(cuò)誤,則丟棄所有未決的數(shù)據(jù)包udp_flush_pending_frames(sk);else if (!corkreq)// 若不在cork即阻塞,則發(fā)送所有未決的數(shù)據(jù)包err = udp_push_pending_frames(sk);else if (unlikely(skb_queue_empty(&sk->sk_write_queue)))/* 若沒有未決的數(shù)據(jù)包,則重置未決標(biāo)志 */up->pending = 0;release_sock(sk);out:/* 清理工作,釋放各種資源,并增加相應(yīng)的統(tǒng)計(jì)計(jì)數(shù) */ip_rt_put(rt);if (free)kfree(ipc.opt);if (!err)return len;/** ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space. Reporting* ENOBUFS might not be good (it's not tunable per se), but otherwise* we don't have a good statistic (IpOutDiscards but it can be too many* things). 
We could add another new stat but at least for now that* seems like overkill.*/if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {UDP_INC_STATS_USER(sock_net(sk),UDP_MIB_SNDBUFERRORS, is_udplite);}return err;do_confirm:dst_confirm(&rt->dst);if (!(msg->msg_flags&MSG_PROBE) || len)goto back_from_confirm;err = 0;goto out; } EXPORT_SYMBOL(udp_sendmsg);一般情況下,在使用UDP發(fā)送數(shù)據(jù)包時(shí)很少會使用CORK或MSG_MORE標(biāo)志,因?yàn)槲覀兿M诿看握{(diào)用發(fā)送接口時(shí),就發(fā)送一次UDP數(shù)據(jù)包。因此可以不必考慮CORK和MSG_MORE的情況,而繼續(xù)追蹤udp_send_skb函數(shù)。?
static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) {struct sock *sk = skb->sk;struct inet_sock *inet = inet_sk(sk);struct udphdr *uh;int err = 0;int is_udplite = IS_UDPLITE(sk);int offset = skb_transport_offset(skb);int len = skb->len - offset;__wsum csum = 0;/** Create a UDP header*//* 創(chuàng)建UDP報(bào)文頭部 */ uh = udp_hdr(skb);uh->source = inet->inet_sport;uh->dest = fl4->fl4_dport;uh->len = htons(len);uh->check = 0;/*如果是輕量級UDP協(xié)議,則調(diào)用相應(yīng)的校驗(yàn)和計(jì)算函數(shù)。* 輕量級UDP協(xié)議簡單說就是可以校驗(yàn)指定長度的數(shù)據(jù)長度而不是全部* 減少數(shù)據(jù)丟棄分線。具體google下*/if (is_udplite) /* UDP-Lite */csum = udplite_csum(skb);/* 禁止了UDP校驗(yàn)和 */else if (sk->sk_no_check == UDP_CSUM_NOXMIT) { /* UDP csum disabled */skb->ip_summed = CHECKSUM_NONE;goto send;} else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum *//* 硬件支持校驗(yàn)和的計(jì)算 */udp4_hwcsum(skb, fl4->saddr, fl4->daddr);goto send;} else/* 一般情況下的校驗(yàn)和計(jì)算 */csum = udp_csum(skb);/* add protocol-dependent pseudo-header *//* 計(jì)算UDP的校驗(yàn)和,需要考慮偽首部 */uh->check = csum_tcpudp_magic(fl4->saddr, fl4->daddr, len,sk->sk_protocol, csum);/* 如果校驗(yàn)和為0,則需要將其設(shè)置為0xFFFF。因?yàn)閁DP的零校驗(yàn)和,有特殊的含義,表示沒有校驗(yàn)和。*/if (uh->check == 0)uh->check = CSUM_MANGLED_0;send:/* 發(fā)送IP數(shù)據(jù)包 */err = ip_send_skb(skb);if (err) {if (err == -ENOBUFS && !inet->recverr) {UDP_INC_STATS_USER(sock_net(sk),UDP_MIB_SNDBUFERRORS, is_udplite);err = 0;}} elseUDP_INC_STATS_USER(sock_net(sk),UDP_MIB_OUTDATAGRAMS, is_udplite);return err; }至此,UDP已經(jīng)完成了自己的工作,后面的發(fā)送工作將交由IP層來負(fù)責(zé)。?
参考文档:
1. 《Linux环境编程:从应用到内核》
2. 浅析Linux网络子系统(一)
總結(jié)
以上是生活随笔为你收集整理的tcp/ip 协议栈Linux内核源码分析13 udp套接字发送流程二的全部内容,希望文章能够帮你解决所遇到的问题。
- 上一篇: 社会抚养费是什么意思
- 下一篇: tcp/ip 协议栈Linux内核源码分