linux内核协议栈 UDP之数据报接收过程

2021-06-04 Linux后端开发

UDP报文接收概述

UDP数据报的接收要分两部分来看：

网络层接收完数据包并将其递交给UDP之后，UDP的处理过程。该过程中UDP需要做的工作就是接收数据包并对其进行校验，校验成功后将其放入接收队列 sk_receive_queue 中，等待用户空间程序来读取。
用户空间程序调用read()等系统调用读取已经放入接收队列 sk_receive_queue 中的数据。

从IP层接收数据包 udp_rcv()

该函数是在AF_INET协议族初始化时，由UDP注册给网络层的回调函数，当网络层代码处理完一个输入数据包后，如果该数据包是发往本机的，并且其上层协议就是UDP，那么会调用该回调函数。

/*
 * UDP receive entry point: forwards the incoming skb to __udp4_lib_rcv()
 * together with the udp_table hash table and IPPROTO_UDP as the protocol,
 * and returns that function's result unchanged.
 */
int udp_rcv(struct sk_buff *skb)
{
	return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP);
}
 
@skb: 输入数据包
@udptable：已绑定端口的UDP传输控制块哈希表，将从该哈希表中查找该skb属于哪个套接字
@proto：L4协议号，到这里可能是IPPROTO_UDP或者IPPROTO_UDPLITE
/*
 * Receive path for a UDP (or UDP-Lite) datagram handed up by the IP layer.
 *
 * @skb:      incoming packet
 * @udptable: hash table searched to find the socket that owns the packet
 * @proto:    L4 protocol number; this function distinguishes IPPROTO_UDP
 *            from IPPROTO_UDPLITE
 *
 * Returns 0 once the skb has been queued or freed here, the result of
 * __udp4_lib_mcast_deliver() for broadcast/multicast routes, or -ret when
 * udp_queue_rcv_skb() returned ret > 0 to request resubmission.
 */
int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
		   int proto)
{
	struct sock *sk;
	struct udphdr *uh;
	unsigned short ulen;
	struct rtable *rt = skb_rtable(skb);
	__be32 saddr, daddr;
	struct net *net = dev_net(skb->dev);

	/*
	 *  Validate the packet.
	 */
	/* Adjust the skb layout so the linear area holds at least the UDP header. */
	if (!pskb_may_pull(skb, sizeof(struct udphdr)))
		goto drop;		/* No space for header. */

	uh   = udp_hdr(skb);
	ulen = ntohs(uh->len);
	/*
	 * Read the addresses before the first jump to short_packet: both the
	 * short_packet and csum_error paths print them, so they must never be
	 * left uninitialized when those labels are reached.
	 */
	saddr = ip_hdr(skb)->saddr;
	daddr = ip_hdr(skb)->daddr;

	/* The skb must hold at least as many bytes as the UDP header claims. */
	if (ulen > skb->len)
		goto short_packet;

	if (proto == IPPROTO_UDP) {
		/*
		 * 1. The UDP length must cover at least the header.
		 * 2. pskb_trim_rcsum() strips any padding (IP may pad very
		 *    small datagrams) and adjusts the checksum accordingly.
		 */
		if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen))
			goto short_packet;
		/* Reload the header pointer after the trim. */
		uh = udp_hdr(skb);
	}

	/* Set up checksum verification. */
	if (udp4_csum_init(skb, uh, proto))
		goto csum_error;

	/* Broadcast and multicast datagrams take a separate delivery path. */
	if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
		return __udp4_lib_mcast_deliver(net, skb, uh, saddr, daddr, udptable);

	/* Use the source and destination ports to find the receiving socket. */
	sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);

	/* A socket owns this datagram: queue it and drop our reference. */
	if (sk != NULL) {
		int ret = udp_queue_rcv_skb(sk, skb);
		sock_put(sk);

		/* a return value > 0 means to resubmit the input, but
		 * it wants the return to be -protocol, or 0
		 */
		if (ret > 0)
			return -ret;
		return 0;
	}

	/* No socket wants this datagram: account for it and discard it. */

	/* IPsec policy check. */
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto drop;
	nf_reset(skb);

	/* No socket. Drop packet silently, if checksum is wrong */
	if (udp_lib_checksum_complete(skb))
		goto csum_error;

	/* Count the "no port" event and answer with ICMP port unreachable. */
	UDP_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

	/*
	 * Hmm.  We got an UDP packet to a port to which we
	 * don't wanna listen.  Ignore it.
	 */
	kfree_skb(skb);
	return 0;

short_packet:
	LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n",
		       proto == IPPROTO_UDPLITE ? "-Lite" : "",
		       &saddr,
		       ntohs(uh->source),
		       ulen,
		       skb->len,
		       &daddr,
		       ntohs(uh->dest));
	goto drop;

csum_error:
	/*
	 * RFC1122: OK.  Discards the bad packet silently (as far as
	 * the network is concerned, anyway) as per 4.1.3.4 (MUST).
	 */
	LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n",
		       proto == IPPROTO_UDPLITE ? "-Lite" : "",
		       &saddr,
		       ntohs(uh->source),
		       &daddr,
		       ntohs(uh->dest),
		       ulen);
drop:
	UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
	kfree_skb(skb);
	return 0;
}

疑惑：为何校验和的计算和验证要分udp4_csum_init()和udp_lib_checksum_complete()两步完成？？？

查找数据包所属套接字 __udp4_lib_lookup_skb()

如上，非常关键的一步就是根据数据包中目的地址信息寻找应该由谁来处理该数据包。

/*
 * Find the socket for an incoming skb.
 *
 * If a socket is already attached to the skb (an earlier stage may have
 * looked it up and recorded it there), that socket is taken over and
 * returned. Otherwise the lookup is done in udptable using the addresses
 * from the IP header, the given ports and the input interface.
 */
static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
						 __be16 sport, __be16 dport,
						 struct udp_table *udptable)
{
	const struct iphdr *iph = ip_hdr(skb);
	struct sock *cached = skb_steal_sock(skb);

	/* Rare case: the skb already carries its socket. */
	if (unlikely(cached))
		return cached;

	/* Nothing cached on the skb, so search the hash table. */
	return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev),
				 iph->saddr, sport,
				 iph->daddr, dport,
				 inet_iif(skb), udptable);
}
 
@dif: 该数据包的输入网络设备接口
/*
 * Look up the socket that should receive a datagram.
 *
 * The destination port (converted to host byte order) is hashed to pick a
 * slot in udptable. Every socket on that slot's chain is scored with
 * compute_score() and the highest-scoring one is returned with its
 * reference count raised. NULL is returned when no socket scores above -1,
 * or when the chosen socket's reference count had already dropped to zero.
 *
 * @net, @saddr, @sport, @daddr, @dport, @dif are passed through to
 * compute_score(); @dif is the input network interface of the packet.
 *
 * The whole search runs between rcu_read_lock() and rcu_read_unlock().
 */
static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
		__be16 sport, __be32 daddr, __be16 dport,
		int dif, struct udp_table *udptable)
{
	struct sock *sk, *result;
	struct hlist_nulls_node *node;
	/* The destination port, in host byte order, is the hash key. */
	unsigned short hnum = ntohs(dport);
	unsigned int hash = udp_hashfn(net, hnum);
	struct udp_hslot *hslot = &udptable->hash[hash];
	int score, badness;
 
	rcu_read_lock();
begin:
	/* Walk the collision chain and keep the best-scoring socket in result. */
	result = NULL;
	/* Starting at -1 means only a score of 0 or higher can be selected. */
	badness = -1;
	sk_nulls_for_each_rcu(sk, node, &hslot->head) {
		score = compute_score(sk, net, saddr, hnum, sport,
				      daddr, dport, dif);
		if (score > badness) {
			result = sk;
			badness = score;
		}
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != hash)
		goto begin;
 
	if (result) {
		/*
		 * Take a reference only if the count is still non-zero;
		 * otherwise report "not found".
		 */
		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
			result = NULL;
		/*
		 * Re-score now that the reference is held. If the score fell
		 * below the one recorded during the walk, release the
		 * reference and redo the whole search.
		 */
		else if (unlikely(compute_score(result, net, saddr, hnum, sport,
				  daddr, dport, dif) < badness)) {
			sock_put(result);
			goto begin;
		}
	}
	rcu_read_unlock();
	return result;
}

疑惑：查个表为什么这么复杂，这个分值什么鬼???

数据包进入队列 udp_queue_rcv_skb()

找到数据包目的端口对应的传输控制块后，会调用该函数接收该数据包。

/*
 * Deliver a received datagram to the socket that was found for it.
 *
 * @sk:  destination socket
 * @skb: the datagram
 *
 * returns:
 *  -1: error
 *   0: success
 *  >0: "udp encap" protocol resubmission
 *
 * Note that in the success and error cases, the skb is assumed to
 * have either been requeued or freed.
 */
int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	struct udp_sock *up = udp_sk(sk);
	int rc;
	int is_udplite = IS_UDPLITE(sk);

	/* IPsec policy check. */
	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto drop;
	nf_reset(skb);

	/* Encapsulation sockets get first look at the packet. */
	if (up->encap_type) {
		/*
		 * This is an encapsulation socket so pass the skb to
		 * the socket's udp_encap_rcv() hook. Otherwise, just
		 * fall through and pass this up the UDP socket.
		 * up->encap_rcv() returns the following value:
		 * =0 if skb was successfully passed to the encap
		 *    handler or was discarded by it.
		 * >0 if skb should be passed on to UDP.
		 * <0 if skb should be resubmitted as proto -N
		 */

		/* if we're overly short, let UDP handle it */
		if (skb->len > sizeof(struct udphdr) &&
		    up->encap_rcv != NULL) {
			int ret;

			ret = (*up->encap_rcv)(sk, skb);
			if (ret <= 0) {
				UDP_INC_STATS_BH(sock_net(sk),
						 UDP_MIB_INDATAGRAMS,
						 is_udplite);
				return -ret;
			}
		}

		/* FALLTHROUGH -- it's a UDP Packet */
	}

	/* UDP-Lite partial checksum coverage checks. */
	if ((is_udplite & UDPLITE_RECV_CC)  &&  UDP_SKB_CB(skb)->partial_cov) {

		/*
		 * MIB statistics other than incrementing the error count are
		 * disabled for the following two types of errors: these depend
		 * on the Application settings, not on the functioning of the
		 * protocol stack as such.
		 *
		 * RFC 3828 here recommends (sec 3.3): "There should also be a
		 * way ... to ... at least let the receiving application block
		 * delivery of packets with coverage values less than a value
		 * provided by the application."
		 */
		if (up->pcrlen == 0) {          /* full coverage was set  */
			LIMIT_NETDEBUG(KERN_WARNING "UDPLITE: partial coverage "
				"%d while full coverage %d requested\n",
				UDP_SKB_CB(skb)->cscov, skb->len);
			goto drop;
		}
		/* The next case involves violating the min. coverage requested
		 * by the receiver. This is subtle: if receiver wants x and x is
		 * greater than the buffersize/MTU then receiver will complain
		 * that it wants x while sender emits packets of smaller size y.
		 * Therefore the above ...()->partial_cov statement is essential.
		 */
		if (UDP_SKB_CB(skb)->cscov  <  up->pcrlen) {
			LIMIT_NETDEBUG(KERN_WARNING
				"UDPLITE: coverage %d too small, need min %d\n",
				UDP_SKB_CB(skb)->cscov, up->pcrlen);
			goto drop;
		}
	}

	/*
	 * When a socket filter is attached, finish checksum verification
	 * first so the filter only ever sees packets that passed it.
	 */
	if (sk->sk_filter) {
		if (udp_lib_checksum_complete(skb))
			goto drop;
	}

	rc = 0;

	/* Lock the socket. */
	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/* No user context owns the socket: queue straight away. */
		rc = __udp_queue_rcv_skb(sk, skb);
	} else {
		/*
		 * The socket is owned by a user context: park the skb on the
		 * backlog queue. Backlog entries are moved to the receive
		 * queue in release_sock().
		 */
		sk_add_backlog(sk, skb);
	}
	bh_unlock_sock(sk);

	return rc;

drop:
	UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
	kfree_skb(skb);
	return -1;
}

数据包进接收队列 sk_receive_queue

/*
 * Put the skb on the socket's receive queue via sock_queue_rcv_skb().
 *
 * Returns 0 when the skb was queued. On any failure the skb is freed,
 * UDP_MIB_INERRORS is bumped and -1 is returned.
 */
static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int is_udplite = IS_UDPLITE(sk);
	int err = sock_queue_rcv_skb(sk, skb);

	if (err >= 0)
		return 0;

	/* Note that an ENOMEM error is charged twice */
	if (err == -ENOMEM) {
		/* Receive buffer exhausted: count it and record the drop. */
		UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
				 is_udplite);
		atomic_inc(&sk->sk_drops);
	}

	UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
	kfree_skb(skb);
	return -1;
}
 
/*
 * Append an skb to the socket's receive queue and signal data-ready.
 *
 * Returns 0 on success, -ENOMEM when the receive buffer limit would be
 * reached, the non-zero result of sk_filter() when the filter rejects the
 * packet, or -ENOBUFS when sk_rmem_schedule() refuses the memory charge.
 * The skb is not freed here on failure.
 */
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int filter_rc;
	int queued_len;

	/* Refuse the packet if accepting it would reach the rcvbuf limit. */
	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
	    (unsigned)sk->sk_rcvbuf)
		return -ENOMEM;

	/* Run the socket filter; a non-zero result rejects the packet. */
	filter_rc = sk_filter(sk, skb);
	if (filter_rc)
		return filter_rc;

	/* Memory accounting for the skb must succeed before it is queued. */
	if (!sk_rmem_schedule(sk, skb->truesize))
		return -ENOBUFS;

	skb->dev = NULL;
	/* From here on the skb is charged to this socket. */
	skb_set_owner_r(skb, sk);

	/* Cache the SKB length before we tack it onto the receive
	 * queue.  Once it is added it no longer belongs to us and
	 * may be freed by other threads of control pulling packets
	 * from the queue.
	 */
	queued_len = skb->len;
	skb_queue_tail(&sk->sk_receive_queue, skb);

	/* Notify anyone who may be blocked waiting for data. */
	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, queued_len);

	return 0;
}

唤醒阻塞进程 sock_def_readable（进接收队列唤醒）

将数据放入接收队列后，需要唤醒那些因为数据不足而阻塞的进程，这是通过上面的sk->sk_data_ready()回调实现的，对于UDP，该函数就是 sock_def_readable。

/*
 * Data-ready callback: wakes tasks sleeping on the socket's wait queue and
 * issues the asynchronous notification, all under the read side of
 * sk_callback_lock. The len argument is not used by this body.
 */
static void sock_def_readable(struct sock *sk, int len)
{
	read_lock(&sk->sk_callback_lock);

	/* Only touch the wait queue when a task is actually sleeping on it. */
	if (sk_has_sleeper(sk)) {
		wake_up_interruptible_sync_poll(sk->sk_sleep,
						POLLIN | POLLRDNORM | POLLRDBAND);
	}

	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);

	read_unlock(&sk->sk_callback_lock);
}
 
/*
 * Report whether any task is waiting on the socket's sk_sleep wait queue.
 * Returns 1 if the queue exists and is active, 0 otherwise.
 */
static inline int sk_has_sleeper(struct sock *sk)
{
	/*
	 * Stay in sync with add_wait_queue updates to the wait queue;
	 * the matching memory barrier is in sock_poll_wait.
	 */
	smp_mb__after_lock();

	/* Blocked tasks sleep on sk->sk_sleep; no queue means no sleepers. */
	if (!sk->sk_sleep)
		return 0;

	return waitqueue_active(sk->sk_sleep) ? 1 : 0;
}

数据包进后备队列 sk_backlog

在下半部接收时，如果传输控制块已经被进程锁定，那么会先将数据放入到后备队列中，等进程释放传输控制块时再进行处理，这种设计可以使得软中断能够尽快的结束。

/*
 * Append an skb to the tail of the socket's backlog queue.
 *
 * The per-socket spinlock (sk_lock.slock) must be held by the caller.
 */
static inline void sk_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	struct sk_buff *last = sk->sk_backlog.tail;

	/* Link after the current tail, or become the head of an empty queue. */
	if (last)
		last->next = skb;
	else
		sk->sk_backlog.head = skb;

	sk->sk_backlog.tail = skb;
	skb->next = NULL;
}