mirror of
https://gitee.com/bianbu-linux/linux-6.6
synced 2025-04-24 14:07:52 -04:00
The maximum hash table size is 64K due to the nature of the protocol. [0]
It's smaller than TCP, and fewer sockets can cause a performance drop.

On an EC2 c5.24xlarge instance (192 GiB memory), after running iperf3 in
different netns, creating 32Mi sockets without data transfer in the root
netns causes a regression for iperf3's connection.

  uhash_entries  sockets  length  Gbps
            64K        1       1  5.69
                     1Mi      16  5.27
                     2Mi      32  4.90
                     4Mi      64  4.09
                     8Mi     128  2.96
                    16Mi     256  2.06
                    32Mi     512  1.12

The per-netns hash table breaks the lengthy lists into shorter ones. It is
useful on a multi-tenant system with thousands of netns. With smaller hash
tables, we can look up sockets faster, isolate noisy neighbours, and reduce
lock contention.

The max size of the per-netns table is 64K as well. This is because the
possible hash range produced by udp_hashfn() always fits in 64K within the
same netns, so we cannot make full use of a table with more than 64K buckets.

  /* 0 < num < 64K  ->  X < hash < X + 64K */
  (num + net_hash_mix(net)) & mask;

Also, the min size is 128. We use a bitmap to search for an available port
in udp_lib_get_port(). To keep the bitmap on the stack and not fire the
CONFIG_FRAME_WARN error at build time, we round up the table size to 128.
The sysctl usage is the same as for TCP:

  $ dmesg | cut -d ' ' -f 6- | grep "UDP hash"
  UDP hash table entries: 65536 (order: 9, 2097152 bytes, vmalloc)

  # sysctl net.ipv4.udp_hash_entries
  net.ipv4.udp_hash_entries = 65536  # can be changed by uhash_entries

  # sysctl net.ipv4.udp_child_hash_entries
  net.ipv4.udp_child_hash_entries = 0  # disabled by default

  # ip netns add test1
  # ip netns exec test1 sysctl net.ipv4.udp_hash_entries
  net.ipv4.udp_hash_entries = -65536  # share the global table

  # sysctl -w net.ipv4.udp_child_hash_entries=100
  net.ipv4.udp_child_hash_entries = 100

  # ip netns add test2
  # ip netns exec test2 sysctl net.ipv4.udp_hash_entries
  net.ipv4.udp_hash_entries = 128  # own a per-netns table with 2^n buckets

We could optimise the hash table lookup/iteration further by removing the
netns comparison for the per-netns one in the future. Also, we could
optimise the sparse udp_hslot layout by putting it in udp_table.

[0]: https://lore.kernel.org/netdev/4ACC2815.7010101@gmail.com/

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
164 lines
4.7 KiB
C
164 lines
4.7 KiB
C
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
|
/*
|
|
* INET An implementation of the TCP/IP protocol suite for the LINUX
|
|
* operating system. INET is implemented using the BSD Socket
|
|
* interface as the means of communication with the user level.
|
|
*
|
|
* Definitions for the UDP protocol.
|
|
*
|
|
* Version: @(#)udp.h 1.0.2 04/28/93
|
|
*
|
|
* Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
|
|
*/
|
|
#ifndef _LINUX_UDP_H
|
|
#define _LINUX_UDP_H
|
|
|
|
#include <net/inet_sock.h>
|
|
#include <linux/skbuff.h>
|
|
#include <net/netns/hash.h>
|
|
#include <uapi/linux/udp.h>
|
|
|
|
/*
 * udp_hdr - view the transport header of @skb as a UDP header
 * @skb: buffer whose transport header pointer is read
 *
 * Return: the pointer produced by skb_transport_header(), typed as
 * struct udphdr.  No validation of the header is performed here.
 */
static inline struct udphdr *udp_hdr(const struct sk_buff *skb)
{
	void *th = skb_transport_header(skb);

	return th;
}
|
|
|
|
/*
 * Smallest per-netns UDP hash table.  Table sizes are rounded up to 128 so
 * that the port-search bitmap used by udp_lib_get_port() can stay on the
 * stack without tripping CONFIG_FRAME_WARN.
 */
#define UDP_HTABLE_SIZE_MIN_PERNET	128
|
|
/*
 * Minimum UDP hash table size: 128 when CONFIG_BASE_SMALL is non-zero,
 * 256 otherwise.
 * NOTE(review): presumably applies to the global (non per-netns) table —
 * confirm against the table allocation code.
 */
#define UDP_HTABLE_SIZE_MIN		(CONFIG_BASE_SMALL ? 128 : 256)
|
|
/* Largest UDP hash table: 64K buckets, a limit imposed by the protocol. */
#define UDP_HTABLE_SIZE_MAX		65536
|
|
|
|
/*
 * udp_hashfn - compute a UDP hash value for a number within a netns
 * @net:  network namespace whose net_hash_mix() value is added in
 * @num:  value to hash
 * @mask: bit mask applied to the mixed value
 *        (presumably table size - 1; confirm against callers)
 *
 * Return: (@num + net_hash_mix(@net)) reduced by @mask.
 */
static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask)
{
	u32 mixed = num + net_hash_mix(net);

	return mixed & mask;
}
|
|
|
|
struct udp_sock {
|
|
/* inet_sock has to be the first member */
|
|
struct inet_sock inet;
|
|
#define udp_port_hash inet.sk.__sk_common.skc_u16hashes[0]
|
|
#define udp_portaddr_hash inet.sk.__sk_common.skc_u16hashes[1]
|
|
#define udp_portaddr_node inet.sk.__sk_common.skc_portaddr_node
|
|
int pending; /* Any pending frames ? */
|
|
unsigned int corkflag; /* Cork is required */
|
|
__u8 encap_type; /* Is this an Encapsulation socket? */
|
|
unsigned char no_check6_tx:1,/* Send zero UDP6 checksums on TX? */
|
|
no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */
|
|
encap_enabled:1, /* This socket enabled encap
|
|
* processing; UDP tunnels and
|
|
* different encapsulation layer set
|
|
* this
|
|
*/
|
|
gro_enabled:1, /* Request GRO aggregation */
|
|
accept_udp_l4:1,
|
|
accept_udp_fraglist:1;
|
|
/*
|
|
* Following member retains the information to create a UDP header
|
|
* when the socket is uncorked.
|
|
*/
|
|
__u16 len; /* total length of pending frames */
|
|
__u16 gso_size;
|
|
/*
|
|
* Fields specific to UDP-Lite.
|
|
*/
|
|
__u16 pcslen;
|
|
__u16 pcrlen;
|
|
/* indicator bits used by pcflag: */
|
|
#define UDPLITE_BIT 0x1 /* set by udplite proto init function */
|
|
#define UDPLITE_SEND_CC 0x2 /* set via udplite setsockopt */
|
|
#define UDPLITE_RECV_CC 0x4 /* set via udplite setsocktopt */
|
|
__u8 pcflag; /* marks socket as UDP-Lite if > 0 */
|
|
__u8 unused[3];
|
|
/*
|
|
* For encapsulation sockets.
|
|
*/
|
|
int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
|
|
void (*encap_err_rcv)(struct sock *sk, struct sk_buff *skb, int err,
|
|
__be16 port, u32 info, u8 *payload);
|
|
int (*encap_err_lookup)(struct sock *sk, struct sk_buff *skb);
|
|
void (*encap_destroy)(struct sock *sk);
|
|
|
|
/* GRO functions for UDP socket */
|
|
struct sk_buff * (*gro_receive)(struct sock *sk,
|
|
struct list_head *head,
|
|
struct sk_buff *skb);
|
|
int (*gro_complete)(struct sock *sk,
|
|
struct sk_buff *skb,
|
|
int nhoff);
|
|
|
|
/* udp_recvmsg try to use this before splicing sk_receive_queue */
|
|
struct sk_buff_head reader_queue ____cacheline_aligned_in_smp;
|
|
|
|
/* This field is dirtied by udp_recvmsg() */
|
|
int forward_deficit;
|
|
|
|
/* This fields follows rcvbuf value, and is touched by udp_recvmsg */
|
|
int forward_threshold;
|
|
};
|
|
|
|
/*
 * Maximum number of UDP segments: 64.  The expression has type int (a
 * suffix on the shift count would not change that).
 */
#define UDP_MAX_SEGMENTS	(1 << 6)
|
|
|
|
/*
 * udp_sk - reinterpret a struct sock pointer as its struct udp_sock
 * @sk: socket pointer; the const qualifier is dropped by the cast
 *
 * Return: @sk cast to struct udp_sock *.  This is a pure pointer cast; it
 * does not check that @sk really is a UDP socket.
 */
static inline struct udp_sock *udp_sk(const struct sock *sk)
{
	const struct udp_sock *up = (const struct udp_sock *)sk;

	return (struct udp_sock *)up;
}
|
|
|
|
/*
 * udp_set_no_check6_tx - store @val in the socket's no_check6_tx bit
 * @sk:  socket, accessed as a struct udp_sock through udp_sk()
 * @val: new value of the "send zero UDP6 checksums on TX" flag
 */
static inline void udp_set_no_check6_tx(struct sock *sk, bool val)
{
	struct udp_sock *up = udp_sk(sk);

	up->no_check6_tx = val;
}
|
|
|
|
/*
 * udp_set_no_check6_rx - store @val in the socket's no_check6_rx bit
 * @sk:  socket, accessed as a struct udp_sock through udp_sk()
 * @val: new value of the "allow zero UDP6 checksums on RX" flag
 */
static inline void udp_set_no_check6_rx(struct sock *sk, bool val)
{
	struct udp_sock *up = udp_sk(sk);

	up->no_check6_rx = val;
}
|
|
|
|
static inline bool udp_get_no_check6_tx(struct sock *sk)
|
|
{
|
|
return udp_sk(sk)->no_check6_tx;
|
|
}
|
|
|
|
static inline bool udp_get_no_check6_rx(struct sock *sk)
|
|
{
|
|
return udp_sk(sk)->no_check6_rx;
|
|
}
|
|
|
|
static inline void udp_cmsg_recv(struct msghdr *msg, struct sock *sk,
|
|
struct sk_buff *skb)
|
|
{
|
|
int gso_size;
|
|
|
|
if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) {
|
|
gso_size = skb_shinfo(skb)->gso_size;
|
|
put_cmsg(msg, SOL_UDP, UDP_GRO, sizeof(gso_size), &gso_size);
|
|
}
|
|
}
|
|
|
|
static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb)
|
|
{
|
|
if (!skb_is_gso(skb))
|
|
return false;
|
|
|
|
if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && !udp_sk(sk)->accept_udp_l4)
|
|
return true;
|
|
|
|
if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST && !udp_sk(sk)->accept_udp_fraglist)
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
static inline void udp_allow_gso(struct sock *sk)
|
|
{
|
|
udp_sk(sk)->accept_udp_l4 = 1;
|
|
udp_sk(sk)->accept_udp_fraglist = 1;
|
|
}
|
|
|
|
/*
 * Iterate over the sockets on hlist @list that are linked through
 * skc_portaddr_node, with @__sk as the cursor.
 */
#define udp_portaddr_for_each_entry(__sk, list)				\
	hlist_for_each_entry(__sk, list, __sk_common.skc_portaddr_node)
|
|
|
|
/*
 * Same walk over skc_portaddr_node links as the non-RCU iterator, but
 * built on hlist_for_each_entry_rcu().
 */
#define udp_portaddr_for_each_entry_rcu(__sk, list)			\
	hlist_for_each_entry_rcu(__sk, list, __sk_common.skc_portaddr_node)
|
|
|
|
/*
 * True when the socket's sk_protocol is IPPROTO_UDPLITE.  The argument is
 * parenthesized so that expressions such as IS_UDPLITE(&obj) or
 * IS_UDPLITE(cond ? a : b) bind to the whole argument rather than
 * mis-parsing around the '->' operator.
 */
#define IS_UDPLITE(__sk) ((__sk)->sk_protocol == IPPROTO_UDPLITE)
|
|
|
|
#endif /* _LINUX_UDP_H */
|