From 94623f579ce338b5fa61b5acaa5beb8aa657fb9e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 3 Apr 2023 13:54:37 +0200 Subject: [PATCH 01/33] netfilter: br_netfilter: fix recent physdev match breakage Recent attempt to ensure PREROUTING hook is executed again when a decrypted ipsec packet received on a bridge passes through the network stack a second time broke the physdev match in INPUT hook. We can't discard the nf_bridge info strct from sabotage_in hook, as this is needed by the physdev match. Keep the struct around and handle this with another conditional instead. Fixes: 2b272bb558f1 ("netfilter: br_netfilter: disable sabotage_in hook after first suppression") Reported-and-tested-by: Farid BENAMROUCHE Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/skbuff.h | 1 + net/bridge/br_netfilter_hooks.c | 17 +++++++++++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ff7ad331fb82..1ec3530c8191 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -294,6 +294,7 @@ struct nf_bridge_info { u8 pkt_otherhost:1; u8 in_prerouting:1; u8 bridged_dnat:1; + u8 sabotage_in_done:1; __u16 frag_max_size; struct net_device *physindev; diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 638a4d5359db..4bc6761517bb 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -868,12 +868,17 @@ static unsigned int ip_sabotage_in(void *priv, { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); - if (nf_bridge && !nf_bridge->in_prerouting && - !netif_is_l3_master(skb->dev) && - !netif_is_l3_slave(skb->dev)) { - nf_bridge_info_free(skb); - state->okfn(state->net, state->sk, skb); - return NF_STOLEN; + if (nf_bridge) { + if (nf_bridge->sabotage_in_done) + return NF_ACCEPT; + + if (!nf_bridge->in_prerouting && + !netif_is_l3_master(skb->dev) && + !netif_is_l3_slave(skb->dev)) { + nf_bridge->sabotage_in_done = 1; + state->okfn(state->net, state->sk, skb); + return NF_STOLEN; + } } return NF_ACCEPT; From af0acf22aea359e04412237d68787401f96bb583 Mon Sep 17 00:00:00 2001 From: Chen Aotian Date: Thu, 6 Apr 2023 12:01:51 +0800 Subject: [PATCH 02/33] netfilter: nf_tables: Modify nla_memdup's flag to GFP_KERNEL_ACCOUNT For memory alloc that store user data from nla[NFTA_OBJ_USERDATA], use GFP_KERNEL_ACCOUNT is more suitable. Fixes: 33758c891479 ("memcg: enable accounting for nft objects") Signed-off-by: Chen Aotian Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 6004d4b24451..cd52109e674a 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -7052,7 +7052,7 @@ static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info, } if (nla[NFTA_OBJ_USERDATA]) { - obj->udata = nla_memdup(nla[NFTA_OBJ_USERDATA], GFP_KERNEL); + obj->udata = nla_memdup(nla[NFTA_OBJ_USERDATA], GFP_KERNEL_ACCOUNT); if (obj->udata == NULL) goto err_userdata; From 3037933448f60f9acb705997eae62013ecb81e0d Mon Sep 17 00:00:00 2001 From: Gwangun Jung Date: Thu, 13 Apr 2023 19:35:54 +0900 Subject: [PATCH 03/33] net: sched: sch_qfq: prevent slab-out-of-bounds in qfq_activate_agg If the TCA_QFQ_LMAX value is not offered through nlattr, lmax is determined by the MTU value of the network device. The MTU of the loopback device can be set up to 2^31-1. 
As a result, it is possible to have an lmax value that exceeds QFQ_MIN_LMAX. Due to the invalid lmax value, an index is generated that exceeds the QFQ_MAX_INDEX(=24) value, causing out-of-bounds read/write errors. The following reports a oob access: [ 84.582666] BUG: KASAN: slab-out-of-bounds in qfq_activate_agg.constprop.0 (net/sched/sch_qfq.c:1027 net/sched/sch_qfq.c:1060 net/sched/sch_qfq.c:1313) [ 84.583267] Read of size 4 at addr ffff88810f676948 by task ping/301 [ 84.583686] [ 84.583797] CPU: 3 PID: 301 Comm: ping Not tainted 6.3.0-rc5 #1 [ 84.584164] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 [ 84.584644] Call Trace: [ 84.584787] [ 84.584906] dump_stack_lvl (lib/dump_stack.c:107 (discriminator 1)) [ 84.585108] print_report (mm/kasan/report.c:320 mm/kasan/report.c:430) [ 84.585570] kasan_report (mm/kasan/report.c:538) [ 84.585988] qfq_activate_agg.constprop.0 (net/sched/sch_qfq.c:1027 net/sched/sch_qfq.c:1060 net/sched/sch_qfq.c:1313) [ 84.586599] qfq_enqueue (net/sched/sch_qfq.c:1255) [ 84.587607] dev_qdisc_enqueue (net/core/dev.c:3776) [ 84.587749] __dev_queue_xmit (./include/net/sch_generic.h:186 net/core/dev.c:3865 net/core/dev.c:4212) [ 84.588763] ip_finish_output2 (./include/net/neighbour.h:546 net/ipv4/ip_output.c:228) [ 84.589460] ip_output (net/ipv4/ip_output.c:430) [ 84.590132] ip_push_pending_frames (./include/net/dst.h:444 net/ipv4/ip_output.c:126 net/ipv4/ip_output.c:1586 net/ipv4/ip_output.c:1606) [ 84.590285] raw_sendmsg (net/ipv4/raw.c:649) [ 84.591960] sock_sendmsg (net/socket.c:724 net/socket.c:747) [ 84.592084] __sys_sendto (net/socket.c:2142) [ 84.593306] __x64_sys_sendto (net/socket.c:2150) [ 84.593779] do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) [ 84.593902] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120) [ 84.594070] RIP: 0033:0x7fe568032066 [ 84.594192] Code: 0e 0d 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b8 0f 1f 00 41 89 ca 64 8b 04 25 18 00 00 00 85 c09[ 84.594796] RSP: 002b:00007ffce388b4e8 EFLAGS: 00000246 ORIG_RAX: 000000000000002c Code starting with the faulting instruction =========================================== [ 84.595047] RAX: ffffffffffffffda RBX: 00007ffce388cc70 RCX: 00007fe568032066 [ 84.595281] RDX: 0000000000000040 RSI: 00005605fdad6d10 RDI: 0000000000000003 [ 84.595515] RBP: 00005605fdad6d10 R08: 00007ffce388eeec R09: 0000000000000010 [ 84.595749] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000040 [ 84.595984] R13: 00007ffce388cc30 R14: 00007ffce388b4f0 R15: 0000001d00000001 [ 84.596218] [ 84.596295] [ 84.596351] Allocated by task 291: [ 84.596467] kasan_save_stack (mm/kasan/common.c:46) [ 84.596597] kasan_set_track (mm/kasan/common.c:52) [ 84.596725] __kasan_kmalloc (mm/kasan/common.c:384) [ 84.596852] __kmalloc_node (./include/linux/kasan.h:196 mm/slab_common.c:967 mm/slab_common.c:974) [ 84.596979] qdisc_alloc (./include/linux/slab.h:610 ./include/linux/slab.h:731 net/sched/sch_generic.c:938) [ 84.597100] qdisc_create (net/sched/sch_api.c:1244) [ 84.597222] tc_modify_qdisc (net/sched/sch_api.c:1680) [ 84.597357] rtnetlink_rcv_msg (net/core/rtnetlink.c:6174) [ 84.597495] netlink_rcv_skb (net/netlink/af_netlink.c:2574) [ 84.597627] netlink_unicast (net/netlink/af_netlink.c:1340 net/netlink/af_netlink.c:1365) [ 84.597759] netlink_sendmsg (net/netlink/af_netlink.c:1942) [ 84.597891] sock_sendmsg (net/socket.c:724 net/socket.c:747) [ 84.598016] ____sys_sendmsg (net/socket.c:2501) [ 84.598147] ___sys_sendmsg (net/socket.c:2557) [ 84.598275] 
__sys_sendmsg (./include/linux/file.h:31 net/socket.c:2586) [ 84.598399] do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) [ 84.598520] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120) [ 84.598688] [ 84.598744] The buggy address belongs to the object at ffff88810f674000 [ 84.598744] which belongs to the cache kmalloc-8k of size 8192 [ 84.599135] The buggy address is located 2664 bytes to the right of [ 84.599135] allocated 7904-byte region [ffff88810f674000, ffff88810f675ee0) [ 84.599544] [ 84.599598] The buggy address belongs to the physical page: [ 84.599777] page:00000000e638567f refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x10f670 [ 84.600074] head:00000000e638567f order:3 entire_mapcount:0 nr_pages_mapped:0 pincount:0 [ 84.600330] flags: 0x200000000010200(slab|head|node=0|zone=2) [ 84.600517] raw: 0200000000010200 ffff888100043180 dead000000000122 0000000000000000 [ 84.600764] raw: 0000000000000000 0000000080020002 00000001ffffffff 0000000000000000 [ 84.601009] page dumped because: kasan: bad access detected [ 84.601187] [ 84.601241] Memory state around the buggy address: [ 84.601396] ffff88810f676800: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 84.601620] ffff88810f676880: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 84.601845] >ffff88810f676900: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 84.602069] ^ [ 84.602243] ffff88810f676980: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 84.602468] ffff88810f676a00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 84.602693] ================================================================== [ 84.602924] Disabling lock debugging due to kernel taint Fixes: 3015f3d2a3cd ("pkt_sched: enable QFQ to support TSO/GSO") Reported-by: Gwangun Jung Signed-off-by: Gwangun Jung Acked-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- net/sched/sch_qfq.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index cf5ebe43b3b4..02098a02943e 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -421,15 +421,16 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, } else weight = 1; - if (tb[TCA_QFQ_LMAX]) { + if (tb[TCA_QFQ_LMAX]) lmax = nla_get_u32(tb[TCA_QFQ_LMAX]); - if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) { - pr_notice("qfq: invalid max length %u\n", lmax); - return -EINVAL; - } - } else + else lmax = psched_mtu(qdisc_dev(sch)); + if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) { + pr_notice("qfq: invalid max length %u\n", lmax); + return -EINVAL; + } + inv_w = ONE_FP / weight; weight = ONE_FP / inv_w; From c730fce7c70cfce831f4bdc9e49880ba1f61a092 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 14 Apr 2023 17:47:55 +0200 Subject: [PATCH 04/33] s390/bpf: Fix bpf_arch_text_poke() with new_addr == NULL Thomas Richter reported a crash in linux-next with a backtrace similar to the following one: [<0000000000000000>] 0x0 ([<000000000031a182>] bpf_trace_run4+0xc2/0x218) [<00000000001d59f4>] __bpf_trace_sched_switch+0x1c/0x28 [<0000000000c44a3a>] __schedule+0x43a/0x890 [<0000000000c44ef8>] schedule+0x68/0x110 [<0000000000c4e5ca>] do_nanosleep+0xa2/0x168 [<000000000026e7fe>] hrtimer_nanosleep+0xf6/0x1c0 [<000000000026eb6e>] __s390x_sys_nanosleep+0xb6/0xf0 [<0000000000c3b81c>] __do_syscall+0x1e4/0x208 [<0000000000c50510>] system_call+0x70/0x98 Last Breaking-Event-Address: [<000003ff7fda1814>] bpf_prog_65e887c70a835bbf_on_switch+0x1a4/0x1f0 The problem is that bpf_arch_text_poke() with new_addr == NULL is susceptible to the following race condition: T1 T2 ----------------- ------------------- plt.target = NULL entry: brcl 0xf,plt entry.mask = 0 lgrl %r1,plt.target br %r1 Fix by setting PLT target to the instruction following `brcl 0xf,plt` instead of 0. This way T2 will simply resume the execution of the eBPF program, which is the desired effect of passing new_addr == NULL. Fixes: f1d5df84cd8c ("s390/bpf: Implement bpf_arch_text_poke()") Reported-by: Thomas Richter Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Reviewed-by: Heiko Carstens Link: https://lore.kernel.org/bpf/20230414154755.184502-1-iii@linux.ibm.com --- arch/s390/net/bpf_jit_comp.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index d0846ba818ee..6b1876e4ad3f 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -539,7 +539,7 @@ static void bpf_jit_plt(void *plt, void *ret, void *target) { memcpy(plt, bpf_plt, BPF_PLT_SIZE); *(void **)((char *)plt + (bpf_plt_ret - bpf_plt)) = ret; - *(void **)((char *)plt + (bpf_plt_target - bpf_plt)) = target; + *(void **)((char *)plt + (bpf_plt_target - bpf_plt)) = target ?: ret; } /* @@ -2010,7 +2010,9 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, } __packed insn; char expected_plt[BPF_PLT_SIZE]; char current_plt[BPF_PLT_SIZE]; + char new_plt[BPF_PLT_SIZE]; char *plt; + char *ret; int err; /* Verify the branch to be patched. 
*/ @@ -2032,12 +2034,15 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, err = copy_from_kernel_nofault(current_plt, plt, BPF_PLT_SIZE); if (err < 0) return err; - bpf_jit_plt(expected_plt, (char *)ip + 6, old_addr); + ret = (char *)ip + 6; + bpf_jit_plt(expected_plt, ret, old_addr); if (memcmp(current_plt, expected_plt, BPF_PLT_SIZE)) return -EINVAL; /* Adjust the call address. */ + bpf_jit_plt(new_plt, ret, new_addr); s390_kernel_write(plt + (bpf_plt_target - bpf_plt), - &new_addr, sizeof(void *)); + new_plt + (bpf_plt_target - bpf_plt), + sizeof(void *)); } /* Adjust the mask of the branch. */ From 853618d5886bf94812f31228091cd37d308230f7 Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Fri, 14 Apr 2023 14:08:35 +0800 Subject: [PATCH 05/33] virtio_net: bugfix overflow inside xdp_linearize_page() Here we copy the data from the original buf to the new page. But we not check that it may be overflow. As long as the size received(including vnethdr) is greater than 3840 (PAGE_SIZE -VIRTIO_XDP_HEADROOM). Then the memcpy will overflow. And this is completely possible, as long as the MTU is large, such as 4096. In our test environment, this will cause crash. Since crash is caused by the written memory, it is meaningless, so I do not include it. Fixes: 72979a6c3590 ("virtio_net: xdp, add slowpath case for non contiguous buffers") Signed-off-by: Xuan Zhuo Acked-by: Jason Wang Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 2396c28c0122..ea1bd4bb326d 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -814,8 +814,13 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, int page_off, unsigned int *len) { - struct page *page = alloc_page(GFP_ATOMIC); + int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + struct page *page; + if (page_off + *len + tailroom > PAGE_SIZE) + return NULL; + + page = alloc_page(GFP_ATOMIC); if (!page) return NULL; @@ -823,7 +828,6 @@ static struct page *xdp_linearize_page(struct receive_queue *rq, page_off += *len; while (--*num_buf) { - int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); unsigned int buflen; void *buf; int off; From a80bb8e7233b2ad6ff119646b6e33fb3edcec37b Mon Sep 17 00:00:00 2001 From: Ding Hui Date: Fri, 14 Apr 2023 23:23:06 +0800 Subject: [PATCH 06/33] sfc: Fix use-after-free due to selftest_work There is a use-after-free scenario that is: When the NIC is down, user set mac address or vlan tag to VF, the xxx_set_vf_mac() or xxx_set_vf_vlan() will invoke efx_net_stop() and efx_net_open(), since netif_running() is false, the port will not start and keep port_enabled false, but selftest_work is scheduled in efx_net_open(). 
If we remove the device before selftest_work run, the efx_stop_port() will not be called since the NIC is down, and then efx is freed, we will soon get a UAF in run_timer_softirq() like this: [ 1178.907941] ================================================================== [ 1178.907948] BUG: KASAN: use-after-free in run_timer_softirq+0xdea/0xe90 [ 1178.907950] Write of size 8 at addr ff11001f449cdc80 by task swapper/47/0 [ 1178.907950] [ 1178.907953] CPU: 47 PID: 0 Comm: swapper/47 Kdump: loaded Tainted: G O --------- -t - 4.18.0 #1 [ 1178.907954] Hardware name: SANGFOR X620G40/WI2HG-208T1061A, BIOS SPYH051032-U01 04/01/2022 [ 1178.907955] Call Trace: [ 1178.907956] [ 1178.907960] dump_stack+0x71/0xab [ 1178.907963] print_address_description+0x6b/0x290 [ 1178.907965] ? run_timer_softirq+0xdea/0xe90 [ 1178.907967] kasan_report+0x14a/0x2b0 [ 1178.907968] run_timer_softirq+0xdea/0xe90 [ 1178.907971] ? init_timer_key+0x170/0x170 [ 1178.907973] ? hrtimer_cancel+0x20/0x20 [ 1178.907976] ? sched_clock+0x5/0x10 [ 1178.907978] ? sched_clock_cpu+0x18/0x170 [ 1178.907981] __do_softirq+0x1c8/0x5fa [ 1178.907985] irq_exit+0x213/0x240 [ 1178.907987] smp_apic_timer_interrupt+0xd0/0x330 [ 1178.907989] apic_timer_interrupt+0xf/0x20 [ 1178.907990] [ 1178.907991] RIP: 0010:mwait_idle+0xae/0x370 If the NIC is not actually brought up, there is no need to schedule selftest_work, so let's move invoking efx_selftest_async_start() into efx_start_all(), and it will be canceled by broughting down. Fixes: dd40781e3a4e ("sfc: Run event/IRQ self-test asynchronously when interface is brought up") Fixes: e340be923012 ("sfc: add ndo_set_vf_mac() function for EF10") Debugged-by: Huang Cun Cc: Donglin Peng Suggested-by: Martin Habets Signed-off-by: Ding Hui Signed-off-by: David S. Miller --- drivers/net/ethernet/sfc/efx.c | 1 - drivers/net/ethernet/sfc/efx_common.c | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index 884d8d168862..1eceffa02b55 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -541,7 +541,6 @@ int efx_net_open(struct net_device *net_dev) else efx->state = STATE_NET_UP; - efx_selftest_async_start(efx); return 0; } diff --git a/drivers/net/ethernet/sfc/efx_common.c b/drivers/net/ethernet/sfc/efx_common.c index cc30524c2fe4..361687de308d 100644 --- a/drivers/net/ethernet/sfc/efx_common.c +++ b/drivers/net/ethernet/sfc/efx_common.c @@ -544,6 +544,8 @@ void efx_start_all(struct efx_nic *efx) /* Start the hardware monitor if there is one */ efx_start_monitor(efx); + efx_selftest_async_start(efx); + /* Link state detection is normally event-driven; we have * to poll now because we could have missed a change */ From 338469d677e5d426f5ada88761f16f6d2c7c1981 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Sat, 15 Apr 2023 12:33:09 -0300 Subject: [PATCH 07/33] net/sched: clear actions pointer in miss cookie init fail Palash reports a UAF when using a modified version of syzkaller[1]. When 'tcf_exts_miss_cookie_base_alloc()' fails in 'tcf_exts_init_ex()' a call to 'tcf_exts_destroy()' is made to free up the tcf_exts resources. In flower, a call to '__fl_put()' when 'tcf_exts_init_ex()' fails is made; Then calling 'tcf_exts_destroy()', which triggers an UAF since the already freed tcf_exts action pointer is lingering in the struct. Before the offending patch, this was not an issue since there was no case where the tcf_exts action pointer could linger. 
Therefore, restore the old semantic by clearing the action pointer in case of a failure to initialize the miss_cookie. [1] https://github.com/cmu-pasta/linux-kernel-enriched-corpus v1->v2: Fix compilation on configs without tc actions (kernel test robot) Fixes: 80cd22c35c90 ("net/sched: cls_api: Support hardware miss to tc action") Reported-by: Palash Oswal Acked-by: Jamal Hadi Salim Signed-off-by: Pedro Tammela Signed-off-by: David S. Miller --- net/sched/cls_api.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 2a6b6be0811b..35785a36c802 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -3235,6 +3235,9 @@ int tcf_exts_init_ex(struct tcf_exts *exts, struct net *net, int action, err_miss_alloc: tcf_exts_destroy(exts); +#ifdef CONFIG_NET_CLS_ACT + exts->actions = NULL; +#endif return err; } EXPORT_SYMBOL(tcf_exts_init_ex); From c55c0e91c813589dc55bea6bf9a9fbfaa10ae41d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 17 Apr 2023 10:21:36 +0200 Subject: [PATCH 08/33] netfilter: nf_tables: fix ifdef to also consider nf_tables=m nftables can be built as a module, so fix the preprocessor conditional accordingly. Fixes: 478b360a47b7 ("netfilter: nf_tables: fix nf_trace always-on with XT_TRACE=n") Reported-by: Florian Fainelli Reported-by: Jakub Kicinski Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/skbuff.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 1ec3530c8191..dbcaac8b6966 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4713,7 +4713,7 @@ static inline void nf_reset_ct(struct sk_buff *skb) static inline void nf_reset_trace(struct sk_buff *skb) { -#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || defined(CONFIG_NF_TABLES) +#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES) skb->nf_trace = 0; #endif } @@ -4733,7 +4733,7 @@ static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src, dst->_nfct = src->_nfct; nf_conntrack_get(skb_nfct(src)); #endif -#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || defined(CONFIG_NF_TABLES) +#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES) if (copy) dst->nf_trace = src->nf_trace; #endif From 8485d093b076e59baff424552e8aecfc5bd2d261 Mon Sep 17 00:00:00 2001 From: Aleksandr Loktionov Date: Fri, 24 Mar 2023 18:16:38 +0100 Subject: [PATCH 09/33] i40e: fix accessing vsi->active_filters without holding lock Fix accessing vsi->active_filters without holding the mac_filter_hash_lock. Move vsi->active_filters = 0 inside critical section and move clear_bit(__I40E_VSI_OVERFLOW_PROMISC, vsi->state) after the critical section to ensure the new filters from other threads can be added only after filters cleaning in the critical section is finished. 
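For clarity, the ordering this patch establishes in i40e_add_vsi() is sketched below in condensed form (taken from the diff that follows; the bkt, h, f and f_count locals belong to the surrounding function and their declarations are omitted):

    spin_lock_bh(&vsi->mac_filter_hash_lock);
    vsi->active_filters = 0;        /* reset only while holding the lock */
    /* If macvlan filters already exist, force them to get loaded */
    hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) {
            f->state = I40E_FILTER_NEW;
            f_count++;
    }
    spin_unlock_bh(&vsi->mac_filter_hash_lock);
    /* only now may other threads add filters again */
    clear_bit(__I40E_VSI_OVERFLOW_PROMISC, vsi->state);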
Fixes: 278e7d0b9d68 ("i40e: store MAC/VLAN filters in a hash with the MAC Address as key") Signed-off-by: Aleksandr Loktionov Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 228cd502bb48..6313575a4b6c 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -14133,15 +14133,15 @@ static int i40e_add_vsi(struct i40e_vsi *vsi) vsi->id = ctxt.vsi_number; } - vsi->active_filters = 0; - clear_bit(__I40E_VSI_OVERFLOW_PROMISC, vsi->state); spin_lock_bh(&vsi->mac_filter_hash_lock); + vsi->active_filters = 0; /* If macvlan filters already exist, force them to get loaded */ hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) { f->state = I40E_FILTER_NEW; f_count++; } spin_unlock_bh(&vsi->mac_filter_hash_lock); + clear_bit(__I40E_VSI_OVERFLOW_PROMISC, vsi->state); if (f_count) { vsi->flags |= I40E_VSI_FLAG_FILTER_CHANGED; From c86c00c6935505929cc9adb29ddb85e48c71f828 Mon Sep 17 00:00:00 2001 From: Aleksandr Loktionov Date: Mon, 3 Apr 2023 07:13:18 +0200 Subject: [PATCH 10/33] i40e: fix i40e_setup_misc_vector() error handling Add error handling of i40e_setup_misc_vector() in i40e_rebuild(). In case interrupt vectors setup fails do not re-open vsi-s and do not bring up vf-s, we have no interrupts to serve a traffic anyway. Fixes: 41c445ff0f48 ("i40e: main driver core") Signed-off-by: Aleksandr Loktionov Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 6313575a4b6c..7c30abd0dfc2 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -11059,8 +11059,11 @@ static void i40e_rebuild(struct i40e_pf *pf, bool reinit, bool lock_acquired) pf->hw.aq.asq_last_status)); } /* reinit the misc interrupt */ - if (pf->flags & I40E_FLAG_MSIX_ENABLED) + if (pf->flags & I40E_FLAG_MSIX_ENABLED) { ret = i40e_setup_misc_vector(pf); + if (ret) + goto end_unlock; + } /* Add a filter to drop all Flow control frames from any VSI from being * transmitted. By doing so we stop a malicious VF from sending out From 1a2bd3bd72e978304cdc0a7385e8048e8242225d Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Fri, 14 Apr 2023 09:26:14 -0700 Subject: [PATCH 11/33] ice: document RDMA devlink parameters Commit e523af4ee560 ("net/ice: Add support for enable_iwarp and enable_roce devlink param") added support for the enable_roce and enable_iwarp parameters in the ice driver. It didn't document these parameters in the ice devlink documentation file. Add this documentation, including a note about the mutual exclusion between the two modes. 
Signed-off-by: Jacob Keller Reviewed-by: Leon Romanovsky Acked-by: Tony Nguyen Link: https://lore.kernel.org/r/20230414162614.571861-1-jacob.e.keller@intel.com Signed-off-by: Jakub Kicinski --- Documentation/networking/devlink/ice.rst | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/Documentation/networking/devlink/ice.rst b/Documentation/networking/devlink/ice.rst index 10f282c2117c..2f60e34ab926 100644 --- a/Documentation/networking/devlink/ice.rst +++ b/Documentation/networking/devlink/ice.rst @@ -7,6 +7,21 @@ ice devlink support This document describes the devlink features implemented by the ``ice`` device driver. +Parameters +========== + +.. list-table:: Generic parameters implemented + + * - Name + - Mode + - Notes + * - ``enable_roce`` + - runtime + - mutually exclusive with ``enable_iwarp`` + * - ``enable_iwarp`` + - runtime + - mutually exclusive with ``enable_roce`` + Info versions ============= From d46fc894147cf98dd6e8210aa99ed46854191840 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 17 Apr 2023 12:14:29 +0200 Subject: [PATCH 12/33] netfilter: nf_tables: validate catch-all set elements catch-all set element might jump/goto to chain that uses expressions that require validation. Fixes: aaa31047a6d2 ("netfilter: nftables: add catch-all set element support") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ++ net/netfilter/nf_tables_api.c | 64 ++++++++++++++++++++++++++++--- net/netfilter/nft_lookup.c | 36 ++--------------- 3 files changed, 66 insertions(+), 38 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 9430128aae99..1b8e305bb54a 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1085,6 +1085,10 @@ struct nft_chain { }; int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain); +int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, + const struct nft_set_iter *iter, + struct nft_set_elem *elem); +int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set); enum nft_chain_types { NFT_CHAIN_T_DEFAULT = 0, diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index cd52109e674a..98043e83af71 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3447,6 +3447,64 @@ static int nft_table_validate(struct net *net, const struct nft_table *table) return 0; } +int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, + const struct nft_set_iter *iter, + struct nft_set_elem *elem) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + struct nft_ctx *pctx = (struct nft_ctx *)ctx; + const struct nft_data *data; + int err; + + if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && + *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) + return 0; + + data = nft_set_ext_data(ext); + switch (data->verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + pctx->level++; + err = nft_chain_validate(ctx, data->verdict.chain); + if (err < 0) + return err; + pctx->level--; + break; + default: + break; + } + + return 0; +} + +struct nft_set_elem_catchall { + struct list_head list; + struct rcu_head rcu; + void *elem; +}; + +int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set) +{ + u8 genmask = nft_genmask_next(ctx->net); + struct nft_set_elem_catchall *catchall; + struct nft_set_elem elem; + struct nft_set_ext *ext; + int ret = 0; + + list_for_each_entry_rcu(catchall, &set->catchall_list, 
list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (!nft_set_elem_active(ext, genmask)) + continue; + + elem.priv = catchall->elem; + ret = nft_setelem_validate(ctx, set, NULL, &elem); + if (ret < 0) + return ret; + } + + return ret; +} + static struct nft_rule *nft_rule_lookup_byid(const struct net *net, const struct nft_chain *chain, const struct nlattr *nla); @@ -4759,12 +4817,6 @@ err_set_name: return err; } -struct nft_set_elem_catchall { - struct list_head list; - struct rcu_head rcu; - void *elem; -}; - static void nft_set_catchall_destroy(const struct nft_ctx *ctx, struct nft_set *set) { diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index cae5a6724163..cecf8ab90e58 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -199,37 +199,6 @@ nla_put_failure: return -1; } -static int nft_lookup_validate_setelem(const struct nft_ctx *ctx, - struct nft_set *set, - const struct nft_set_iter *iter, - struct nft_set_elem *elem) -{ - const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); - struct nft_ctx *pctx = (struct nft_ctx *)ctx; - const struct nft_data *data; - int err; - - if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && - *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) - return 0; - - data = nft_set_ext_data(ext); - switch (data->verdict.code) { - case NFT_JUMP: - case NFT_GOTO: - pctx->level++; - err = nft_chain_validate(ctx, data->verdict.chain); - if (err < 0) - return err; - pctx->level--; - break; - default: - break; - } - - return 0; -} - static int nft_lookup_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **d) @@ -245,9 +214,12 @@ static int nft_lookup_validate(const struct nft_ctx *ctx, iter.skip = 0; iter.count = 0; iter.err = 0; - iter.fn = nft_lookup_validate_setelem; + iter.fn = nft_setelem_validate; priv->set->ops->walk(ctx, priv->set, &iter); + if (!iter.err) + iter.err = nft_set_catchall_validate(ctx, priv->set); + if (iter.err < 0) return iter.err; From e50b9b9e8610d47b7c22529443e45a16b1ea3a15 Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Sat, 15 Apr 2023 16:12:27 +0800 Subject: [PATCH 13/33] cxgb4: fix use after free bugs caused by circular dependency problem The flower_stats_timer can schedule flower_stats_work and flower_stats_work can also arm the flower_stats_timer. The process is shown below: ----------- timer schedules work ------------ ch_flower_stats_cb() //timer handler schedule_work(&adap->flower_stats_work); ----------- work arms timer ------------ ch_flower_stats_handler() //workqueue callback function mod_timer(&adap->flower_stats_timer, ...); When the cxgb4 device is detaching, the timer and workqueue could still be rearmed. The process is shown below: (cleanup routine) | (timer and workqueue routine) remove_one() | free_some_resources() | ch_flower_stats_cb() //timer cxgb4_cleanup_tc_flower() | schedule_work() del_timer_sync() | | ch_flower_stats_handler() //workqueue | mod_timer() cancel_work_sync() | kfree(adapter) //FREE | ch_flower_stats_cb() //timer | adap->flower_stats_work //USE This patch changes del_timer_sync() to timer_shutdown_sync(), which could prevent rearming of the timer from the workqueue. 
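As a stand-alone illustration of the pattern (not cxgb4 code: the stats_ctx type and function names are invented for this sketch, and the timer_setup()/INIT_WORK() calls done at init time are omitted), the safe teardown order for a timer and a work item that re-arm each other is:

    #include <linux/jiffies.h>
    #include <linux/timer.h>
    #include <linux/workqueue.h>

    struct stats_ctx {
            struct timer_list timer;
            struct work_struct work;
    };

    static void stats_work_fn(struct work_struct *work)
    {
            struct stats_ctx *ctx = container_of(work, struct stats_ctx, work);

            /* ... fetch stats from hardware ... */
            mod_timer(&ctx->timer, jiffies + HZ);   /* work re-arms the timer */
    }

    static void stats_timer_fn(struct timer_list *t)
    {
            struct stats_ctx *ctx = from_timer(ctx, t, timer);

            schedule_work(&ctx->work);              /* timer schedules the work */
    }

    static void stats_teardown(struct stats_ctx *ctx)
    {
            timer_shutdown_sync(&ctx->timer);       /* later mod_timer() calls are ignored */
            cancel_work_sync(&ctx->work);           /* no new work, no new timer after this */
            /* ctx can now be freed safely */
    }

With plain del_timer_sync(), a still-queued work item can re-arm the timer again before cancel_work_sync() completes, leaving a pending timer behind that later fires on freed memory, which is the scenario in the report above.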
Fixes: e0f911c81e93 ("cxgb4: fetch stats for offloaded tc flower flows") Signed-off-by: Duoming Zhou Link: https://lore.kernel.org/r/20230415081227.7463-1-duoming@zju.edu.cn Signed-off-by: Paolo Abeni --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c index dd9be229819a..d3541159487d 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c @@ -1135,7 +1135,7 @@ void cxgb4_cleanup_tc_flower(struct adapter *adap) return; if (adap->flower_stats_timer.function) - del_timer_sync(&adap->flower_stats_timer); + timer_shutdown_sync(&adap->flower_stats_timer); cancel_work_sync(&adap->flower_stats_work); rhashtable_destroy(&adap->flower_tbl); adap->tc_flower_initialized = false; From d4eb7e39929a3b1ff30fb751b4859fc2410702a0 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 17 Apr 2023 17:50:28 +0200 Subject: [PATCH 14/33] netfilter: nf_tables: tighten netlink attribute requirements for catch-all elements If NFT_SET_ELEM_CATCHALL is set on, then userspace provides no set element key. Otherwise, bail out with -EINVAL. Fixes: aaa31047a6d2 ("netfilter: nftables: add catch-all set element support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 98043e83af71..e48ab8dfb541 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6108,7 +6108,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (err < 0) return err; - if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL)) + if (((flags & NFT_SET_ELEM_CATCHALL) && nla[NFTA_SET_ELEM_KEY]) || + (!(flags & NFT_SET_ELEM_CATCHALL) && !nla[NFTA_SET_ELEM_KEY])) return -EINVAL; if (flags != 0) { From e8b51a1a15d5a3cce231e0669f6a161dc5bb9b75 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sun, 16 Apr 2023 23:58:18 -0700 Subject: [PATCH 15/33] bnxt_en: Do not initialize PTP on older P3/P4 chips The driver does not support PTP on these older chips and it is assuming that firmware on these older chips will not return the PORT_MAC_PTP_QCFG_RESP_FLAGS_HWRM_ACCESS flag in __bnxt_hwrm_ptp_qcfg(), causing the function to abort quietly. But newer firmware now sets this flag and so __bnxt_hwrm_ptp_qcfg() will proceed further. Eventually it will fail in bnxt_ptp_init() -> bnxt_map_ptp_regs() because there is no code to support the older chips. The driver will then complain: "PTP initialization failed.\n" Fix it so that we abort quietly earlier without going through the unnecessary steps and alarming the user with the warning log. 
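Condensed, the guard at the top of __bnxt_hwrm_ptp_qcfg() now reads as below (taken from the diff that follows), so the function bails out before querying or mapping any PTP registers on the older chips:

    if (bp->hwrm_spec_code < 0x10801 || !BNXT_CHIP_P5_THOR(bp)) {
            rc = -ENODEV;
            goto no_ptp;    /* quiet abort, no "PTP initialization failed" warning */
    }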
Fixes: ae5c42f0b92c ("bnxt_en: Get PTP hardware capability from firmware") Signed-off-by: Michael Chan Signed-off-by: Paolo Abeni --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index c23e3b397bcf..ef97a4190b39 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -7627,7 +7627,7 @@ static int __bnxt_hwrm_ptp_qcfg(struct bnxt *bp) u8 flags; int rc; - if (bp->hwrm_spec_code < 0x10801) { + if (bp->hwrm_spec_code < 0x10801 || !BNXT_CHIP_P5_THOR(bp)) { rc = -ENODEV; goto no_ptp; } From 4f4e54b1041e60694117893cd986831153a3e719 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Sun, 16 Apr 2023 23:58:19 -0700 Subject: [PATCH 16/33] bnxt_en: Fix a possible NULL pointer dereference in unload path In the driver unload path, the driver currently checks the valid BNXT_FLAG_ROCE_CAP flag in bnxt_rdma_aux_device_uninit() before proceeding. This is flawed because the flag may not be set initially during driver load. It may be set later after the NVRAM setting is changed followed by a firmware reset. Relying on the BNXT_FLAG_ROCE_CAP flag may crash in bnxt_rdma_aux_device_uninit() if the aux device was never initialized: BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 PGD 8ae6aa067 P4D 0 Oops: 0000 [#1] SMP NOPTI CPU: 39 PID: 42558 Comm: rmmod Kdump: loaded Tainted: G OE --------- - - 4.18.0-348.el8.x86_64 #1 Hardware name: Dell Inc. PowerEdge R750/0WT8Y6, BIOS 1.5.4 12/17/2021 RIP: 0010:device_del+0x1b/0x410 Code: 89 a5 50 03 00 00 4c 89 a5 58 03 00 00 eb 89 0f 1f 44 00 00 41 56 41 55 41 54 4c 8d a7 80 00 00 00 55 53 48 89 fb 48 83 ec 18 <48> 8b 2f 4c 89 e7 65 48 8b 04 25 28 00 00 00 48 89 44 24 10 31 c0 RSP: 0018:ff7f82bf469a7dc8 EFLAGS: 00010292 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000206 RDI: 0000000000000000 RBP: ff31b7cd114b0ac0 R08: 0000000000000000 R09: ffffffff935c3400 R10: ff31b7cd45bc3440 R11: 0000000000000001 R12: 0000000000000080 R13: ffffffffc1069f40 R14: 0000000000000000 R15: 0000000000000000 FS: 00007fc9903ce740(0000) GS:ff31b7d4ffac0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 0000000992fee004 CR4: 0000000000773ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: bnxt_rdma_aux_device_uninit+0x1f/0x30 [bnxt_en] bnxt_remove_one+0x2f/0x1f0 [bnxt_en] pci_device_remove+0x3b/0xc0 device_release_driver_internal+0x103/0x1f0 driver_detach+0x54/0x88 bus_remove_driver+0x77/0xc9 pci_unregister_driver+0x2d/0xb0 bnxt_exit+0x16/0x2c [bnxt_en] __x64_sys_delete_module+0x139/0x280 do_syscall_64+0x5b/0x1a0 entry_SYSCALL_64_after_hwframe+0x65/0xca RIP: 0033:0x7fc98f3af71b Fix this by modifying the check inside bnxt_rdma_aux_device_uninit() to check for bp->aux_priv instead. We also need to make some changes in bnxt_rdma_aux_device_init() to make sure that bp->aux_priv is set only when the aux device is fully initialized. 
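The shape of the fix is the usual publish-only-after-full-init pattern, sketched here in condensed form (init_aux_steps() is a placeholder for the ida_alloc() and auxiliary_device_init() sequence in the diff below, including their individual unwind paths):

    /* init: build into a local, publish bp->aux_priv only on success */
    aux_priv = kzalloc(sizeof(*aux_priv), GFP_KERNEL);
    if (!aux_priv)
            return;
    if (init_aux_steps(bp, aux_priv)) {     /* placeholder, see diff */
            kfree(aux_priv);                /* nothing published yet */
            return;
    }
    bp->aux_priv = aux_priv;

    /* uninit: key off the published pointer, not BNXT_FLAG_ROCE_CAP */
    if (!bp->aux_priv)
            return;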
Fixes: d80d88b0dfff ("bnxt_en: Add auxiliary driver support") Reviewed-by: Ajit Khaparde Signed-off-by: Kalesh AP Signed-off-by: Michael Chan Signed-off-by: Paolo Abeni --- drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c index e7b5e28ee29f..852eb449ccae 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c @@ -304,7 +304,7 @@ void bnxt_rdma_aux_device_uninit(struct bnxt *bp) struct auxiliary_device *adev; /* Skip if no auxiliary device init was done. */ - if (!(bp->flags & BNXT_FLAG_ROCE_CAP)) + if (!bp->aux_priv) return; aux_priv = bp->aux_priv; @@ -324,6 +324,7 @@ static void bnxt_aux_dev_release(struct device *dev) bp->edev = NULL; kfree(aux_priv->edev); kfree(aux_priv); + bp->aux_priv = NULL; } static void bnxt_set_edev_info(struct bnxt_en_dev *edev, struct bnxt *bp) @@ -359,19 +360,18 @@ void bnxt_rdma_aux_device_init(struct bnxt *bp) if (!(bp->flags & BNXT_FLAG_ROCE_CAP)) return; - bp->aux_priv = kzalloc(sizeof(*bp->aux_priv), GFP_KERNEL); - if (!bp->aux_priv) + aux_priv = kzalloc(sizeof(*bp->aux_priv), GFP_KERNEL); + if (!aux_priv) goto exit; - bp->aux_priv->id = ida_alloc(&bnxt_aux_dev_ids, GFP_KERNEL); - if (bp->aux_priv->id < 0) { + aux_priv->id = ida_alloc(&bnxt_aux_dev_ids, GFP_KERNEL); + if (aux_priv->id < 0) { netdev_warn(bp->dev, "ida alloc failed for ROCE auxiliary device\n"); - kfree(bp->aux_priv); + kfree(aux_priv); goto exit; } - aux_priv = bp->aux_priv; aux_dev = &aux_priv->aux_dev; aux_dev->id = aux_priv->id; aux_dev->name = "rdma"; @@ -380,10 +380,11 @@ void bnxt_rdma_aux_device_init(struct bnxt *bp) rc = auxiliary_device_init(aux_dev); if (rc) { - ida_free(&bnxt_aux_dev_ids, bp->aux_priv->id); - kfree(bp->aux_priv); + ida_free(&bnxt_aux_dev_ids, aux_priv->id); + kfree(aux_priv); goto exit; } + bp->aux_priv = aux_priv; /* From this point, all cleanup will happen via the .release callback & * any error unwinding will need to include a call to From c0e73276f0fcbbd3d4736ba975d7dc7a48791b0c Mon Sep 17 00:00:00 2001 From: Nikita Zhandarovich Date: Mon, 17 Apr 2023 05:07:18 -0700 Subject: [PATCH 17/33] mlxfw: fix null-ptr-deref in mlxfw_mfa2_tlv_next() Function mlxfw_mfa2_tlv_multi_get() returns NULL if 'tlv' in question does not pass checks in mlxfw_mfa2_tlv_payload_get(). This behaviour may lead to NULL pointer dereference in 'multi->total_len'. Fix this issue by testing mlxfw_mfa2_tlv_multi_get()'s return value against NULL. Found by Linux Verification Center (linuxtesting.org) with static analysis tool SVACE. 
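The fix itself is the standard NULL check on a getter that can fail, condensed from the diff below:

    multi = mlxfw_mfa2_tlv_multi_get(mfa2_file, tlv);
    if (!multi)
            return NULL;    /* malformed TLV: stop walking instead of dereferencing */
    tlv_len = NLA_ALIGN(tlv_len + be16_to_cpu(multi->total_len));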
Fixes: 410ed13cae39 ("Add the mlxfw module for Mellanox firmware flash process") Co-developed-by: Natalia Petrova Signed-off-by: Nikita Zhandarovich Reviewed-by: Ido Schimmel Link: https://lore.kernel.org/r/20230417120718.52325-1-n.zhandarovich@fintech.ru Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.c b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.c index 017d68f1e123..972c571b4158 100644 --- a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.c +++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.c @@ -31,6 +31,8 @@ mlxfw_mfa2_tlv_next(const struct mlxfw_mfa2_file *mfa2_file, if (tlv->type == MLXFW_MFA2_TLV_MULTI_PART) { multi = mlxfw_mfa2_tlv_multi_get(mfa2_file, tlv); + if (!multi) + return NULL; tlv_len = NLA_ALIGN(tlv_len + be16_to_cpu(multi->total_len)); } From 8267fc71abb2dc47338570e56dd3473a58313fce Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Mon, 17 Apr 2023 23:53:22 +0200 Subject: [PATCH 18/33] veth: take into account peer device for NETDEV_XDP_ACT_NDO_XMIT xdp_features flag For veth pairs, NETDEV_XDP_ACT_NDO_XMIT is supported by the current device if the peer one is running a XDP program or if it has GRO enabled. Fix the xdp_features flags reporting considering peer device and not current one for NETDEV_XDP_ACT_NDO_XMIT. Fixes: fccca038f300 ("veth: take into account device reconfiguration for xdp_features flag") Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/4f1ca6f6f6b42ae125bfdb5c7782217c83968b2e.1681767806.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- drivers/net/veth.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index e1b38fbf1dd9..4b3c6647edc6 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -1262,11 +1262,12 @@ static void veth_set_xdp_features(struct net_device *dev) peer = rtnl_dereference(priv->peer); if (peer && peer->real_num_tx_queues <= dev->real_num_rx_queues) { + struct veth_priv *priv_peer = netdev_priv(peer); xdp_features_t val = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | NETDEV_XDP_ACT_RX_SG; - if (priv->_xdp_prog || veth_gro_requested(dev)) + if (priv_peer->_xdp_prog || veth_gro_requested(peer)) val |= NETDEV_XDP_ACT_NDO_XMIT | NETDEV_XDP_ACT_NDO_XMIT_SG; xdp_set_features_flag(dev, val); @@ -1504,19 +1505,23 @@ static int veth_set_features(struct net_device *dev, { netdev_features_t changed = features ^ dev->features; struct veth_priv *priv = netdev_priv(dev); + struct net_device *peer; int err; if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog) return 0; + peer = rtnl_dereference(priv->peer); if (features & NETIF_F_GRO) { err = veth_napi_enable(dev); if (err) return err; - xdp_features_set_redirect_target(dev, true); + if (peer) + xdp_features_set_redirect_target(peer, true); } else { - xdp_features_clear_redirect_target(dev); + if (peer) + xdp_features_clear_redirect_target(peer); veth_napi_del(dev); } return 0; @@ -1598,13 +1603,13 @@ static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, peer->max_mtu = max_mtu; } - xdp_features_set_redirect_target(dev, true); + xdp_features_set_redirect_target(peer, true); } if (old_prog) { if (!prog) { - if (!veth_gro_requested(dev)) - xdp_features_clear_redirect_target(dev); + if (peer && !veth_gro_requested(dev)) + xdp_features_clear_redirect_target(peer); if 
(dev->flags & IFF_UP) veth_disable_xdp(dev); From c484fcc058bada604d7e4e5228d4affb646ddbc2 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 17 Apr 2023 09:12:16 +0300 Subject: [PATCH 19/33] bonding: Fix memory leak when changing bond type to Ethernet When a net device is put administratively up, its 'IFF_UP' flag is set (if not set already) and a 'NETDEV_UP' notification is emitted, which causes the 8021q driver to add VLAN ID 0 on the device. The reverse happens when a net device is put administratively down. When changing the type of a bond to Ethernet, its 'IFF_UP' flag is incorrectly cleared, resulting in the kernel skipping the above process and VLAN ID 0 being leaked [1]. Fix by restoring the flag when changing the type to Ethernet, in a similar fashion to the restoration of the 'IFF_SLAVE' flag. The issue can be reproduced using the script in [2], with example out before and after the fix in [3]. [1] unreferenced object 0xffff888103479900 (size 256): comm "ip", pid 329, jiffies 4294775225 (age 28.561s) hex dump (first 32 bytes): 00 a0 0c 15 81 88 ff ff 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kmalloc_trace+0x2a/0xe0 [] vlan_vid_add+0x30c/0x790 [] vlan_device_event+0x1491/0x21a0 [] notifier_call_chain+0xbe/0x1f0 [] call_netdevice_notifiers_info+0xba/0x150 [] __dev_notify_flags+0x132/0x2e0 [] dev_change_flags+0x11f/0x180 [] do_setlink+0xb96/0x4060 [] __rtnl_newlink+0xc0a/0x18a0 [] rtnl_newlink+0x6c/0xa0 [] rtnetlink_rcv_msg+0x43e/0xe00 [] netlink_rcv_skb+0x170/0x440 [] netlink_unicast+0x53f/0x810 [] netlink_sendmsg+0x96b/0xe90 [] ____sys_sendmsg+0x30f/0xa70 [] ___sys_sendmsg+0x13a/0x1e0 unreferenced object 0xffff88810f6a83e0 (size 32): comm "ip", pid 329, jiffies 4294775225 (age 28.561s) hex dump (first 32 bytes): a0 99 47 03 81 88 ff ff a0 99 47 03 81 88 ff ff ..G.......G..... 81 00 00 00 01 00 00 00 cc cc cc cc cc cc cc cc ................ 
backtrace: [] kmalloc_trace+0x2a/0xe0 [] vlan_vid_add+0x409/0x790 [] vlan_device_event+0x1491/0x21a0 [] notifier_call_chain+0xbe/0x1f0 [] call_netdevice_notifiers_info+0xba/0x150 [] __dev_notify_flags+0x132/0x2e0 [] dev_change_flags+0x11f/0x180 [] do_setlink+0xb96/0x4060 [] __rtnl_newlink+0xc0a/0x18a0 [] rtnl_newlink+0x6c/0xa0 [] rtnetlink_rcv_msg+0x43e/0xe00 [] netlink_rcv_skb+0x170/0x440 [] netlink_unicast+0x53f/0x810 [] netlink_sendmsg+0x96b/0xe90 [] ____sys_sendmsg+0x30f/0xa70 [] ___sys_sendmsg+0x13a/0x1e0 [2] ip link add name t-nlmon type nlmon ip link add name t-dummy type dummy ip link add name t-bond type bond mode active-backup ip link set dev t-bond up ip link set dev t-nlmon master t-bond ip link set dev t-nlmon nomaster ip link show dev t-bond ip link set dev t-dummy master t-bond ip link show dev t-bond ip link del dev t-bond ip link del dev t-dummy ip link del dev t-nlmon [3] Before: 12: t-bond: mtu 1500 qdisc noqueue state DOWN mode DEFAULT group default qlen 1000 link/netlink 12: t-bond: mtu 1500 qdisc noqueue state UP mode DEFAULT group default qlen 1000 link/ether 46:57:39:a4:46:a2 brd ff:ff:ff:ff:ff:ff After: 12: t-bond: mtu 1500 qdisc noqueue state DOWN mode DEFAULT group default qlen 1000 link/netlink 12: t-bond: mtu 1500 qdisc noqueue state UP mode DEFAULT group default qlen 1000 link/ether 66:48:7b:74:b6:8a brd ff:ff:ff:ff:ff:ff Fixes: e36b9d16c6a6 ("bonding: clean muticast addresses when device changes type") Fixes: 75c78500ddad ("bonding: remap muticast addresses without using dev_close() and dev_open()") Fixes: 9ec7eb60dcbc ("bonding: restore IFF_MASTER/SLAVE flags on bond enslave ether type change") Reported-by: Mirsad Goran Todorovac Link: https://lore.kernel.org/netdev/78a8a03b-6070-3e6b-5042-f848dab16fb8@alu.unizg.hr/ Tested-by: Mirsad Goran Todorovac Signed-off-by: Ido Schimmel Acked-by: Jay Vosburgh Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 8cc9a74789b7..7a7d584f378a 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1777,14 +1777,15 @@ void bond_lower_state_changed(struct slave *slave) /* The bonding driver uses ether_setup() to convert a master bond device * to ARPHRD_ETHER, that resets the target netdevice's flags so we always - * have to restore the IFF_MASTER flag, and only restore IFF_SLAVE if it was set + * have to restore the IFF_MASTER flag, and only restore IFF_SLAVE and IFF_UP + * if they were set */ static void bond_ether_setup(struct net_device *bond_dev) { - unsigned int slave_flag = bond_dev->flags & IFF_SLAVE; + unsigned int flags = bond_dev->flags & (IFF_SLAVE | IFF_UP); ether_setup(bond_dev); - bond_dev->flags |= IFF_MASTER | slave_flag; + bond_dev->flags |= IFF_MASTER | flags; bond_dev->priv_flags &= ~IFF_TX_SKB_SHARING; } From 6f4833383e8514ea796d094e05c24889b8997fde Mon Sep 17 00:00:00 2001 From: Seiji Nishikawa Date: Mon, 17 Apr 2023 21:21:27 +0900 Subject: [PATCH 20/33] net: vmxnet3: Fix NULL pointer dereference in vmxnet3_rq_rx_complete() When vmxnet3_rq_create() fails to allocate rq->data_ring.base due to page allocation failure, subsequent call to vmxnet3_rq_rx_complete() can result in NULL pointer dereference. To fix this bug, check not only that rxDataRingUsed is true but also that adapter->rxdataring_enabled is true before calling memcpy() in vmxnet3_rq_rx_complete(). 
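Condensed, the guard in vmxnet3_rq_rx_complete() becomes the following (from the one-line diff further below). The adapter-level flag is needed because the data ring can be disabled at runtime after an allocation failure, while the per-descriptor information still claims it is in use:

    if (rxDataRingUsed && adapter->rxdataring_enabled) {
            BUG_ON(rcd->len > rq->data_ring.desc_size);
            /* ... memcpy() from rq->data_ring.base, which is only
             * guaranteed to be non-NULL when rxdataring_enabled is set ...
             */
    }

The allocation failure that disables the data ring and the resulting NULL dereference look like this: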
[1728352.477993] ethtool: page allocation failure: order:9, mode:0x6000c0(GFP_KERNEL), nodemask=(null),cpuset=/,mems_allowed=0 ... [1728352.478009] Call Trace: [1728352.478028] dump_stack+0x41/0x60 [1728352.478035] warn_alloc.cold.120+0x7b/0x11b [1728352.478038] ? _cond_resched+0x15/0x30 [1728352.478042] ? __alloc_pages_direct_compact+0x15f/0x170 [1728352.478043] __alloc_pages_slowpath+0xcd3/0xd10 [1728352.478047] __alloc_pages_nodemask+0x2e2/0x320 [1728352.478049] __dma_direct_alloc_pages.constprop.25+0x8a/0x120 [1728352.478053] dma_direct_alloc+0x5a/0x2a0 [1728352.478056] vmxnet3_rq_create.part.57+0x17c/0x1f0 [vmxnet3] ... [1728352.478188] vmxnet3 0000:0b:00.0 ens192: rx data ring will be disabled ... [1728352.515347] BUG: unable to handle kernel NULL pointer dereference at 0000000000000034 ... [1728352.515440] RIP: 0010:memcpy_orig+0x54/0x130 ... [1728352.515655] Call Trace: [1728352.515665] [1728352.515672] vmxnet3_rq_rx_complete+0x419/0xef0 [vmxnet3] [1728352.515690] vmxnet3_poll_rx_only+0x31/0xa0 [vmxnet3] ... Signed-off-by: Seiji Nishikawa Signed-off-by: David S. Miller --- drivers/net/vmxnet3/vmxnet3_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c index da488cbb0542..f2b76ee866a4 100644 --- a/drivers/net/vmxnet3/vmxnet3_drv.c +++ b/drivers/net/vmxnet3/vmxnet3_drv.c @@ -1504,7 +1504,7 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq, goto rcd_done; } - if (rxDataRingUsed) { + if (rxDataRingUsed && adapter->rxdataring_enabled) { size_t sz; BUG_ON(rcd->len > rq->data_ring.desc_size); From 4e006c7a6dac0ead4c1bf606000aa90a372fc253 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 17 Apr 2023 09:00:52 -0400 Subject: [PATCH 21/33] net: rpl: fix rpl header size calculation This patch fixes a missing 8 byte for the header size calculation. The ipv6_rpl_srh_size() is used to check a skb_pull() on skb->data which points to skb_transport_header(). Currently we only check on the calculated addresses fields using CmprI and CmprE fields, see: https://www.rfc-editor.org/rfc/rfc6554#section-3 there is however a missing 8 byte inside the calculation which stands for the fields before the addresses field. Those 8 bytes are represented by sizeof(struct ipv6_rpl_sr_hdr) expression. Fixes: 8610c7c6e3bd ("net: ipv6: add support for rpl sr exthdr") Signed-off-by: Alexander Aring Reported-by: maxpl0it Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/rpl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv6/rpl.c b/net/ipv6/rpl.c index 488aec9e1a74..d1876f192225 100644 --- a/net/ipv6/rpl.c +++ b/net/ipv6/rpl.c @@ -32,7 +32,8 @@ static void *ipv6_rpl_segdata_pos(const struct ipv6_rpl_sr_hdr *hdr, int i) size_t ipv6_rpl_srh_size(unsigned char n, unsigned char cmpri, unsigned char cmpre) { - return (n * IPV6_PFXTAIL_LEN(cmpri)) + IPV6_PFXTAIL_LEN(cmpre); + return sizeof(struct ipv6_rpl_sr_hdr) + (n * IPV6_PFXTAIL_LEN(cmpri)) + + IPV6_PFXTAIL_LEN(cmpre); } void ipv6_rpl_srh_decompress(struct ipv6_rpl_sr_hdr *outhdr, From 2a6a870e44dd88f1a6a2893c65ef756a9edfb4c7 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 17 Apr 2023 16:00:40 +0200 Subject: [PATCH 22/33] mptcp: stops worker on unaccepted sockets at listener close This is a partial revert of the blamed commit, with a relevant change: mptcp_subflow_queue_clean() now just change the msk socket status and stop the worker, so that the UaF issue addressed by the blamed commit is not re-introduced. 
The above prevents the mptcp worker from running concurrently with inet_csk_listen_stop(), as such race would trigger a warning, as reported by Christoph: RSP: 002b:00007f784fe09cd8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e WARNING: CPU: 0 PID: 25807 at net/ipv4/inet_connection_sock.c:1387 inet_csk_listen_stop+0x664/0x870 net/ipv4/inet_connection_sock.c:1387 RAX: ffffffffffffffda RBX: 00000000006bc050 RCX: 00007f7850afd6a9 RDX: 0000000000000000 RSI: 0000000020000340 RDI: 0000000000000004 Modules linked in: RBP: 0000000000000002 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000006bc05c R13: fffffffffffffea8 R14: 00000000006bc050 R15: 000000000001fe40 CPU: 0 PID: 25807 Comm: syz-executor.7 Not tainted 6.2.0-g778e54711659 #7 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014 RIP: 0010:inet_csk_listen_stop+0x664/0x870 net/ipv4/inet_connection_sock.c:1387 RAX: 0000000000000000 RBX: ffff888100dfbd40 RCX: 0000000000000000 RDX: ffff8881363aab80 RSI: ffffffff81c494f4 RDI: 0000000000000005 RBP: ffff888126dad080 R08: 0000000000000005 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000000 R12: ffff888100dfe040 R13: 0000000000000001 R14: 0000000000000000 R15: ffff888100dfbdd8 FS: 00007f7850a2c800(0000) GS:ffff88813bc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b32d26000 CR3: 000000012fdd8006 CR4: 0000000000770ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: __tcp_close+0x5b2/0x620 net/ipv4/tcp.c:2875 __mptcp_close_ssk+0x145/0x3d0 net/mptcp/protocol.c:2427 mptcp_destroy_common+0x8a/0x1c0 net/mptcp/protocol.c:3277 mptcp_destroy+0x41/0x60 net/mptcp/protocol.c:3304 __mptcp_destroy_sock+0x56/0x140 net/mptcp/protocol.c:2965 __mptcp_close+0x38f/0x4a0 net/mptcp/protocol.c:3057 mptcp_close+0x24/0xe0 net/mptcp/protocol.c:3072 inet_release+0x53/0xa0 net/ipv4/af_inet.c:429 __sock_release+0x4e/0xf0 net/socket.c:651 sock_close+0x15/0x20 net/socket.c:1393 __fput+0xff/0x420 fs/file_table.c:321 task_work_run+0x8b/0xe0 kernel/task_work.c:179 resume_user_mode_work include/linux/resume_user_mode.h:49 [inline] exit_to_user_mode_loop kernel/entry/common.c:171 [inline] exit_to_user_mode_prepare+0x113/0x120 kernel/entry/common.c:203 __syscall_exit_to_user_mode_work kernel/entry/common.c:285 [inline] syscall_exit_to_user_mode+0x1d/0x40 kernel/entry/common.c:296 do_syscall_64+0x46/0x90 arch/x86/entry/common.c:86 entry_SYSCALL_64_after_hwframe+0x72/0xdc RIP: 0033:0x7f7850af70dc RAX: 0000000000000000 RBX: 0000000000000004 RCX: 00007f7850af70dc RDX: 00007f7850a2c800 RSI: 0000000000000002 RDI: 0000000000000003 RBP: 00000000006bd980 R08: 0000000000000000 R09: 00000000000018a0 R10: 00000000316338a4 R11: 0000000000000293 R12: 0000000000211e31 R13: 00000000006bc05c R14: 00007f785062c000 R15: 0000000000211af0 Fixes: 0a3f4f1f9c27 ("mptcp: fix UaF in listener shutdown") Cc: stable@vger.kernel.org Reported-by: Christoph Paasch Link: https://github.com/multipath-tcp/mptcp_net-next/issues/371 Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: David S. 
Miller --- net/mptcp/protocol.c | 6 +++- net/mptcp/protocol.h | 1 + net/mptcp/subflow.c | 72 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 78 insertions(+), 1 deletion(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 06c5872e3b00..5181fb91595b 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2365,8 +2365,12 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, mptcp_subflow_drop_ctx(ssk); } else { /* otherwise tcp will dispose of the ssk and subflow ctx */ - if (ssk->sk_state == TCP_LISTEN) + if (ssk->sk_state == TCP_LISTEN) { + tcp_set_state(ssk, TCP_CLOSE); + mptcp_subflow_queue_clean(sk, ssk); + inet_csk_listen_stop(ssk); mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED); + } __tcp_close(ssk, 0); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 339a6f072989..3a2db1b862dd 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -629,6 +629,7 @@ void mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow); void __mptcp_subflow_send_ack(struct sock *ssk); void mptcp_subflow_reset(struct sock *ssk); +void mptcp_subflow_queue_clean(struct sock *sk, struct sock *ssk); void mptcp_sock_graft(struct sock *sk, struct socket *parent); struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk); bool __mptcp_close(struct sock *sk, long timeout); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index d34588850545..bf5e5c72b5ee 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1819,6 +1819,78 @@ static void subflow_state_change(struct sock *sk) } } +void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk) +{ + struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue; + struct mptcp_sock *msk, *next, *head = NULL; + struct request_sock *req; + + /* build a list of all unaccepted mptcp sockets */ + spin_lock_bh(&queue->rskq_lock); + for (req = queue->rskq_accept_head; req; req = req->dl_next) { + struct mptcp_subflow_context *subflow; + struct sock *ssk = req->sk; + + if (!sk_is_mptcp(ssk)) + continue; + + subflow = mptcp_subflow_ctx(ssk); + if (!subflow || !subflow->conn) + continue; + + /* skip if already in list */ + msk = mptcp_sk(subflow->conn); + if (msk->dl_next || msk == head) + continue; + + sock_hold(subflow->conn); + msk->dl_next = head; + head = msk; + } + spin_unlock_bh(&queue->rskq_lock); + if (!head) + return; + + /* can't acquire the msk socket lock under the subflow one, + * or will cause ABBA deadlock + */ + release_sock(listener_ssk); + + for (msk = head; msk; msk = next) { + struct sock *sk = (struct sock *)msk; + + lock_sock_nested(sk, SINGLE_DEPTH_NESTING); + next = msk->dl_next; + msk->dl_next = NULL; + + /* prevent the stack from later re-schedule the worker for + * this socket + */ + inet_sk_state_store(sk, TCP_CLOSE); + release_sock(sk); + + /* lockdep will report a false positive ABBA deadlock + * between cancel_work_sync and the listener socket. + * The involved locks belong to different sockets WRT + * the existing AB chain. + * Using a per socket key is problematic as key + * deregistration requires process context and must be + * performed at socket disposal time, in atomic + * context. + * Just tell lockdep to consider the listener socket + * released here. 
+ */ + mutex_release(&listener_sk->sk_lock.dep_map, _RET_IP_); + mptcp_cancel_work(sk); + mutex_acquire(&listener_sk->sk_lock.dep_map, 0, 0, _RET_IP_); + + sock_put(sk); + } + + /* we are still under the listener msk socket lock */ + lock_sock_nested(listener_ssk, SINGLE_DEPTH_NESTING); +} + static int subflow_ulp_init(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); From 63740448a32eb662e05894425b47bcc5814136f4 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 17 Apr 2023 16:00:41 +0200 Subject: [PATCH 23/33] mptcp: fix accept vs worker race The mptcp worker and mptcp_accept() can race, as reported by Christoph: refcount_t: addition on 0; use-after-free. WARNING: CPU: 1 PID: 14351 at lib/refcount.c:25 refcount_warn_saturate+0x105/0x1b0 lib/refcount.c:25 Modules linked in: CPU: 1 PID: 14351 Comm: syz-executor.2 Not tainted 6.3.0-rc1-gde5e8fd0123c #11 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014 RIP: 0010:refcount_warn_saturate+0x105/0x1b0 lib/refcount.c:25 Code: 02 31 ff 89 de e8 1b f0 a7 ff 84 db 0f 85 6e ff ff ff e8 3e f5 a7 ff 48 c7 c7 d8 c7 34 83 c6 05 6d 2d 0f 02 01 e8 cb 3d 90 ff <0f> 0b e9 4f ff ff ff e8 1f f5 a7 ff 0f b6 1d 54 2d 0f 02 31 ff 89 RSP: 0018:ffffc90000a47bf8 EFLAGS: 00010282 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: ffff88802eae98c0 RSI: ffffffff81097d4f RDI: 0000000000000001 RBP: ffff88802e712180 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000001 R11: ffff88802eaea148 R12: ffff88802e712100 R13: ffff88802e712a88 R14: ffff888005cb93a8 R15: ffff88802e712a88 FS: 0000000000000000(0000) GS:ffff88803ed00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f277fd89120 CR3: 0000000035486002 CR4: 0000000000370ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __refcount_add include/linux/refcount.h:199 [inline] __refcount_inc include/linux/refcount.h:250 [inline] refcount_inc include/linux/refcount.h:267 [inline] sock_hold include/net/sock.h:775 [inline] __mptcp_close+0x4c6/0x4d0 net/mptcp/protocol.c:3051 mptcp_close+0x24/0xe0 net/mptcp/protocol.c:3072 inet_release+0x56/0xa0 net/ipv4/af_inet.c:429 __sock_release+0x51/0xf0 net/socket.c:653 sock_close+0x18/0x20 net/socket.c:1395 __fput+0x113/0x430 fs/file_table.c:321 task_work_run+0x96/0x100 kernel/task_work.c:179 exit_task_work include/linux/task_work.h:38 [inline] do_exit+0x4fc/0x10c0 kernel/exit.c:869 do_group_exit+0x51/0xf0 kernel/exit.c:1019 get_signal+0x12b0/0x1390 kernel/signal.c:2859 arch_do_signal_or_restart+0x25/0x260 arch/x86/kernel/signal.c:306 exit_to_user_mode_loop kernel/entry/common.c:168 [inline] exit_to_user_mode_prepare+0x131/0x1a0 kernel/entry/common.c:203 __syscall_exit_to_user_mode_work kernel/entry/common.c:285 [inline] syscall_exit_to_user_mode+0x19/0x40 kernel/entry/common.c:296 do_syscall_64+0x46/0x90 arch/x86/entry/common.c:86 entry_SYSCALL_64_after_hwframe+0x72/0xdc RIP: 0033:0x7fec4b4926a9 Code: Unable to access opcode bytes at 0x7fec4b49267f. 
RSP: 002b:00007fec49f9dd78 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca RAX: fffffffffffffe00 RBX: 00000000006bc058 RCX: 00007fec4b4926a9 RDX: 0000000000000000 RSI: 0000000000000080 RDI: 00000000006bc058 RBP: 00000000006bc050 R08: 00000000007df998 R09: 00000000007df998 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000006bc05c R13: fffffffffffffea8 R14: 000000000000000b R15: 000000000001fe40 The root cause is that the worker can force fallback to TCP the first mptcp subflow, actually deleting the unaccepted msk socket. We can explicitly prevent the race delaying the unaccepted msk deletion at listener shutdown time. In case the closed subflow is later accepted, just drop the mptcp context and let the user-space deal with the paired mptcp socket. Fixes: b6985b9b8295 ("mptcp: use the workqueue to destroy unaccepted sockets") Cc: stable@vger.kernel.org Reported-by: Christoph Paasch Link: https://github.com/multipath-tcp/mptcp_net-next/issues/375 Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Tested-by: Christoph Paasch Signed-off-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 68 +++++++++++++++++++++++++++++--------------- net/mptcp/protocol.h | 1 + net/mptcp/subflow.c | 22 +++++++------- 3 files changed, 58 insertions(+), 33 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 5181fb91595b..b998e9df53ce 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2315,7 +2315,26 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, unsigned int flags) { struct mptcp_sock *msk = mptcp_sk(sk); - bool need_push, dispose_it; + bool dispose_it, need_push = false; + + /* If the first subflow moved to a close state before accept, e.g. due + * to an incoming reset, mptcp either: + * - if either the subflow or the msk are dead, destroy the context + * (the subflow socket is deleted by inet_child_forget) and the msk + * - otherwise do nothing at the moment and take action at accept and/or + * listener shutdown - user-space must be able to accept() the closed + * socket. + */ + if (msk->in_accept_queue && msk->first == ssk) { + if (!sock_flag(sk, SOCK_DEAD) && !sock_flag(ssk, SOCK_DEAD)) + return; + + /* ensure later check in mptcp_worker() will dispose the msk */ + sock_set_flag(sk, SOCK_DEAD); + lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); + mptcp_subflow_drop_ctx(ssk); + goto out_release; + } dispose_it = !msk->subflow || ssk != msk->subflow->sk; if (dispose_it) @@ -2351,18 +2370,6 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, if (!inet_csk(ssk)->icsk_ulp_ops) { WARN_ON_ONCE(!sock_flag(ssk, SOCK_DEAD)); kfree_rcu(subflow, rcu); - } else if (msk->in_accept_queue && msk->first == ssk) { - /* if the first subflow moved to a close state, e.g. due to - * incoming reset and we reach here before inet_child_forget() - * the TCP stack could later try to close it via - * inet_csk_listen_stop(), or deliver it to the user space via - * accept(). - * We can't delete the subflow - or risk a double free - nor let - * the msk survive - or will be leaked in the non accept scenario: - * fallback and let TCP cope with the subflow cleanup. 
- */ - WARN_ON_ONCE(sock_flag(ssk, SOCK_DEAD)); - mptcp_subflow_drop_ctx(ssk); } else { /* otherwise tcp will dispose of the ssk and subflow ctx */ if (ssk->sk_state == TCP_LISTEN) { @@ -2377,6 +2384,8 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, /* close acquired an extra ref */ __sock_put(ssk); } + +out_release: release_sock(ssk); sock_put(ssk); @@ -2431,21 +2440,14 @@ static void __mptcp_close_subflow(struct sock *sk) mptcp_close_ssk(sk, ssk, subflow); } - /* if the MPC subflow has been closed before the msk is accepted, - * msk will never be accept-ed, close it now - */ - if (!msk->first && msk->in_accept_queue) { - sock_set_flag(sk, SOCK_DEAD); - inet_sk_state_store(sk, TCP_CLOSE); - } } -static bool mptcp_check_close_timeout(const struct sock *sk) +static bool mptcp_should_close(const struct sock *sk) { s32 delta = tcp_jiffies32 - inet_csk(sk)->icsk_mtup.probe_timestamp; struct mptcp_subflow_context *subflow; - if (delta >= TCP_TIMEWAIT_LEN) + if (delta >= TCP_TIMEWAIT_LEN || mptcp_sk(sk)->in_accept_queue) return true; /* if all subflows are in closed status don't bother with additional @@ -2653,7 +2655,7 @@ static void mptcp_worker(struct work_struct *work) * even if it is orphaned and in FIN_WAIT2 state */ if (sock_flag(sk, SOCK_DEAD)) { - if (mptcp_check_close_timeout(sk)) { + if (mptcp_should_close(sk)) { inet_sk_state_store(sk, TCP_CLOSE); mptcp_do_fastclose(sk); } @@ -2899,6 +2901,14 @@ static void __mptcp_destroy_sock(struct sock *sk) sock_put(sk); } +void __mptcp_unaccepted_force_close(struct sock *sk) +{ + sock_set_flag(sk, SOCK_DEAD); + inet_sk_state_store(sk, TCP_CLOSE); + mptcp_do_fastclose(sk); + __mptcp_destroy_sock(sk); +} + static __poll_t mptcp_check_readable(struct mptcp_sock *msk) { /* Concurrent splices from sk_receive_queue into receive_queue will @@ -3737,6 +3747,18 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, if (!ssk->sk_socket) mptcp_sock_graft(ssk, newsock); } + + /* Do late cleanup for the first subflow as necessary. Also + * deal with bad peers not doing a complete shutdown. 
+ */ + if (msk->first && + unlikely(inet_sk_state_load(msk->first) == TCP_CLOSE)) { + __mptcp_close_ssk(newsk, msk->first, + mptcp_subflow_ctx(msk->first), 0); + if (unlikely(list_empty(&msk->conn_list))) + inet_sk_state_store(newsk, TCP_CLOSE); + } + release_sock(newsk); } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 3a2db1b862dd..d6469b6ab38e 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -634,6 +634,7 @@ void mptcp_sock_graft(struct sock *sk, struct socket *parent); struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk); bool __mptcp_close(struct sock *sk, long timeout); void mptcp_cancel_work(struct sock *sk); +void __mptcp_unaccepted_force_close(struct sock *sk); void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk); bool mptcp_addresses_equal(const struct mptcp_addr_info *a, diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index bf5e5c72b5ee..281c1cc8dc8d 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -723,9 +723,12 @@ void mptcp_subflow_drop_ctx(struct sock *ssk) if (!ctx) return; - subflow_ulp_fallback(ssk, ctx); - if (ctx->conn) - sock_put(ctx->conn); + list_del(&mptcp_subflow_ctx(ssk)->node); + if (inet_csk(ssk)->icsk_ulp_ops) { + subflow_ulp_fallback(ssk, ctx); + if (ctx->conn) + sock_put(ctx->conn); + } kfree_rcu(ctx, rcu); } @@ -1824,6 +1827,7 @@ void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_s struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue; struct mptcp_sock *msk, *next, *head = NULL; struct request_sock *req; + struct sock *sk; /* build a list of all unaccepted mptcp sockets */ spin_lock_bh(&queue->rskq_lock); @@ -1839,11 +1843,12 @@ void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_s continue; /* skip if already in list */ - msk = mptcp_sk(subflow->conn); + sk = subflow->conn; + msk = mptcp_sk(sk); if (msk->dl_next || msk == head) continue; - sock_hold(subflow->conn); + sock_hold(sk); msk->dl_next = head; head = msk; } @@ -1857,16 +1862,13 @@ void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_s release_sock(listener_ssk); for (msk = head; msk; msk = next) { - struct sock *sk = (struct sock *)msk; + sk = (struct sock *)msk; lock_sock_nested(sk, SINGLE_DEPTH_NESTING); next = msk->dl_next; msk->dl_next = NULL; - /* prevent the stack from later re-schedule the worker for - * this socket - */ - inet_sk_state_store(sk, TCP_CLOSE); + __mptcp_unaccepted_force_close(sk); release_sock(sk); /* lockdep will report a false positive ABBA deadlock From 1f64757ee2bb22a93ec89b4c71707297e8cca0ba Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 17 Apr 2023 18:52:51 +0200 Subject: [PATCH 24/33] mlxsw: pci: Fix possible crash during initialization During initialization the driver issues a reset command via its command interface in order to remove previous configuration from the device. After issuing the reset, the driver waits for 200ms before polling on the "system_status" register using memory-mapped IO until the device reaches a ready state (0x5E). The wait is necessary because the reset command only triggers the reset, but the reset itself happens asynchronously. If the driver starts polling too soon, the read of the "system_status" register will never return and the system will crash [1]. The issue was discovered when the device was flashed with a development firmware version where the reset routine took longer to complete. 
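For reference, the wait-then-poll sequence described above is sketched below. This is only an illustration assembled from the timeout and ready-magic macros visible in the hunk further down; the wait_for_fw_ready() name, the sys_status_reg argument and the ioread32be() accessor are placeholders rather than the actual mlxsw_pci code.

	/* Sketch only: the reset command has already been issued; give the
	 * device MLXSW_PCI_SW_RESET_WAIT_MSECS to start the asynchronous
	 * reset, then poll the system-status word until it reports the
	 * ready magic (0x5E) or the overall timeout expires.
	 */
	static int wait_for_fw_ready(void __iomem *sys_status_reg)
	{
		unsigned long end;

		msleep(MLXSW_PCI_SW_RESET_WAIT_MSECS);
		end = jiffies + msecs_to_jiffies(MLXSW_PCI_SW_RESET_TIMEOUT_MSECS);
		do {
			u32 val = ioread32be(sys_status_reg);

			if ((val & MLXSW_PCI_FW_READY_MASK) ==
			    MLXSW_PCI_FW_READY_MAGIC)
				return 0;
			cond_resched();
		} while (time_before(jiffies, end));

		return -EBUSY;
	}
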
The issue was fixed in the firmware, but it exposed the fact that the current wait time is borderline. Fix by increasing the wait time from 200ms to 400ms. With this patch and the buggy firmware version, the issue did not reproduce in 10 reboots whereas without the patch the issue is reproduced quite consistently. [1] mce: CPUs not responding to MCE broadcast (may include false positives): 0,4 mce: CPUs not responding to MCE broadcast (may include false positives): 0,4 Kernel panic - not syncing: Timeout: Not all CPUs entered broadcast exception handler Shutting down cpus with NMI Kernel Offset: 0x12000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) Fixes: ac004e84164e ("mlxsw: pci: Wait longer before accessing the device after reset") Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/pci_hw.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h b/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h index 48dbfea0a2a1..7cdf0ce24f28 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h +++ b/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h @@ -26,7 +26,7 @@ #define MLXSW_PCI_CIR_TIMEOUT_MSECS 1000 #define MLXSW_PCI_SW_RESET_TIMEOUT_MSECS 900000 -#define MLXSW_PCI_SW_RESET_WAIT_MSECS 200 +#define MLXSW_PCI_SW_RESET_WAIT_MSECS 400 #define MLXSW_PCI_FW_READY 0xA1844 #define MLXSW_PCI_FW_READY_MASK 0xFFFF #define MLXSW_PCI_FW_READY_MAGIC 0x5E From fcd4843a19d50f9e59116b2643e1a7d171b6fca1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 17 Apr 2023 22:50:55 +0200 Subject: [PATCH 25/33] hamradio: drop ISA_DMA_API dependency It looks like the dependency got added accidentally in commit a553260618d8 ("[PATCH] ISA DMA Kconfig fixes - part 3"). Unlike the previously removed dmascc driver, the scc driver never used DMA. Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- drivers/net/hamradio/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/hamradio/Kconfig b/drivers/net/hamradio/Kconfig index a9c44f08199d..a94c7bd5db2e 100644 --- a/drivers/net/hamradio/Kconfig +++ b/drivers/net/hamradio/Kconfig @@ -47,7 +47,7 @@ config BPQETHER config SCC tristate "Z8530 SCC driver" - depends on ISA && AX25 && ISA_DMA_API + depends on ISA && AX25 help These cards are used to connect your Linux box to an amateur radio in order to communicate with other computers. If you want to use From 71b547f561247897a0a14f3082730156c0533fed Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 11 Apr 2023 15:24:13 +0000 Subject: [PATCH 26/33] bpf: Fix incorrect verifier pruning due to missing register precision taints Juan Jose et al reported an issue found via fuzzing where the verifier's pruning logic prematurely marks a program path as safe. 
Consider the following program: 0: (b7) r6 = 1024 1: (b7) r7 = 0 2: (b7) r8 = 0 3: (b7) r9 = -2147483648 4: (97) r6 %= 1025 5: (05) goto pc+0 6: (bd) if r6 <= r9 goto pc+2 7: (97) r6 %= 1 8: (b7) r9 = 0 9: (bd) if r6 <= r9 goto pc+1 10: (b7) r6 = 0 11: (b7) r0 = 0 12: (63) *(u32 *)(r10 -4) = r0 13: (18) r4 = 0xffff888103693400 // map_ptr(ks=4,vs=48) 15: (bf) r1 = r4 16: (bf) r2 = r10 17: (07) r2 += -4 18: (85) call bpf_map_lookup_elem#1 19: (55) if r0 != 0x0 goto pc+1 20: (95) exit 21: (77) r6 >>= 10 22: (27) r6 *= 8192 23: (bf) r1 = r0 24: (0f) r0 += r6 25: (79) r3 = *(u64 *)(r0 +0) 26: (7b) *(u64 *)(r1 +0) = r3 27: (95) exit The verifier treats this as safe, leading to oob read/write access due to an incorrect verifier conclusion: func#0 @0 0: R1=ctx(off=0,imm=0) R10=fp0 0: (b7) r6 = 1024 ; R6_w=1024 1: (b7) r7 = 0 ; R7_w=0 2: (b7) r8 = 0 ; R8_w=0 3: (b7) r9 = -2147483648 ; R9_w=-2147483648 4: (97) r6 %= 1025 ; R6_w=scalar() 5: (05) goto pc+0 6: (bd) if r6 <= r9 goto pc+2 ; R6_w=scalar(umin=18446744071562067969,var_off=(0xffffffff00000000; 0xffffffff)) R9_w=-2147483648 7: (97) r6 %= 1 ; R6_w=scalar() 8: (b7) r9 = 0 ; R9=0 9: (bd) if r6 <= r9 goto pc+1 ; R6=scalar(umin=1) R9=0 10: (b7) r6 = 0 ; R6_w=0 11: (b7) r0 = 0 ; R0_w=0 12: (63) *(u32 *)(r10 -4) = r0 last_idx 12 first_idx 9 regs=1 stack=0 before 11: (b7) r0 = 0 13: R0_w=0 R10=fp0 fp-8=0000???? 13: (18) r4 = 0xffff8ad3886c2a00 ; R4_w=map_ptr(off=0,ks=4,vs=48,imm=0) 15: (bf) r1 = r4 ; R1_w=map_ptr(off=0,ks=4,vs=48,imm=0) R4_w=map_ptr(off=0,ks=4,vs=48,imm=0) 16: (bf) r2 = r10 ; R2_w=fp0 R10=fp0 17: (07) r2 += -4 ; R2_w=fp-4 18: (85) call bpf_map_lookup_elem#1 ; R0=map_value_or_null(id=1,off=0,ks=4,vs=48,imm=0) 19: (55) if r0 != 0x0 goto pc+1 ; R0=0 20: (95) exit from 19 to 21: R0=map_value(off=0,ks=4,vs=48,imm=0) R6=0 R7=0 R8=0 R9=0 R10=fp0 fp-8=mmmm???? 21: (77) r6 >>= 10 ; R6_w=0 22: (27) r6 *= 8192 ; R6_w=0 23: (bf) r1 = r0 ; R0=map_value(off=0,ks=4,vs=48,imm=0) R1_w=map_value(off=0,ks=4,vs=48,imm=0) 24: (0f) r0 += r6 last_idx 24 first_idx 19 regs=40 stack=0 before 23: (bf) r1 = r0 regs=40 stack=0 before 22: (27) r6 *= 8192 regs=40 stack=0 before 21: (77) r6 >>= 10 regs=40 stack=0 before 19: (55) if r0 != 0x0 goto pc+1 parent didn't have regs=40 stack=0 marks: R0_rw=map_value_or_null(id=1,off=0,ks=4,vs=48,imm=0) R6_rw=P0 R7=0 R8=0 R9=0 R10=fp0 fp-8=mmmm???? last_idx 18 first_idx 9 regs=40 stack=0 before 18: (85) call bpf_map_lookup_elem#1 regs=40 stack=0 before 17: (07) r2 += -4 regs=40 stack=0 before 16: (bf) r2 = r10 regs=40 stack=0 before 15: (bf) r1 = r4 regs=40 stack=0 before 13: (18) r4 = 0xffff8ad3886c2a00 regs=40 stack=0 before 12: (63) *(u32 *)(r10 -4) = r0 regs=40 stack=0 before 11: (b7) r0 = 0 regs=40 stack=0 before 10: (b7) r6 = 0 25: (79) r3 = *(u64 *)(r0 +0) ; R0_w=map_value(off=0,ks=4,vs=48,imm=0) R3_w=scalar() 26: (7b) *(u64 *)(r1 +0) = r3 ; R1_w=map_value(off=0,ks=4,vs=48,imm=0) R3_w=scalar() 27: (95) exit from 9 to 11: R1=ctx(off=0,imm=0) R6=0 R7=0 R8=0 R9=0 R10=fp0 11: (b7) r0 = 0 ; R0_w=0 12: (63) *(u32 *)(r10 -4) = r0 last_idx 12 first_idx 11 regs=1 stack=0 before 11: (b7) r0 = 0 13: R0_w=0 R10=fp0 fp-8=0000???? 
13: (18) r4 = 0xffff8ad3886c2a00 ; R4_w=map_ptr(off=0,ks=4,vs=48,imm=0) 15: (bf) r1 = r4 ; R1_w=map_ptr(off=0,ks=4,vs=48,imm=0) R4_w=map_ptr(off=0,ks=4,vs=48,imm=0) 16: (bf) r2 = r10 ; R2_w=fp0 R10=fp0 17: (07) r2 += -4 ; R2_w=fp-4 18: (85) call bpf_map_lookup_elem#1 frame 0: propagating r6 last_idx 19 first_idx 11 regs=40 stack=0 before 18: (85) call bpf_map_lookup_elem#1 regs=40 stack=0 before 17: (07) r2 += -4 regs=40 stack=0 before 16: (bf) r2 = r10 regs=40 stack=0 before 15: (bf) r1 = r4 regs=40 stack=0 before 13: (18) r4 = 0xffff8ad3886c2a00 regs=40 stack=0 before 12: (63) *(u32 *)(r10 -4) = r0 regs=40 stack=0 before 11: (b7) r0 = 0 parent didn't have regs=40 stack=0 marks: R1=ctx(off=0,imm=0) R6_r=P0 R7=0 R8=0 R9=0 R10=fp0 last_idx 9 first_idx 9 regs=40 stack=0 before 9: (bd) if r6 <= r9 goto pc+1 parent didn't have regs=40 stack=0 marks: R1=ctx(off=0,imm=0) R6_rw=Pscalar() R7_w=0 R8_w=0 R9_rw=0 R10=fp0 last_idx 8 first_idx 0 regs=40 stack=0 before 8: (b7) r9 = 0 regs=40 stack=0 before 7: (97) r6 %= 1 regs=40 stack=0 before 6: (bd) if r6 <= r9 goto pc+2 regs=40 stack=0 before 5: (05) goto pc+0 regs=40 stack=0 before 4: (97) r6 %= 1025 regs=40 stack=0 before 3: (b7) r9 = -2147483648 regs=40 stack=0 before 2: (b7) r8 = 0 regs=40 stack=0 before 1: (b7) r7 = 0 regs=40 stack=0 before 0: (b7) r6 = 1024 19: safe frame 0: propagating r6 last_idx 9 first_idx 0 regs=40 stack=0 before 6: (bd) if r6 <= r9 goto pc+2 regs=40 stack=0 before 5: (05) goto pc+0 regs=40 stack=0 before 4: (97) r6 %= 1025 regs=40 stack=0 before 3: (b7) r9 = -2147483648 regs=40 stack=0 before 2: (b7) r8 = 0 regs=40 stack=0 before 1: (b7) r7 = 0 regs=40 stack=0 before 0: (b7) r6 = 1024 from 6 to 9: safe verification time 110 usec stack depth 4 processed 36 insns (limit 1000000) max_states_per_insn 0 total_states 3 peak_states 3 mark_read 2 The verifier considers this program as safe by mistakenly pruning unsafe code paths. In the above func#0, code lines 0-10 are of interest. In line 0-3 registers r6 to r9 are initialized with known scalar values. In line 4 the register r6 is reset to an unknown scalar given the verifier does not track modulo operations. Due to this, the verifier can also not determine precisely which branches in line 6 and 9 are taken, therefore it needs to explore them both. As can be seen, the verifier starts with exploring the false/fall-through paths first. The 'from 19 to 21' path has both r6=0 and r9=0 and the pointer arithmetic on r0 += r6 is therefore considered safe. Given the arithmetic, r6 is correctly marked for precision tracking where backtracking kicks in where it walks back the current path all the way where r6 was set to 0 in the fall-through branch. Next, the pruning logics pops the path 'from 9 to 11' from the stack. Also here, the state of the registers is the same, that is, r6=0 and r9=0, so that at line 19 the path can be pruned as it is considered safe. It is interesting to note that the conditional in line 9 turned r6 into a more precise state, that is, in the fall-through path at the beginning of line 10, it is R6=scalar(umin=1), and in the branch-taken path (which is analyzed here) at the beginning of line 11, r6 turned into a known const r6=0 as r9=0 prior to that and therefore (unsigned) r6 <= 0 concludes that r6 must be 0 (**): [...] ; R6_w=scalar() 9: (bd) if r6 <= r9 goto pc+1 ; R6=scalar(umin=1) R9=0 [...] from 9 to 11: R1=ctx(off=0,imm=0) R6=0 R7=0 R8=0 R9=0 R10=fp0 [...] The next path is 'from 6 to 9'. 
The verifier considers the old and current state equivalent, and therefore prunes the search incorrectly. Looking into the two states which are being compared by the pruning logic at line 9, the old state consists of R6_rwD=Pscalar() R9_rwD=0 R10=fp0 and the new state consists of R1=ctx(off=0,imm=0) R6_w=scalar(umax=18446744071562067968) R7_w=0 R8_w=0 R9_w=-2147483648 R10=fp0. While r6 had the reg->precise flag correctly set in the old state, r9 did not. Both r6'es are considered as equivalent given the old one is a superset of the current, more precise one, however, r9's actual values (0 vs 0x80000000) mismatch. Given the old r9 did not have reg->precise flag set, the verifier does not consider the register as contributing to the precision state of r6, and therefore it considered both r9 states as equivalent. However, for this specific pruned path (which is also the actual path taken at runtime), register r6 will be 0x400 and r9 0x80000000 when reaching line 21, thus oob-accessing the map. The purpose of precision tracking is to initially mark registers (including spilled ones) as imprecise to help verifier's pruning logic finding equivalent states it can then prune if they don't contribute to the program's safety aspects. For example, if registers are used for pointer arithmetic or to pass constant length to a helper, then the verifier sets reg->precise flag and backtracks the BPF program instruction sequence and chain of verifier states to ensure that the given register or stack slot including their dependencies are marked as precisely tracked scalar. This also includes any other registers and slots that contribute to a tracked state of given registers/stack slot. This backtracking relies on recorded jmp_history and is able to traverse entire chain of parent states. This process ends only when all the necessary registers/slots and their transitive dependencies are marked as precise. The backtrack_insn() is called from the current instruction up to the first instruction, and its purpose is to compute a bitmask of registers and stack slots that need precision tracking in the parent's verifier state. For example, if a current instruction is r6 = r7, then r6 needs precision after this instruction and r7 needs precision before this instruction, that is, in the parent state. Hence for the latter r7 is marked and r6 unmarked. For the class of jmp/jmp32 instructions, backtrack_insn() today only looks at call and exit instructions and for all other conditionals the masks remain as-is. However, in the given situation register r6 has a dependency on r9 (as described above in **), so also that one needs to be marked for precision tracking. In other words, if an imprecise register influences a precise one, then the imprecise register should also be marked precise. Meaning, in the parent state both dest and src register need to be tracked for precision and therefore the marking must be more conservative by setting reg->precise flag for both. The precision propagation needs to cover both for the conditional: if the src reg was marked but not the dst reg and vice versa. 
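As a reading aid for the verifier traces quoted in this log (an illustration only, not part of the fix): backtrack_insn() keeps the set of registers that still need a precision mark in a bitmask, with bit N standing for rN, and that mask is what the regs= values in the output encode.

	/* For the conditional "if r6 <= r9" discussed above:
	 *
	 *   before the fix: reg_mask = BIT(6)           = 0x40  -> "regs=40"
	 *   after the fix:  reg_mask |= BIT(6) | BIT(9) = 0x240 -> "regs=240"
	 *
	 * i.e. the source register r9 now also has precision propagated
	 * into the parent state, matching the reasoning in (**).
	 */
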
After the fix the program is correctly rejected: func#0 @0 0: R1=ctx(off=0,imm=0) R10=fp0 0: (b7) r6 = 1024 ; R6_w=1024 1: (b7) r7 = 0 ; R7_w=0 2: (b7) r8 = 0 ; R8_w=0 3: (b7) r9 = -2147483648 ; R9_w=-2147483648 4: (97) r6 %= 1025 ; R6_w=scalar() 5: (05) goto pc+0 6: (bd) if r6 <= r9 goto pc+2 ; R6_w=scalar(umin=18446744071562067969,var_off=(0xffffffff80000000; 0x7fffffff),u32_min=-2147483648) R9_w=-2147483648 7: (97) r6 %= 1 ; R6_w=scalar() 8: (b7) r9 = 0 ; R9=0 9: (bd) if r6 <= r9 goto pc+1 ; R6=scalar(umin=1) R9=0 10: (b7) r6 = 0 ; R6_w=0 11: (b7) r0 = 0 ; R0_w=0 12: (63) *(u32 *)(r10 -4) = r0 last_idx 12 first_idx 9 regs=1 stack=0 before 11: (b7) r0 = 0 13: R0_w=0 R10=fp0 fp-8=0000???? 13: (18) r4 = 0xffff9290dc5bfe00 ; R4_w=map_ptr(off=0,ks=4,vs=48,imm=0) 15: (bf) r1 = r4 ; R1_w=map_ptr(off=0,ks=4,vs=48,imm=0) R4_w=map_ptr(off=0,ks=4,vs=48,imm=0) 16: (bf) r2 = r10 ; R2_w=fp0 R10=fp0 17: (07) r2 += -4 ; R2_w=fp-4 18: (85) call bpf_map_lookup_elem#1 ; R0=map_value_or_null(id=1,off=0,ks=4,vs=48,imm=0) 19: (55) if r0 != 0x0 goto pc+1 ; R0=0 20: (95) exit from 19 to 21: R0=map_value(off=0,ks=4,vs=48,imm=0) R6=0 R7=0 R8=0 R9=0 R10=fp0 fp-8=mmmm???? 21: (77) r6 >>= 10 ; R6_w=0 22: (27) r6 *= 8192 ; R6_w=0 23: (bf) r1 = r0 ; R0=map_value(off=0,ks=4,vs=48,imm=0) R1_w=map_value(off=0,ks=4,vs=48,imm=0) 24: (0f) r0 += r6 last_idx 24 first_idx 19 regs=40 stack=0 before 23: (bf) r1 = r0 regs=40 stack=0 before 22: (27) r6 *= 8192 regs=40 stack=0 before 21: (77) r6 >>= 10 regs=40 stack=0 before 19: (55) if r0 != 0x0 goto pc+1 parent didn't have regs=40 stack=0 marks: R0_rw=map_value_or_null(id=1,off=0,ks=4,vs=48,imm=0) R6_rw=P0 R7=0 R8=0 R9=0 R10=fp0 fp-8=mmmm???? last_idx 18 first_idx 9 regs=40 stack=0 before 18: (85) call bpf_map_lookup_elem#1 regs=40 stack=0 before 17: (07) r2 += -4 regs=40 stack=0 before 16: (bf) r2 = r10 regs=40 stack=0 before 15: (bf) r1 = r4 regs=40 stack=0 before 13: (18) r4 = 0xffff9290dc5bfe00 regs=40 stack=0 before 12: (63) *(u32 *)(r10 -4) = r0 regs=40 stack=0 before 11: (b7) r0 = 0 regs=40 stack=0 before 10: (b7) r6 = 0 25: (79) r3 = *(u64 *)(r0 +0) ; R0_w=map_value(off=0,ks=4,vs=48,imm=0) R3_w=scalar() 26: (7b) *(u64 *)(r1 +0) = r3 ; R1_w=map_value(off=0,ks=4,vs=48,imm=0) R3_w=scalar() 27: (95) exit from 9 to 11: R1=ctx(off=0,imm=0) R6=0 R7=0 R8=0 R9=0 R10=fp0 11: (b7) r0 = 0 ; R0_w=0 12: (63) *(u32 *)(r10 -4) = r0 last_idx 12 first_idx 11 regs=1 stack=0 before 11: (b7) r0 = 0 13: R0_w=0 R10=fp0 fp-8=0000???? 
13: (18) r4 = 0xffff9290dc5bfe00 ; R4_w=map_ptr(off=0,ks=4,vs=48,imm=0) 15: (bf) r1 = r4 ; R1_w=map_ptr(off=0,ks=4,vs=48,imm=0) R4_w=map_ptr(off=0,ks=4,vs=48,imm=0) 16: (bf) r2 = r10 ; R2_w=fp0 R10=fp0 17: (07) r2 += -4 ; R2_w=fp-4 18: (85) call bpf_map_lookup_elem#1 frame 0: propagating r6 last_idx 19 first_idx 11 regs=40 stack=0 before 18: (85) call bpf_map_lookup_elem#1 regs=40 stack=0 before 17: (07) r2 += -4 regs=40 stack=0 before 16: (bf) r2 = r10 regs=40 stack=0 before 15: (bf) r1 = r4 regs=40 stack=0 before 13: (18) r4 = 0xffff9290dc5bfe00 regs=40 stack=0 before 12: (63) *(u32 *)(r10 -4) = r0 regs=40 stack=0 before 11: (b7) r0 = 0 parent didn't have regs=40 stack=0 marks: R1=ctx(off=0,imm=0) R6_r=P0 R7=0 R8=0 R9=0 R10=fp0 last_idx 9 first_idx 9 regs=40 stack=0 before 9: (bd) if r6 <= r9 goto pc+1 parent didn't have regs=240 stack=0 marks: R1=ctx(off=0,imm=0) R6_rw=Pscalar() R7_w=0 R8_w=0 R9_rw=P0 R10=fp0 last_idx 8 first_idx 0 regs=240 stack=0 before 8: (b7) r9 = 0 regs=40 stack=0 before 7: (97) r6 %= 1 regs=40 stack=0 before 6: (bd) if r6 <= r9 goto pc+2 regs=240 stack=0 before 5: (05) goto pc+0 regs=240 stack=0 before 4: (97) r6 %= 1025 regs=240 stack=0 before 3: (b7) r9 = -2147483648 regs=40 stack=0 before 2: (b7) r8 = 0 regs=40 stack=0 before 1: (b7) r7 = 0 regs=40 stack=0 before 0: (b7) r6 = 1024 19: safe from 6 to 9: R1=ctx(off=0,imm=0) R6_w=scalar(umax=18446744071562067968) R7_w=0 R8_w=0 R9_w=-2147483648 R10=fp0 9: (bd) if r6 <= r9 goto pc+1 last_idx 9 first_idx 0 regs=40 stack=0 before 6: (bd) if r6 <= r9 goto pc+2 regs=240 stack=0 before 5: (05) goto pc+0 regs=240 stack=0 before 4: (97) r6 %= 1025 regs=240 stack=0 before 3: (b7) r9 = -2147483648 regs=40 stack=0 before 2: (b7) r8 = 0 regs=40 stack=0 before 1: (b7) r7 = 0 regs=40 stack=0 before 0: (b7) r6 = 1024 last_idx 9 first_idx 0 regs=200 stack=0 before 6: (bd) if r6 <= r9 goto pc+2 regs=240 stack=0 before 5: (05) goto pc+0 regs=240 stack=0 before 4: (97) r6 %= 1025 regs=240 stack=0 before 3: (b7) r9 = -2147483648 regs=40 stack=0 before 2: (b7) r8 = 0 regs=40 stack=0 before 1: (b7) r7 = 0 regs=40 stack=0 before 0: (b7) r6 = 1024 11: R6=scalar(umax=18446744071562067968) R9=-2147483648 11: (b7) r0 = 0 ; R0_w=0 12: (63) *(u32 *)(r10 -4) = r0 last_idx 12 first_idx 11 regs=1 stack=0 before 11: (b7) r0 = 0 13: R0_w=0 R10=fp0 fp-8=0000???? 13: (18) r4 = 0xffff9290dc5bfe00 ; R4_w=map_ptr(off=0,ks=4,vs=48,imm=0) 15: (bf) r1 = r4 ; R1_w=map_ptr(off=0,ks=4,vs=48,imm=0) R4_w=map_ptr(off=0,ks=4,vs=48,imm=0) 16: (bf) r2 = r10 ; R2_w=fp0 R10=fp0 17: (07) r2 += -4 ; R2_w=fp-4 18: (85) call bpf_map_lookup_elem#1 ; R0_w=map_value_or_null(id=3,off=0,ks=4,vs=48,imm=0) 19: (55) if r0 != 0x0 goto pc+1 ; R0_w=0 20: (95) exit from 19 to 21: R0=map_value(off=0,ks=4,vs=48,imm=0) R6=scalar(umax=18446744071562067968) R7=0 R8=0 R9=-2147483648 R10=fp0 fp-8=mmmm???? 21: (77) r6 >>= 10 ; R6_w=scalar(umax=18014398507384832,var_off=(0x0; 0x3fffffffffffff)) 22: (27) r6 *= 8192 ; R6_w=scalar(smax=9223372036854767616,umax=18446744073709543424,var_off=(0x0; 0xffffffffffffe000),s32_max=2147475456,u32_max=-8192) 23: (bf) r1 = r0 ; R0=map_value(off=0,ks=4,vs=48,imm=0) R1_w=map_value(off=0,ks=4,vs=48,imm=0) 24: (0f) r0 += r6 last_idx 24 first_idx 21 regs=40 stack=0 before 23: (bf) r1 = r0 regs=40 stack=0 before 22: (27) r6 *= 8192 regs=40 stack=0 before 21: (77) r6 >>= 10 parent didn't have regs=40 stack=0 marks: R0_rw=map_value(off=0,ks=4,vs=48,imm=0) R6_r=Pscalar(umax=18446744071562067968) R7=0 R8=0 R9=-2147483648 R10=fp0 fp-8=mmmm???? 
last_idx 19 first_idx 11 regs=40 stack=0 before 19: (55) if r0 != 0x0 goto pc+1 regs=40 stack=0 before 18: (85) call bpf_map_lookup_elem#1 regs=40 stack=0 before 17: (07) r2 += -4 regs=40 stack=0 before 16: (bf) r2 = r10 regs=40 stack=0 before 15: (bf) r1 = r4 regs=40 stack=0 before 13: (18) r4 = 0xffff9290dc5bfe00 regs=40 stack=0 before 12: (63) *(u32 *)(r10 -4) = r0 regs=40 stack=0 before 11: (b7) r0 = 0 parent didn't have regs=40 stack=0 marks: R1=ctx(off=0,imm=0) R6_rw=Pscalar(umax=18446744071562067968) R7_w=0 R8_w=0 R9_w=-2147483648 R10=fp0 last_idx 9 first_idx 0 regs=40 stack=0 before 9: (bd) if r6 <= r9 goto pc+1 regs=240 stack=0 before 6: (bd) if r6 <= r9 goto pc+2 regs=240 stack=0 before 5: (05) goto pc+0 regs=240 stack=0 before 4: (97) r6 %= 1025 regs=240 stack=0 before 3: (b7) r9 = -2147483648 regs=40 stack=0 before 2: (b7) r8 = 0 regs=40 stack=0 before 1: (b7) r7 = 0 regs=40 stack=0 before 0: (b7) r6 = 1024 math between map_value pointer and register with unbounded min value is not allowed verification time 886 usec stack depth 4 processed 49 insns (limit 1000000) max_states_per_insn 1 total_states 5 peak_states 5 mark_read 2 Fixes: b5dc0163d8fd ("bpf: precise scalar_value tracking") Reported-by: Juan Jose Lopez Jaimez Reported-by: Meador Inge Reported-by: Simon Scannell Reported-by: Nenad Stojanovski Signed-off-by: Daniel Borkmann Co-developed-by: Andrii Nakryiko Signed-off-by: Andrii Nakryiko Reviewed-by: John Fastabend Reviewed-by: Juan Jose Lopez Jaimez Reviewed-by: Meador Inge Reviewed-by: Simon Scannell --- kernel/bpf/verifier.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d517d13878cf..767e8930b0bd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2967,6 +2967,21 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, } } else if (opcode == BPF_EXIT) { return -ENOTSUPP; + } else if (BPF_SRC(insn->code) == BPF_X) { + if (!(*reg_mask & (dreg | sreg))) + return 0; + /* dreg sreg + * Both dreg and sreg need precision before + * this insn. If only sreg was marked precise + * before it would be equally necessary to + * propagate it to dreg. + */ + *reg_mask |= (sreg | dreg); + /* else dreg K + * Only dreg still needs precision before + * this insn, so for the K-based conditional + * there is nothing new to be marked. + */ } } else if (class == BPF_LD) { if (!(*reg_mask & dreg)) From 3d2f8f1f184c60508f7af3022536651d7ac2dd07 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 17 Apr 2023 20:19:33 +0200 Subject: [PATCH 27/33] net: dsa: microchip: ksz8795: Correctly handle huge frame configuration Because of the logic in place, SW_HUGE_PACKET can never be set. (If the first condition is true, then the 2nd one is also true, but is not executed) Change the logic and update each bit individually. 
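A minimal sketch of the control-flow issue and the fix (bit names and thresholds are the ones used in the hunk below; this is an illustration, not the full ksz8795_change_mtu()):

	/* Broken: a frame large enough to need SW_HUGE_PACKET also exceeds
	 * the legal packet size, so the else-if branch is unreachable and
	 * SW_HUGE_PACKET is never set.
	 */
	if (frame_size > KSZ8_LEGAL_PACKET_SIZE)
		ctrl2 |= SW_LEGAL_PACKET_DISABLE;
	else if (frame_size > KSZ8863_NORMAL_PACKET_SIZE)	/* dead */
		ctrl1 |= SW_HUGE_PACKET;

	/* Fixed: evaluate both conditions independently so both bits can
	 * be set for huge frames.
	 */
	if (frame_size > KSZ8_LEGAL_PACKET_SIZE)
		ctrl2 |= SW_LEGAL_PACKET_DISABLE;
	if (frame_size > KSZ8863_NORMAL_PACKET_SIZE)
		ctrl1 |= SW_HUGE_PACKET;
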
Fixes: 29d1e85f45e0 ("net: dsa: microchip: ksz8: add MTU configuration support") Signed-off-by: Christophe JAILLET Reviewed-by: Oleksij Rempel Reviewed-by: Simon Horman Reviewed-by: Vladimir Oltean Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/43107d9e8b5b8b05f0cbd4e1f47a2bb88c8747b2.1681755535.git.christophe.jaillet@wanadoo.fr Signed-off-by: Jakub Kicinski --- drivers/net/dsa/microchip/ksz8795.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/microchip/ksz8795.c b/drivers/net/dsa/microchip/ksz8795.c index 3fffd5da8d3b..ffcad057d065 100644 --- a/drivers/net/dsa/microchip/ksz8795.c +++ b/drivers/net/dsa/microchip/ksz8795.c @@ -96,7 +96,7 @@ static int ksz8795_change_mtu(struct ksz_device *dev, int frame_size) if (frame_size > KSZ8_LEGAL_PACKET_SIZE) ctrl2 |= SW_LEGAL_PACKET_DISABLE; - else if (frame_size > KSZ8863_NORMAL_PACKET_SIZE) + if (frame_size > KSZ8863_NORMAL_PACKET_SIZE) ctrl1 |= SW_HUGE_PACKET; ret = ksz_rmw8(dev, REG_SW_CTRL_1, SW_HUGE_PACKET, ctrl1); From 8c154d272c3e03b100baaf1df473f22a78fa403e Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Tue, 18 Apr 2023 13:25:11 -0700 Subject: [PATCH 28/33] bnxt_en: fix free-runnig PHC mode The patch in fixes changed the way real-time mode is chosen for PHC on the NIC. Apparently there is one more use case of the check outside of ptp part of the driver which was not converted to the new macro and is making a lot of noise in free-running mode. Fixes: 131db4991622 ("bnxt_en: reset PHC frequency in free-running mode") Signed-off-by: Vadim Fedorenko Reviewed-by: Michael Chan Reviewed-by: Pavan Chebbi Link: https://lore.kernel.org/r/20230418202511.1544735-1-vadfed@meta.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index ef97a4190b39..651b79ce5d80 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -2388,7 +2388,7 @@ static int bnxt_async_event_process(struct bnxt *bp, case ASYNC_EVENT_CMPL_EVENT_ID_PHC_UPDATE: { switch (BNXT_EVENT_PHC_EVENT_TYPE(data1)) { case ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_FLAGS_PHC_RTC_UPDATE: - if (bp->fw_cap & BNXT_FW_CAP_PTP_RTC) { + if (BNXT_PTP_USE_RTC(bp)) { struct bnxt_ptp_cfg *ptp = bp->ptp_cfg; u64 ns; From 67d47b95119ad589b0a0b16b88b1dd9a04061ced Mon Sep 17 00:00:00 2001 From: Sebastian Basierski Date: Mon, 17 Apr 2023 13:53:45 -0700 Subject: [PATCH 29/33] e1000e: Disable TSO on i219-LM card to increase speed While using i219-LM card currently it was only possible to achieve about 60% of maximum speed due to regression introduced in Linux 5.8. This was caused by TSO not being disabled by default despite commit f29801030ac6 ("e1000e: Disable TSO for buffer overrun workaround"). Fix that by disabling TSO during driver probe. 
Fixes: f29801030ac6 ("e1000e: Disable TSO for buffer overrun workaround") Signed-off-by: Sebastian Basierski Signed-off-by: Mateusz Palczewski Tested-by: Naama Meir Signed-off-by: Tony Nguyen Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230417205345.1030801-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/e1000e/netdev.c | 51 +++++++++++----------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index e1eb1de88bf9..e14d1e45318f 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -5288,31 +5288,6 @@ static void e1000_watchdog_task(struct work_struct *work) ew32(TARC(0), tarc0); } - /* disable TSO for pcie and 10/100 speeds, to avoid - * some hardware issues - */ - if (!(adapter->flags & FLAG_TSO_FORCE)) { - switch (adapter->link_speed) { - case SPEED_10: - case SPEED_100: - e_info("10/100 speed: disabling TSO\n"); - netdev->features &= ~NETIF_F_TSO; - netdev->features &= ~NETIF_F_TSO6; - break; - case SPEED_1000: - netdev->features |= NETIF_F_TSO; - netdev->features |= NETIF_F_TSO6; - break; - default: - /* oops */ - break; - } - if (hw->mac.type == e1000_pch_spt) { - netdev->features &= ~NETIF_F_TSO; - netdev->features &= ~NETIF_F_TSO6; - } - } - /* enable transmits in the hardware, need to do this * after setting TARC(0) */ @@ -7526,6 +7501,32 @@ static int e1000_probe(struct pci_dev *pdev, const struct pci_device_id *ent) NETIF_F_RXCSUM | NETIF_F_HW_CSUM); + /* disable TSO for pcie and 10/100 speeds to avoid + * some hardware issues and for i219 to fix transfer + * speed being capped at 60% + */ + if (!(adapter->flags & FLAG_TSO_FORCE)) { + switch (adapter->link_speed) { + case SPEED_10: + case SPEED_100: + e_info("10/100 speed: disabling TSO\n"); + netdev->features &= ~NETIF_F_TSO; + netdev->features &= ~NETIF_F_TSO6; + break; + case SPEED_1000: + netdev->features |= NETIF_F_TSO; + netdev->features |= NETIF_F_TSO6; + break; + default: + /* oops */ + break; + } + if (hw->mac.type == e1000_pch_spt) { + netdev->features &= ~NETIF_F_TSO; + netdev->features &= ~NETIF_F_TSO6; + } + } + /* Set user-changeable features (subset of all device features) */ netdev->hw_features = netdev->features; netdev->hw_features |= NETIF_F_RXFCS; From 7b3aba7ea336d069b30b91502d47792c280bae2b Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Tue, 18 Apr 2023 10:36:59 +0200 Subject: [PATCH 30/33] mailmap: add entries for Mat Martineau Map Mat's old corporate addresses to his kernel.org one. Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Link: https://lore.kernel.org/r/20230418-upstream-net-20230418-mailmap-mat-v1-1-13ca5dc83037@tessares.net Signed-off-by: Jakub Kicinski --- .mailmap | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.mailmap b/.mailmap index e42486317d18..69dae9b6ec95 100644 --- a/.mailmap +++ b/.mailmap @@ -297,6 +297,8 @@ Martin Kepplinger Martin Kepplinger Martyna Szapar-Mudlaw Mathieu Othacehe +Mat Martineau +Mat Martineau Matthew Wilcox Matthew Wilcox Matthew Wilcox From 52b37ae8aa6797a8183e8554672797045e81d9ae Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Tue, 18 Apr 2023 16:13:18 -0700 Subject: [PATCH 31/33] MAINTAINERS: Resume MPTCP co-maintainer role I'm returning to the MPTCP maintainer role I held for most of the subsytem's history. This time I'm using my kernel.org email address. 
Acked-by: Matthieu Baerts Link: https://lore.kernel.org/mptcp/af85e467-8d0a-4eba-b5f8-e2f2c5d24984@tessares.net/ Signed-off-by: Mat Martineau Link: https://lore.kernel.org/r/20230418231318.115331-1-martineau@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 0e64787aace8..c6545eb54104 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14594,6 +14594,7 @@ F: net/netlabel/ NETWORKING [MPTCP] M: Matthieu Baerts +M: Mat Martineau L: netdev@vger.kernel.org L: mptcp@lists.linux.dev S: Maintained From f52cc627b832e08a7bcf1b7e81e650ec308fe1d8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 13 Apr 2023 15:25:47 -0700 Subject: [PATCH 32/33] Revert "net/mlx5: Enable management PF initialization" This reverts commit fe998a3c77b9f989a30a2a01fb00d3729a6d53a4. Paul reports that it causes a regression with IB on CX4 and FW 12.18.1000. In addition I think that the concept of "management PF" is not fully accepted and requires a discussion. Fixes: fe998a3c77b9 ("net/mlx5: Enable management PF initialization") Reported-by: Paul Moore Link: https://lore.kernel.org/all/CAHC9VhQ7A4+msL38WpbOMYjAqLp0EtOjeLh4Dc6SQtD6OUvCQg@mail.gmail.com/ Link: https://lore.kernel.org/r/20230413222547.56901-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/dev.c | 6 ------ drivers/net/ethernet/mellanox/mlx5/core/ecpf.c | 8 -------- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 2 +- include/linux/mlx5/driver.h | 5 ----- 4 files changed, 1 insertion(+), 20 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/dev.c index 445fe30c3d0b..2e7806001fdc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/dev.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/dev.c @@ -59,9 +59,6 @@ bool mlx5_eth_supported(struct mlx5_core_dev *dev) if (!IS_ENABLED(CONFIG_MLX5_CORE_EN)) return false; - if (mlx5_core_is_management_pf(dev)) - return false; - if (MLX5_CAP_GEN(dev, port_type) != MLX5_CAP_PORT_TYPE_ETH) return false; @@ -201,9 +198,6 @@ bool mlx5_rdma_supported(struct mlx5_core_dev *dev) if (!IS_ENABLED(CONFIG_MLX5_INFINIBAND)) return false; - if (mlx5_core_is_management_pf(dev)) - return false; - if (dev->priv.flags & MLX5_PRIV_FLAGS_DISABLE_IB_ADEV) return false; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c b/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c index 7c9c4e40c019..d000236ddbac 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c @@ -75,10 +75,6 @@ int mlx5_ec_init(struct mlx5_core_dev *dev) if (!mlx5_core_is_ecpf(dev)) return 0; - /* Management PF don't have a peer PF */ - if (mlx5_core_is_management_pf(dev)) - return 0; - return mlx5_host_pf_init(dev); } @@ -89,10 +85,6 @@ void mlx5_ec_cleanup(struct mlx5_core_dev *dev) if (!mlx5_core_is_ecpf(dev)) return; - /* Management PF don't have a peer PF */ - if (mlx5_core_is_management_pf(dev)) - return; - mlx5_host_pf_cleanup(dev); err = mlx5_wait_for_pages(dev, &dev->priv.page_counters[MLX5_HOST_PF]); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 8bdf28762f41..19fed514fc17 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -1488,7 +1488,7 @@ int mlx5_esw_sf_max_hpf_functions(struct mlx5_core_dev *dev, u16 *max_sfs, u16 * void *hca_caps; int err; - if (!mlx5_core_is_ecpf(dev) 
|| mlx5_core_is_management_pf(dev)) { + if (!mlx5_core_is_ecpf(dev)) { *max_sfs = 0; return 0; } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index f33389b42209..7e225e41d55b 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1211,11 +1211,6 @@ static inline bool mlx5_core_is_vf(const struct mlx5_core_dev *dev) return dev->coredev_type == MLX5_COREDEV_VF; } -static inline bool mlx5_core_is_management_pf(const struct mlx5_core_dev *dev) -{ - return MLX5_CAP_GEN(dev, num_ports) == 1 && !MLX5_CAP_GEN(dev, native_port_num); -} - static inline bool mlx5_core_is_ecpf(const struct mlx5_core_dev *dev) { return dev->caps.embedded_cpu; From 927cdea5d2095287ddd5246e5aa68eb5d68db2be Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 18 Apr 2023 18:59:02 +0300 Subject: [PATCH 33/33] net: bridge: switchdev: don't notify FDB entries with "master dynamic" There is a structural problem in switchdev, where the flag bits in struct switchdev_notifier_fdb_info (added_by_user, is_local etc) only represent a simplified / denatured view of what's in struct net_bridge_fdb_entry :: flags (BR_FDB_ADDED_BY_USER, BR_FDB_LOCAL etc). Each time we want to pass more information about struct net_bridge_fdb_entry :: flags to struct switchdev_notifier_fdb_info (here, BR_FDB_STATIC), we find that FDB entries were already notified to switchdev with no regard to this flag, and thus, switchdev drivers had no indication whether the notified entries were static or not. For example, this command: ip link add br0 type bridge && ip link set swp0 master br0 bridge fdb add dev swp0 00:01:02:03:04:05 master dynamic has never worked as intended with switchdev. It causes a struct net_bridge_fdb_entry to be passed to br_switchdev_fdb_notify() which has a single flag set: BR_FDB_ADDED_BY_USER. This is further passed to the switchdev notifier chain, where interested drivers have no choice but to assume this is a static (does not age) and sticky (does not migrate) FDB entry. So currently, all drivers offload it to hardware as such, as can be seen below ("offload" is set). bridge fdb get 00:01:02:03:04:05 dev swp0 master 00:01:02:03:04:05 dev swp0 offload master br0 The software FDB entry expires $ageing_time centiseconds after the kernel last sees a packet with this MAC SA, and the bridge notifies its deletion as well, so it eventually disappears from hardware too. This is a problem, because it is actually desirable to start offloading "master dynamic" FDB entries correctly - they should expire $ageing_time centiseconds after the *hardware* port last sees a packet with this MAC SA - and this is how the current incorrect behavior was discovered. With an offloaded data plane, it can be expected that software only sees exception path packets, so an otherwise active dynamic FDB entry would be aged out by software sooner than it should. With the change in place, these FDB entries are no longer offloaded: bridge fdb get 00:01:02:03:04:05 dev swp0 master 00:01:02:03:04:05 dev swp0 master br0 and this also constitutes a better way (assuming a backport to stable kernels) for user space to determine whether the kernel has the capability of doing something sane with these or not. 
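To make the flag combinations discussed above concrete, here is a rough mapping as I understand it from the description in this patch (illustrative, not an exhaustive list of net_bridge_fdb_entry flags):

	/* What br_switchdev_fdb_notify() sees for the commands above:
	 *
	 *   bridge fdb add ... master dynamic -> BR_FDB_ADDED_BY_USER only
	 *                                        (skipped by the new check)
	 *   bridge fdb add ... master static  -> BR_FDB_ADDED_BY_USER | BR_FDB_STATIC
	 *                                        (still notified)
	 *   bridge fdb add ... extern_learn   -> BR_FDB_ADDED_BY_EXT_LEARN
	 *                                        (still notified, see below)
	 *   learned from the data path        -> no BR_FDB_ADDED_BY_USER
	 *                                        (still notified)
	 */
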
As opposed to "master dynamic" FDB entries, on the current behavior of which no one currently depends on (which can be deduced from the lack of kselftests), Ido Schimmel explains that entries with the "extern_learn" flag (BR_FDB_ADDED_BY_EXT_LEARN) should still be notified to switchdev, since the spectrum driver listens to them (and this is kind of okay, because although they are treated identically to "static", they are expected to not age, and to roam). Fixes: 6b26b51b1d13 ("net: bridge: Add support for notifying devices about FDB add/del") Link: https://lore.kernel.org/netdev/20230327115206.jk5q5l753aoelwus@skbuf/ Signed-off-by: Vladimir Oltean Reviewed-by: Jesse Brandeburg Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Link: https://lore.kernel.org/r/20230418155902.898627-1-vladimir.oltean@nxp.com Signed-off-by: Paolo Abeni --- net/bridge/br_switchdev.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index de18e9c1d7a7..ba95c4d74a60 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -148,6 +148,17 @@ br_switchdev_fdb_notify(struct net_bridge *br, if (test_bit(BR_FDB_LOCKED, &fdb->flags)) return; + /* Entries with these flags were created using ndm_state == NUD_REACHABLE, + * ndm_flags == NTF_MASTER( | NTF_STICKY), ext_flags == 0 by something + * equivalent to 'bridge fdb add ... master dynamic (sticky)'. + * Drivers don't know how to deal with these, so don't notify them to + * avoid confusing them. + */ + if (test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags) && + !test_bit(BR_FDB_STATIC, &fdb->flags) && + !test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) + return; + br_switchdev_fdb_populate(br, &item, fdb, NULL); switch (type) {