From 8222d5910dae08213b6d9d4bc9a7f8502855e624 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 9 Feb 2023 09:09:52 +0800 Subject: [PATCH 001/108] xfrm: Zero padding when dumping algos and encap When copying data to user-space we should ensure that only valid data is copied over. Padding in structures may be filled with random (possibly sensitve) data and should never be given directly to user-space. This patch fixes the copying of xfrm algorithms and the encap template in xfrm_user so that padding is zeroed. Reported-by: syzbot+fa5414772d5c445dac3c@syzkaller.appspotmail.com Reported-by: Hyunwoo Kim Signed-off-by: Herbert Xu Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 45 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 4 deletions(-) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index cf5172d4ce68..103af2b3e986 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1012,7 +1012,9 @@ static int copy_to_user_aead(struct xfrm_algo_aead *aead, struct sk_buff *skb) return -EMSGSIZE; ap = nla_data(nla); - memcpy(ap, aead, sizeof(*aead)); + strscpy_pad(ap->alg_name, aead->alg_name, sizeof(ap->alg_name)); + ap->alg_key_len = aead->alg_key_len; + ap->alg_icv_len = aead->alg_icv_len; if (redact_secret && aead->alg_key_len) memset(ap->alg_key, 0, (aead->alg_key_len + 7) / 8); @@ -1032,7 +1034,8 @@ static int copy_to_user_ealg(struct xfrm_algo *ealg, struct sk_buff *skb) return -EMSGSIZE; ap = nla_data(nla); - memcpy(ap, ealg, sizeof(*ealg)); + strscpy_pad(ap->alg_name, ealg->alg_name, sizeof(ap->alg_name)); + ap->alg_key_len = ealg->alg_key_len; if (redact_secret && ealg->alg_key_len) memset(ap->alg_key, 0, (ealg->alg_key_len + 7) / 8); @@ -1043,6 +1046,40 @@ static int copy_to_user_ealg(struct xfrm_algo *ealg, struct sk_buff *skb) return 0; } +static int copy_to_user_calg(struct xfrm_algo *calg, struct sk_buff *skb) +{ + struct nlattr *nla = nla_reserve(skb, XFRMA_ALG_COMP, sizeof(*calg)); + struct xfrm_algo *ap; + + if (!nla) + return -EMSGSIZE; + + ap = nla_data(nla); + strscpy_pad(ap->alg_name, calg->alg_name, sizeof(ap->alg_name)); + ap->alg_key_len = 0; + + return 0; +} + +static int copy_to_user_encap(struct xfrm_encap_tmpl *ep, struct sk_buff *skb) +{ + struct nlattr *nla = nla_reserve(skb, XFRMA_ENCAP, sizeof(*ep)); + struct xfrm_encap_tmpl *uep; + + if (!nla) + return -EMSGSIZE; + + uep = nla_data(nla); + memset(uep, 0, sizeof(*uep)); + + uep->encap_type = ep->encap_type; + uep->encap_sport = ep->encap_sport; + uep->encap_dport = ep->encap_dport; + uep->encap_oa = ep->encap_oa; + + return 0; +} + static int xfrm_smark_put(struct sk_buff *skb, struct xfrm_mark *m) { int ret = 0; @@ -1098,12 +1135,12 @@ static int copy_to_user_state_extra(struct xfrm_state *x, goto out; } if (x->calg) { - ret = nla_put(skb, XFRMA_ALG_COMP, sizeof(*(x->calg)), x->calg); + ret = copy_to_user_calg(x->calg, skb); if (ret) goto out; } if (x->encap) { - ret = nla_put(skb, XFRMA_ENCAP, sizeof(*x->encap), x->encap); + ret = copy_to_user_encap(x->encap, skb); if (ret) goto out; } From c276a706ea1f51cf9723ed8484feceaf961b8f89 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 21 Feb 2023 13:54:00 +0800 Subject: [PATCH 002/108] xfrm: Allow transport-mode states with AF_UNSPEC selector xfrm state selectors are matched against the inner-most flow which can be of any address family. Therefore middle states in nested configurations need to carry a wildcard selector in order to work at all. However, this is currently forbidden for transport-mode states. Fix this by removing the unnecessary check. Fixes: 13996378e658 ("[IPSEC]: Rename mode to outer_mode and add inner_mode") Reported-by: David George Signed-off-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 00afe831c71c..f238048bf786 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2815,11 +2815,6 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload, goto error; } - if (!(inner_mode->flags & XFRM_MODE_FLAG_TUNNEL)) { - NL_SET_ERR_MSG(extack, "Only tunnel modes can accommodate an AF_UNSPEC selector"); - goto error; - } - x->inner_mode = *inner_mode; if (x->props.family == AF_INET) From 068d82e75d537b444303b8c449a11e51ea659565 Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Tue, 7 Mar 2023 23:22:56 +0000 Subject: [PATCH 003/108] netfilter: nft_nat: correct length for loading protocol registers The values in the protocol registers are two bytes wide. However, when parsing the register loads, the code currently uses the larger 16-byte size of a `union nf_inet_addr`. Change it to use the (correct) size of a `union nf_conntrack_man_proto` instead. Fixes: d07db9884a5f ("netfilter: nf_tables: introduce nft_validate_register_load()") Signed-off-by: Jeremy Sowden Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_nat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 047999150390..5c29915ab028 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -226,7 +226,7 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, priv->flags |= NF_NAT_RANGE_MAP_IPS; } - plen = sizeof_field(struct nf_nat_range, min_addr.all); + plen = sizeof_field(struct nf_nat_range, min_proto.all); if (tb[NFTA_NAT_REG_PROTO_MIN]) { err = nft_parse_register_load(tb[NFTA_NAT_REG_PROTO_MIN], &priv->sreg_proto_min, plen); From ec2c5917eb858428b2083d1c74f445aabbe8316b Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Tue, 7 Mar 2023 23:22:57 +0000 Subject: [PATCH 004/108] netfilter: nft_masq: correct length for loading protocol registers The values in the protocol registers are two bytes wide. However, when parsing the register loads, the code currently uses the larger 16-byte size of a `union nf_inet_addr`. Change it to use the (correct) size of a `union nf_conntrack_man_proto` instead. Fixes: 8a6bf5da1aef ("netfilter: nft_masq: support port range") Signed-off-by: Jeremy Sowden Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_masq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c index e55e455275c4..9544c2f16998 100644 --- a/net/netfilter/nft_masq.c +++ b/net/netfilter/nft_masq.c @@ -43,7 +43,7 @@ static int nft_masq_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { - u32 plen = sizeof_field(struct nf_nat_range, min_addr.all); + u32 plen = sizeof_field(struct nf_nat_range, min_proto.all); struct nft_masq *priv = nft_expr_priv(expr); int err; From 1f617b6b4c7a3d5ea7a56abb83a4c27733b60c2f Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Tue, 7 Mar 2023 23:22:58 +0000 Subject: [PATCH 005/108] netfilter: nft_redir: correct length for loading protocol registers The values in the protocol registers are two bytes wide. However, when parsing the register loads, the code currently uses the larger 16-byte size of a `union nf_inet_addr`. Change it to use the (correct) size of a `union nf_conntrack_man_proto` instead. Fixes: d07db9884a5f ("netfilter: nf_tables: introduce nft_validate_register_load()") Signed-off-by: Jeremy Sowden Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_redir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c index 5f7739987559..dbc642f5d32a 100644 --- a/net/netfilter/nft_redir.c +++ b/net/netfilter/nft_redir.c @@ -48,7 +48,7 @@ static int nft_redir_init(const struct nft_ctx *ctx, unsigned int plen; int err; - plen = sizeof_field(struct nf_nat_range, min_addr.all); + plen = sizeof_field(struct nf_nat_range, min_proto.all); if (tb[NFTA_REDIR_REG_PROTO_MIN]) { err = nft_parse_register_load(tb[NFTA_REDIR_REG_PROTO_MIN], &priv->sreg_proto_min, plen); From 493924519b1fe3faab13ee621a43b0d0939abab1 Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Tue, 7 Mar 2023 23:22:59 +0000 Subject: [PATCH 006/108] netfilter: nft_redir: correct value of inet type `.maxattrs` `nft_redir_inet_type.maxattrs` was being set, presumably because of a cut-and-paste error, to `NFTA_MASQ_MAX`, instead of `NFTA_REDIR_MAX`. Fixes: 63ce3940f3ab ("netfilter: nft_redir: add inet support") Signed-off-by: Jeremy Sowden Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_redir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c index dbc642f5d32a..67cec56bc84a 100644 --- a/net/netfilter/nft_redir.c +++ b/net/netfilter/nft_redir.c @@ -236,7 +236,7 @@ static struct nft_expr_type nft_redir_inet_type __read_mostly = { .name = "redir", .ops = &nft_redir_inet_ops, .policy = nft_redir_policy, - .maxattr = NFTA_MASQ_MAX, + .maxattr = NFTA_REDIR_MAX, .owner = THIS_MODULE, }; From bced3f7db95ff2e6ca29dc4d1c9751ab5e736a09 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 8 Mar 2023 11:07:45 -0800 Subject: [PATCH 007/108] tcp: tcp_make_synack() can be called from process context tcp_rtx_synack() now could be called in process context as explained in 0a375c822497 ("tcp: tcp_rtx_synack() can be called from process context"). tcp_rtx_synack() might call tcp_make_synack(), which will touch per-CPU variables with preemption enabled. This causes the following BUG: BUG: using __this_cpu_add() in preemptible [00000000] code: ThriftIO1/5464 caller is tcp_make_synack+0x841/0xac0 Call Trace: dump_stack_lvl+0x10d/0x1a0 check_preemption_disabled+0x104/0x110 tcp_make_synack+0x841/0xac0 tcp_v6_send_synack+0x5c/0x450 tcp_rtx_synack+0xeb/0x1f0 inet_rtx_syn_ack+0x34/0x60 tcp_check_req+0x3af/0x9e0 tcp_rcv_state_process+0x59b/0x2030 tcp_v6_do_rcv+0x5f5/0x700 release_sock+0x3a/0xf0 tcp_sendmsg+0x33/0x40 ____sys_sendmsg+0x2f2/0x490 __sys_sendmsg+0x184/0x230 do_syscall_64+0x3d/0x90 Avoid calling __TCP_INC_STATS() with will touch per-cpu variables. Use TCP_INC_STATS() which is safe to be called from context switch. Fixes: 8336886f786f ("tcp: TCP Fast Open Server - support TFO listeners") Signed-off-by: Breno Leitao Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20230308190745.780221-1-leitao@debian.org Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 71d01cf3c13e..ba839e441450 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3605,7 +3605,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, th->window = htons(min(req->rsk_rcv_wnd, 65535U)); tcp_options_write(th, NULL, &opts); th->doff = (tcp_header_size >> 2); - __TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); #ifdef CONFIG_TCP_MD5SIG /* Okay, we have all we need - do the md5 hash if needed */ From f624bb6fad23df3270580b4fcef415c6e7bf7705 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 1 Mar 2023 12:09:29 +0200 Subject: [PATCH 008/108] wifi: nl80211: fix NULL-ptr deref in offchan check If, e.g. in AP mode, the link was already created by userspace but not activated yet, it has a chandef but the chandef isn't valid and has no channel. Check for this and ignore this link. Fixes: 7b0a0e3c3a88 ("wifi: cfg80211: do some rework towards MLO link APIs") Signed-off-by: Johannes Berg Signed-off-by: Gregory Greenman Link: https://lore.kernel.org/r/20230301115906.71bd4803fbb9.Iee39c0f6c2d3a59a8227674dc55d52e38b1090cf@changeid Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 112b4bb009c8..51f6582eff7b 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -8901,7 +8901,7 @@ static bool cfg80211_off_channel_oper_allowed(struct wireless_dev *wdev, struct cfg80211_chan_def *chandef; chandef = wdev_chandef(wdev, link_id); - if (!chandef) + if (!chandef || !chandef->chan) continue; /* From b27f07c50a73e34eefb6b1030b235192b7ded850 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 24 Feb 2023 13:36:57 +0100 Subject: [PATCH 009/108] wifi: nl80211: fix puncturing bitmap policy This was meant to be a u32, and while applying the patch I tried to use policy validation for it. However, not only did I copy/paste it to u8 instead of u32, but also used the policy range erroneously. Fix both of these issues. Fixes: d7c1a9a0ed18 ("wifi: nl80211: validate and configure puncturing bitmap") Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 51f6582eff7b..6869781283e2 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -462,6 +462,11 @@ nl80211_sta_wme_policy[NL80211_STA_WME_MAX + 1] = { [NL80211_STA_WME_MAX_SP] = { .type = NLA_U8 }, }; +static struct netlink_range_validation nl80211_punct_bitmap_range = { + .min = 0, + .max = 0xffff, +}; + static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [0] = { .strict_start_type = NL80211_ATTR_HE_OBSS_PD }, [NL80211_ATTR_WIPHY] = { .type = NLA_U32 }, @@ -805,7 +810,8 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MLD_ADDR] = NLA_POLICY_EXACT_LEN(ETH_ALEN), [NL80211_ATTR_MLO_SUPPORT] = { .type = NLA_FLAG }, [NL80211_ATTR_MAX_NUM_AKM_SUITES] = { .type = NLA_REJECT }, - [NL80211_ATTR_PUNCT_BITMAP] = NLA_POLICY_RANGE(NLA_U8, 0, 0xffff), + [NL80211_ATTR_PUNCT_BITMAP] = + NLA_POLICY_FULL_RANGE(NLA_U32, &nl80211_punct_bitmap_range), }; /* policy for the key attributes */ From ce04abc3fcc62cd5640af981ebfd7c4dc3bded28 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 24 Feb 2023 10:52:19 +0100 Subject: [PATCH 010/108] wifi: mac80211: check basic rates validity When userspace sets basic rates, it might send us some rates list that's empty or consists of invalid values only. We're currently ignoring invalid values and then may end up with a rates bitmap that's empty, which later results in a warning. Reject the call if there were no valid rates. Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 8eb342300868..d3d861911ed6 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2611,6 +2611,17 @@ static int ieee80211_change_bss(struct wiphy *wiphy, if (!sband) return -EINVAL; + if (params->basic_rates) { + if (!ieee80211_parse_bitrates(link->conf->chandef.width, + wiphy->bands[sband->band], + params->basic_rates, + params->basic_rates_len, + &link->conf->basic_rates)) + return -EINVAL; + changed |= BSS_CHANGED_BASIC_RATES; + ieee80211_check_rate_mask(link); + } + if (params->use_cts_prot >= 0) { link->conf->use_cts_prot = params->use_cts_prot; changed |= BSS_CHANGED_ERP_CTS_PROT; @@ -2632,16 +2643,6 @@ static int ieee80211_change_bss(struct wiphy *wiphy, changed |= BSS_CHANGED_ERP_SLOT; } - if (params->basic_rates) { - ieee80211_parse_bitrates(link->conf->chandef.width, - wiphy->bands[sband->band], - params->basic_rates, - params->basic_rates_len, - &link->conf->basic_rates); - changed |= BSS_CHANGED_BASIC_RATES; - ieee80211_check_rate_mask(link); - } - if (params->ap_isolate >= 0) { if (params->ap_isolate) sdata->flags |= IEEE80211_SDATA_DONT_BRIDGE_PACKETS; From 96c069508377547f913e7265a80fffe9355de592 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 1 Mar 2023 12:09:33 +0200 Subject: [PATCH 011/108] wifi: cfg80211: fix MLO connection ownership When disconnecting from an MLO connection we need the AP MLD address, not an arbitrary BSSID. Fix the code to do that. Fixes: 9ecff10e82a5 ("wifi: nl80211: refactor BSS lookup in nl80211_associate()") Signed-off-by: Johannes Berg Signed-off-by: Gregory Greenman Link: https://lore.kernel.org/r/20230301115906.4c1b3b18980e.I008f070c7f3b8e8bde9278101ef9e40706a82902@changeid Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 6869781283e2..4f63059efd81 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -10799,8 +10799,7 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, static struct cfg80211_bss *nl80211_assoc_bss(struct cfg80211_registered_device *rdev, const u8 *ssid, int ssid_len, - struct nlattr **attrs, - const u8 **bssid_out) + struct nlattr **attrs) { struct ieee80211_channel *chan; struct cfg80211_bss *bss; @@ -10827,7 +10826,6 @@ static struct cfg80211_bss *nl80211_assoc_bss(struct cfg80211_registered_device if (!bss) return ERR_PTR(-ENOENT); - *bssid_out = bssid; return bss; } @@ -10837,7 +10835,7 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) struct net_device *dev = info->user_ptr[1]; struct cfg80211_assoc_request req = {}; struct nlattr **attrs = NULL; - const u8 *bssid, *ssid; + const u8 *ap_addr, *ssid; unsigned int link_id; int err, ssid_len; @@ -10974,6 +10972,7 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) return -EINVAL; req.ap_mld_addr = nla_data(info->attrs[NL80211_ATTR_MLD_ADDR]); + ap_addr = req.ap_mld_addr; attrs = kzalloc(attrsize, GFP_KERNEL); if (!attrs) @@ -10999,8 +10998,7 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) goto free; } req.links[link_id].bss = - nl80211_assoc_bss(rdev, ssid, ssid_len, attrs, - &bssid); + nl80211_assoc_bss(rdev, ssid, ssid_len, attrs); if (IS_ERR(req.links[link_id].bss)) { err = PTR_ERR(req.links[link_id].bss); req.links[link_id].bss = NULL; @@ -11051,10 +11049,10 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) if (req.link_id >= 0) return -EINVAL; - req.bss = nl80211_assoc_bss(rdev, ssid, ssid_len, info->attrs, - &bssid); + req.bss = nl80211_assoc_bss(rdev, ssid, ssid_len, info->attrs); if (IS_ERR(req.bss)) return PTR_ERR(req.bss); + ap_addr = req.bss->bssid; } err = nl80211_crypto_settings(rdev, info, &req.crypto, 1); @@ -11067,7 +11065,7 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) dev->ieee80211_ptr->conn_owner_nlportid = info->snd_portid; memcpy(dev->ieee80211_ptr->disconnect_bssid, - bssid, ETH_ALEN); + ap_addr, ETH_ALEN); } wdev_unlock(dev->ieee80211_ptr); From 484b7059796e3bc1cb527caa61dfc60da649b4f6 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Thu, 9 Mar 2023 19:50:50 +0300 Subject: [PATCH 012/108] nfc: pn533: initialize struct pn533_out_arg properly struct pn533_out_arg used as a temporary context for out_urb is not initialized properly. Its uninitialized 'phy' field can be dereferenced in error cases inside pn533_out_complete() callback function. It causes the following failure: general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 6.2.0-rc3-next-20230110-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/26/2022 RIP: 0010:pn533_out_complete.cold+0x15/0x44 drivers/nfc/pn533/usb.c:441 Call Trace: __usb_hcd_giveback_urb+0x2b6/0x5c0 drivers/usb/core/hcd.c:1671 usb_hcd_giveback_urb+0x384/0x430 drivers/usb/core/hcd.c:1754 dummy_timer+0x1203/0x32d0 drivers/usb/gadget/udc/dummy_hcd.c:1988 call_timer_fn+0x1da/0x800 kernel/time/timer.c:1700 expire_timers+0x234/0x330 kernel/time/timer.c:1751 __run_timers kernel/time/timer.c:2022 [inline] __run_timers kernel/time/timer.c:1995 [inline] run_timer_softirq+0x326/0x910 kernel/time/timer.c:2035 __do_softirq+0x1fb/0xaf6 kernel/softirq.c:571 invoke_softirq kernel/softirq.c:445 [inline] __irq_exit_rcu+0x123/0x180 kernel/softirq.c:650 irq_exit_rcu+0x9/0x20 kernel/softirq.c:662 sysvec_apic_timer_interrupt+0x97/0xc0 arch/x86/kernel/apic/apic.c:1107 Initialize the field with the pn533_usb_phy currently used. Found by Linux Verification Center (linuxtesting.org) with Syzkaller. Fixes: 9dab880d675b ("nfc: pn533: Wait for out_urb's completion in pn533_usb_send_frame()") Reported-by: syzbot+1e608ba4217c96d1952f@syzkaller.appspotmail.com Signed-off-by: Fedor Pchelkin Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230309165050.207390-1-pchelkin@ispras.ru Signed-off-by: Jakub Kicinski --- drivers/nfc/pn533/usb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nfc/pn533/usb.c b/drivers/nfc/pn533/usb.c index ed9c5e2cf3ad..a187f0e0b0f7 100644 --- a/drivers/nfc/pn533/usb.c +++ b/drivers/nfc/pn533/usb.c @@ -175,6 +175,7 @@ static int pn533_usb_send_frame(struct pn533 *dev, print_hex_dump_debug("PN533 TX: ", DUMP_PREFIX_NONE, 16, 1, out->data, out->len, false); + arg.phy = phy; init_completion(&arg.done); cntx = phy->out_urb->context; phy->out_urb->context = &arg; From 25074a44ac4e5dae5b4a25dcb9bbfcbd00f15ae2 Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Wed, 8 Mar 2023 10:49:33 +0800 Subject: [PATCH 013/108] virtio_net: reorder some funcs The purpose of this is to facilitate the subsequent addition of new functions without introducing a separate declaration. Signed-off-by: Xuan Zhuo Acked-by: Michael S. Tsirkin Signed-off-by: Jakub Kicinski --- drivers/net/virtio_net.c | 92 ++++++++++++++++++++-------------------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index fb5e68ed3ec2..8b31a04052f2 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -545,6 +545,52 @@ ok: return skb; } +static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi) +{ + unsigned int len; + unsigned int packets = 0; + unsigned int bytes = 0; + void *ptr; + + while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) { + if (likely(!is_xdp_frame(ptr))) { + struct sk_buff *skb = ptr; + + pr_debug("Sent skb %p\n", skb); + + bytes += skb->len; + napi_consume_skb(skb, in_napi); + } else { + struct xdp_frame *frame = ptr_to_xdp(ptr); + + bytes += xdp_get_frame_len(frame); + xdp_return_frame(frame); + } + packets++; + } + + /* Avoid overhead when no packets have been processed + * happens when called speculatively from start_xmit. + */ + if (!packets) + return; + + u64_stats_update_begin(&sq->stats.syncp); + sq->stats.bytes += bytes; + sq->stats.packets += packets; + u64_stats_update_end(&sq->stats.syncp); +} + +static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q) +{ + if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs)) + return false; + else if (q < vi->curr_queue_pairs) + return true; + else + return false; +} + static int __virtnet_xdp_xmit_one(struct virtnet_info *vi, struct send_queue *sq, struct xdp_frame *xdpf) @@ -1714,52 +1760,6 @@ static int virtnet_receive(struct receive_queue *rq, int budget, return stats.packets; } -static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi) -{ - unsigned int len; - unsigned int packets = 0; - unsigned int bytes = 0; - void *ptr; - - while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) { - if (likely(!is_xdp_frame(ptr))) { - struct sk_buff *skb = ptr; - - pr_debug("Sent skb %p\n", skb); - - bytes += skb->len; - napi_consume_skb(skb, in_napi); - } else { - struct xdp_frame *frame = ptr_to_xdp(ptr); - - bytes += xdp_get_frame_len(frame); - xdp_return_frame(frame); - } - packets++; - } - - /* Avoid overhead when no packets have been processed - * happens when called speculatively from start_xmit. - */ - if (!packets) - return; - - u64_stats_update_begin(&sq->stats.syncp); - sq->stats.bytes += bytes; - sq->stats.packets += packets; - u64_stats_update_end(&sq->stats.syncp); -} - -static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q) -{ - if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs)) - return false; - else if (q < vi->curr_queue_pairs) - return true; - else - return false; -} - static void virtnet_poll_cleantx(struct receive_queue *rq) { struct virtnet_info *vi = rq->vq->vdev->priv; From b8ef4809bc7faa22e63de921ef56de21ed191af0 Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Wed, 8 Mar 2023 10:49:34 +0800 Subject: [PATCH 014/108] virtio_net: separate the logic of checking whether sq is full Separate the logic of checking whether sq is full. The subsequent patch will reuse this func. Signed-off-by: Xuan Zhuo Reviewed-by: Alexander Duyck Acked-by: Michael S. Tsirkin Acked-by: Jason Wang Signed-off-by: Jakub Kicinski --- drivers/net/virtio_net.c | 60 ++++++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 8b31a04052f2..46bbddaadb0d 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -591,6 +591,41 @@ static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q) return false; } +static void check_sq_full_and_disable(struct virtnet_info *vi, + struct net_device *dev, + struct send_queue *sq) +{ + bool use_napi = sq->napi.weight; + int qnum; + + qnum = sq - vi->sq; + + /* If running out of space, stop queue to avoid getting packets that we + * are then unable to transmit. + * An alternative would be to force queuing layer to requeue the skb by + * returning NETDEV_TX_BUSY. However, NETDEV_TX_BUSY should not be + * returned in a normal path of operation: it means that driver is not + * maintaining the TX queue stop/start state properly, and causes + * the stack to do a non-trivial amount of useless work. + * Since most packets only take 1 or 2 ring slots, stopping the queue + * early means 16 slots are typically wasted. + */ + if (sq->vq->num_free < 2+MAX_SKB_FRAGS) { + netif_stop_subqueue(dev, qnum); + if (use_napi) { + if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) + virtqueue_napi_schedule(&sq->napi, sq->vq); + } else if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) { + /* More just got used, free them then recheck. */ + free_old_xmit_skbs(sq, false); + if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) { + netif_start_subqueue(dev, qnum); + virtqueue_disable_cb(sq->vq); + } + } + } +} + static int __virtnet_xdp_xmit_one(struct virtnet_info *vi, struct send_queue *sq, struct xdp_frame *xdpf) @@ -1989,30 +2024,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) nf_reset_ct(skb); } - /* If running out of space, stop queue to avoid getting packets that we - * are then unable to transmit. - * An alternative would be to force queuing layer to requeue the skb by - * returning NETDEV_TX_BUSY. However, NETDEV_TX_BUSY should not be - * returned in a normal path of operation: it means that driver is not - * maintaining the TX queue stop/start state properly, and causes - * the stack to do a non-trivial amount of useless work. - * Since most packets only take 1 or 2 ring slots, stopping the queue - * early means 16 slots are typically wasted. - */ - if (sq->vq->num_free < 2+MAX_SKB_FRAGS) { - netif_stop_subqueue(dev, qnum); - if (use_napi) { - if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) - virtqueue_napi_schedule(&sq->napi, sq->vq); - } else if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) { - /* More just got used, free them then recheck. */ - free_old_xmit_skbs(sq, false); - if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) { - netif_start_subqueue(dev, qnum); - virtqueue_disable_cb(sq->vq); - } - } - } + check_sq_full_and_disable(vi, dev, sq); if (kick || netif_xmit_stopped(txq)) { if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) { From cd1c604aa1d8c641f5edcb58b76352d4eba06ec1 Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Wed, 8 Mar 2023 10:49:35 +0800 Subject: [PATCH 015/108] virtio_net: add checking sq is full inside xdp xmit If the queue of xdp xmit is not an independent queue, then when the xdp xmit used all the desc, the xmit from the __dev_queue_xmit() may encounter the following error. net ens4: Unexpected TXQ (0) queue failure: -28 This patch adds a check whether sq is full in xdp xmit. Fixes: 56434a01b12e ("virtio_net: add XDP_TX support") Reported-by: Yichun Zhang Signed-off-by: Xuan Zhuo Reviewed-by: Alexander Duyck Acked-by: Michael S. Tsirkin Acked-by: Jason Wang Signed-off-by: Jakub Kicinski --- drivers/net/virtio_net.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 46bbddaadb0d..1a309cfb4976 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -767,6 +767,9 @@ static int virtnet_xdp_xmit(struct net_device *dev, } ret = nxmit; + if (!is_xdp_raw_buffer_queue(vi, sq - vi->sq)) + check_sq_full_and_disable(vi, dev, sq); + if (flags & XDP_XMIT_FLUSH) { if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) kicks = 1; From 71582371a5ee09272b4b4b0a07fa6eb78c9d2f90 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 9 Mar 2023 12:49:11 +0100 Subject: [PATCH 016/108] MAINTAINERS: make my email address consistent Use jiri@resnulli.us in all MAINTAINERS entries and fixup .mailmap so all other addresses point to that one. Signed-off-by: Jiri Pirko Link: https://lore.kernel.org/r/20230309114911.923460-1-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- .mailmap | 3 +++ MAINTAINERS | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.mailmap b/.mailmap index 66754ca6656d..438d79c27717 100644 --- a/.mailmap +++ b/.mailmap @@ -210,6 +210,9 @@ Jens Axboe Jens Osterkamp Jernej Skrabec Jessica Zhang +Jiri Pirko +Jiri Pirko +Jiri Pirko Jiri Slaby Jiri Slaby Jiri Slaby diff --git a/MAINTAINERS b/MAINTAINERS index 8d5bc223f305..d86c7807aa20 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5971,7 +5971,7 @@ F: include/linux/dm-*.h F: include/uapi/linux/dm-*.h DEVLINK -M: Jiri Pirko +M: Jiri Pirko L: netdev@vger.kernel.org S: Supported F: Documentation/networking/devlink @@ -15079,7 +15079,7 @@ F: Documentation/hwmon/nzxt-smart2.rst F: drivers/hwmon/nzxt-smart2.c OBJAGG -M: Jiri Pirko +M: Jiri Pirko L: netdev@vger.kernel.org S: Supported F: include/linux/objagg.h @@ -15853,7 +15853,7 @@ F: drivers/video/logo/logo_parisc* F: include/linux/hp_sdc.h PARMAN -M: Jiri Pirko +M: Jiri Pirko L: netdev@vger.kernel.org S: Supported F: include/linux/parman.h From 8ba572052a4b8fe5b205854d27e54e3486049b71 Mon Sep 17 00:00:00 2001 From: "Radu Pirea (OSS)" Date: Thu, 9 Mar 2023 12:01:11 +0200 Subject: [PATCH 017/108] net: phy: nxp-c45-tja11xx: fix MII_BASIC_CONFIG_REV bit According to the TJA1103 user manual, the bit for the reversed role in MII or RMII modes is bit 4. Cc: # 5.15+ Fixes: b050f2f15e04 ("phy: nxp-c45: add driver for tja1103") Signed-off-by: Radu Pirea (OSS) Link: https://lore.kernel.org/r/20230309100111.1246214-1-radu-nicolae.pirea@oss.nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/nxp-c45-tja11xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/nxp-c45-tja11xx.c b/drivers/net/phy/nxp-c45-tja11xx.c index 047c581457e3..5813b07242ce 100644 --- a/drivers/net/phy/nxp-c45-tja11xx.c +++ b/drivers/net/phy/nxp-c45-tja11xx.c @@ -79,7 +79,7 @@ #define SGMII_ABILITY BIT(0) #define VEND1_MII_BASIC_CONFIG 0xAFC6 -#define MII_BASIC_CONFIG_REV BIT(8) +#define MII_BASIC_CONFIG_REV BIT(4) #define MII_BASIC_CONFIG_SGMII 0x9 #define MII_BASIC_CONFIG_RGMII 0x7 #define MII_BASIC_CONFIG_RMII 0x5 From 59a0b022aa249e3f5735d93de0849341722c4754 Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Thu, 9 Mar 2023 10:03:36 +0800 Subject: [PATCH 018/108] ipvlan: Make skb->skb_iif track skb->dev for l3s mode For l3s mode, skb->dev is set to ipvlan interface in ipvlan_nf_input(): skb->dev = addr->master->dev but, skb->skb_iif remain unchanged, this will cause socket lookup failed if a target socket is bound to a interface, like the following example: ip link add ipvlan0 link eth0 type ipvlan mode l3s ip addr add dev ipvlan0 192.168.124.111/24 ip link set ipvlan0 up ping -c 1 -I ipvlan0 8.8.8.8 100% packet loss This is because there is no match sk in __raw_v4_lookup() as sk->sk_bound_dev_if != dif(skb->skb_iif). Fix this by make skb->skb_iif track skb->dev in ipvlan_nf_input(). Fixes: c675e06a98a4 ("ipvlan: decouple l3s mode dependencies from other modes") Signed-off-by: Jianguo Wu Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/29865b1f-6db7-c07a-de89-949d3721ea30@163.com Signed-off-by: Jakub Kicinski --- drivers/net/ipvlan/ipvlan_l3s.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ipvlan/ipvlan_l3s.c b/drivers/net/ipvlan/ipvlan_l3s.c index 943d26cbf39f..71712ea25403 100644 --- a/drivers/net/ipvlan/ipvlan_l3s.c +++ b/drivers/net/ipvlan/ipvlan_l3s.c @@ -101,6 +101,7 @@ static unsigned int ipvlan_nf_input(void *priv, struct sk_buff *skb, goto out; skb->dev = addr->master->dev; + skb->skb_iif = skb->dev->ifindex; len = skb->len + ETH_HLEN; ipvlan_count_rx(addr->master, len, true, false); out: From 7e4f8a0c495413a50413e8c9f1032ce1bc633bae Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Thu, 9 Mar 2023 10:45:09 -0800 Subject: [PATCH 019/108] i40e: Fix kernel crash during reboot when adapter is in recovery mode If the driver detects during probe that firmware is in recovery mode then i40e_init_recovery_mode() is called and the rest of probe function is skipped including pci_set_drvdata(). Subsequent i40e_shutdown() called during shutdown/reboot dereferences NULL pointer as pci_get_drvdata() returns NULL. To fix call pci_set_drvdata() also during entering to recovery mode. Reproducer: 1) Lets have i40e NIC with firmware in recovery mode 2) Run reboot Result: [ 139.084698] i40e: Intel(R) Ethernet Connection XL710 Network Driver [ 139.090959] i40e: Copyright (c) 2013 - 2019 Intel Corporation. [ 139.108438] i40e 0000:02:00.0: Firmware recovery mode detected. Limiting functionality. [ 139.116439] i40e 0000:02:00.0: Refer to the Intel(R) Ethernet Adapters and Devices User Guide for details on firmware recovery mode. [ 139.129499] i40e 0000:02:00.0: fw 8.3.64775 api 1.13 nvm 8.30 0x8000b78d 1.3106.0 [8086:1583] [15d9:084a] [ 139.215932] i40e 0000:02:00.0 enp2s0f0: renamed from eth0 [ 139.223292] i40e 0000:02:00.1: Firmware recovery mode detected. Limiting functionality. [ 139.231292] i40e 0000:02:00.1: Refer to the Intel(R) Ethernet Adapters and Devices User Guide for details on firmware recovery mode. [ 139.244406] i40e 0000:02:00.1: fw 8.3.64775 api 1.13 nvm 8.30 0x8000b78d 1.3106.0 [8086:1583] [15d9:084a] [ 139.329209] i40e 0000:02:00.1 enp2s0f1: renamed from eth0 ... [ 156.311376] BUG: kernel NULL pointer dereference, address: 00000000000006c2 [ 156.318330] #PF: supervisor write access in kernel mode [ 156.323546] #PF: error_code(0x0002) - not-present page [ 156.328679] PGD 0 P4D 0 [ 156.331210] Oops: 0002 [#1] PREEMPT SMP NOPTI [ 156.335567] CPU: 26 PID: 15119 Comm: reboot Tainted: G E 6.2.0+ #1 [ 156.343126] Hardware name: Abacus electric, s.r.o. - servis@abacus.cz Super Server/H12SSW-iN, BIOS 2.4 04/13/2022 [ 156.353369] RIP: 0010:i40e_shutdown+0x15/0x130 [i40e] [ 156.358430] Code: c1 fc ff ff 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 0f 1f 44 00 00 55 48 89 fd 53 48 8b 9f 48 01 00 00 80 8b c2 06 00 00 04 f0 80 8b c0 06 00 00 08 48 8d bb 08 08 00 [ 156.377168] RSP: 0018:ffffb223c8447d90 EFLAGS: 00010282 [ 156.382384] RAX: ffffffffc073ee70 RBX: 0000000000000000 RCX: 0000000000000001 [ 156.389510] RDX: 0000000080000001 RSI: 0000000000000246 RDI: ffff95db49988000 [ 156.396634] RBP: ffff95db49988000 R08: ffffffffffffffff R09: ffffffff8bd17d40 [ 156.403759] R10: 0000000000000001 R11: ffffffff8a5e3d28 R12: ffff95db49988000 [ 156.410882] R13: ffffffff89a6fe17 R14: ffff95db49988150 R15: 0000000000000000 [ 156.418007] FS: 00007fe7c0cc3980(0000) GS:ffff95ea8ee80000(0000) knlGS:0000000000000000 [ 156.426083] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 156.431819] CR2: 00000000000006c2 CR3: 00000003092fc005 CR4: 0000000000770ee0 [ 156.438944] PKRU: 55555554 [ 156.441647] Call Trace: [ 156.444096] [ 156.446199] pci_device_shutdown+0x38/0x60 [ 156.450297] device_shutdown+0x163/0x210 [ 156.454215] kernel_restart+0x12/0x70 [ 156.457872] __do_sys_reboot+0x1ab/0x230 [ 156.461789] ? vfs_writev+0xa6/0x1a0 [ 156.465362] ? __pfx_file_free_rcu+0x10/0x10 [ 156.469635] ? __call_rcu_common.constprop.85+0x109/0x5a0 [ 156.475034] do_syscall_64+0x3e/0x90 [ 156.478611] entry_SYSCALL_64_after_hwframe+0x72/0xdc [ 156.483658] RIP: 0033:0x7fe7bff37ab7 Fixes: 4ff0ee1af016 ("i40e: Introduce recovery mode support") Signed-off-by: Ivan Vecera Tested-by: Arpana Arland (A Contingent worker at Intel) Signed-off-by: Tony Nguyen Link: https://lore.kernel.org/r/20230309184509.984639-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 467001db5070..228cd502bb48 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -15525,6 +15525,7 @@ static int i40e_init_recovery_mode(struct i40e_pf *pf, struct i40e_hw *hw) int err; int v_idx; + pci_set_drvdata(pf->pdev, pf); pci_save_state(pf->pdev); /* set up periodic task facility */ From 8f76a4f80fba8096a611b6b60c40a0f4cab3ddfb Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 9 Mar 2023 13:25:25 +0100 Subject: [PATCH 020/108] tools: ynl: fix render-max for flags definition Properly manage render-max property for flags definition type introducing mask value and setting it to (last_element << 1) - 1 instead of adding max value set to last_element + 1 Fixes: be5bea1cc0bf ("net: add basic C code generators for Netlink") Signed-off-by: Lorenzo Bianconi Signed-off-by: Jakub Kicinski --- tools/net/ynl/ynl-gen-c.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py index 1bcc5354d800..d47376f19de7 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/ynl-gen-c.py @@ -1931,9 +1931,14 @@ def render_uapi(family, cw): if const.get('render-max', False): cw.nl() - max_name = c_upper(name_pfx + 'max') - cw.p('__' + max_name + ',') - cw.p(max_name + ' = (__' + max_name + ' - 1)') + if const['type'] == 'flags': + max_name = c_upper(name_pfx + 'mask') + max_val = f' = {enum.get_mask()},' + cw.p(max_name + max_val) + else: + max_name = c_upper(name_pfx + 'max') + cw.p('__' + max_name + ',') + cw.p(max_name + ' = (__' + max_name + ' - 1)') cw.block_end(line=';') cw.nl() elif const['type'] == 'const': From bf51d27704c963ea52c0843096e23c9f404b13af Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 9 Mar 2023 13:25:26 +0100 Subject: [PATCH 021/108] tools: ynl: fix get_mask utility routine Fix get_mask utility routine in order to take into account possible gaps in the elements list. Fixes: be5bea1cc0bf ("net: add basic C code generators for Netlink") Signed-off-by: Lorenzo Bianconi Signed-off-by: Jakub Kicinski --- tools/net/ynl/lib/nlspec.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tools/net/ynl/lib/nlspec.py b/tools/net/ynl/lib/nlspec.py index a34d088f6743..960a356e8225 100644 --- a/tools/net/ynl/lib/nlspec.py +++ b/tools/net/ynl/lib/nlspec.py @@ -138,10 +138,8 @@ class SpecEnumSet(SpecElement): def get_mask(self): mask = 0 - idx = self.yaml.get('value-start', 0) - for _ in self.entries.values(): - mask |= 1 << idx - idx += 1 + for e in self.entries.values(): + mask += e.user_value() return mask From f85949f98206b3b11d92d695cea4efda6a81f00e Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 9 Mar 2023 13:25:27 +0100 Subject: [PATCH 022/108] xdp: add xdp_set_features_flag utility routine Introduce xdp_set_features_flag utility routine in order to update dynamically xdp_features according to the dynamic hw configuration via ethtool (e.g. changing number of hw rx/tx queues). Add xdp_clear_features_flag() in order to clear all xdp_feature flag. Reviewed-by: Shay Agroskin Signed-off-by: Lorenzo Bianconi Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/netdev.yaml | 1 + include/net/xdp.h | 11 +++++++++++ include/uapi/linux/netdev.h | 2 ++ net/core/xdp.c | 26 ++++++++++++++++++------- tools/include/uapi/linux/netdev.h | 2 ++ 5 files changed, 35 insertions(+), 7 deletions(-) diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index 24de747b5344..753e5914a8b7 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -9,6 +9,7 @@ definitions: - type: flags name: xdp-act + render-max: true entries: - name: basic diff --git a/include/net/xdp.h b/include/net/xdp.h index d517bfac937b..41c57b8b1671 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -428,12 +428,18 @@ MAX_XDP_METADATA_KFUNC, #ifdef CONFIG_NET u32 bpf_xdp_metadata_kfunc_id(int id); bool bpf_dev_bound_kfunc_id(u32 btf_id); +void xdp_set_features_flag(struct net_device *dev, xdp_features_t val); void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg); void xdp_features_clear_redirect_target(struct net_device *dev); #else static inline u32 bpf_xdp_metadata_kfunc_id(int id) { return 0; } static inline bool bpf_dev_bound_kfunc_id(u32 btf_id) { return false; } +static inline void +xdp_set_features_flag(struct net_device *dev, xdp_features_t val) +{ +} + static inline void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg) { @@ -445,4 +451,9 @@ xdp_features_clear_redirect_target(struct net_device *dev) } #endif +static inline void xdp_clear_features_flag(struct net_device *dev) +{ + xdp_set_features_flag(dev, 0); +} + #endif /* __LINUX_NET_XDP_H__ */ diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 8c4e3e536c04..ed134fbdfd32 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -33,6 +33,8 @@ enum netdev_xdp_act { NETDEV_XDP_ACT_HW_OFFLOAD = 16, NETDEV_XDP_ACT_RX_SG = 32, NETDEV_XDP_ACT_NDO_XMIT_SG = 64, + + NETDEV_XDP_ACT_MASK = 127, }; enum { diff --git a/net/core/xdp.c b/net/core/xdp.c index 8c92fc553317..87e654b7d06c 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -774,20 +774,32 @@ static int __init xdp_metadata_init(void) } late_initcall(xdp_metadata_init); +void xdp_set_features_flag(struct net_device *dev, xdp_features_t val) +{ + val &= NETDEV_XDP_ACT_MASK; + if (dev->xdp_features == val) + return; + + dev->xdp_features = val; + call_netdevice_notifiers(NETDEV_XDP_FEAT_CHANGE, dev); +} +EXPORT_SYMBOL_GPL(xdp_set_features_flag); + void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg) { - dev->xdp_features |= NETDEV_XDP_ACT_NDO_XMIT; - if (support_sg) - dev->xdp_features |= NETDEV_XDP_ACT_NDO_XMIT_SG; + xdp_features_t val = (dev->xdp_features | NETDEV_XDP_ACT_NDO_XMIT); - call_netdevice_notifiers(NETDEV_XDP_FEAT_CHANGE, dev); + if (support_sg) + val |= NETDEV_XDP_ACT_NDO_XMIT_SG; + xdp_set_features_flag(dev, val); } EXPORT_SYMBOL_GPL(xdp_features_set_redirect_target); void xdp_features_clear_redirect_target(struct net_device *dev) { - dev->xdp_features &= ~(NETDEV_XDP_ACT_NDO_XMIT | - NETDEV_XDP_ACT_NDO_XMIT_SG); - call_netdevice_notifiers(NETDEV_XDP_FEAT_CHANGE, dev); + xdp_features_t val = dev->xdp_features; + + val &= ~(NETDEV_XDP_ACT_NDO_XMIT | NETDEV_XDP_ACT_NDO_XMIT_SG); + xdp_set_features_flag(dev, val); } EXPORT_SYMBOL_GPL(xdp_features_clear_redirect_target); diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index 8c4e3e536c04..ed134fbdfd32 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -33,6 +33,8 @@ enum netdev_xdp_act { NETDEV_XDP_ACT_HW_OFFLOAD = 16, NETDEV_XDP_ACT_RX_SG = 32, NETDEV_XDP_ACT_NDO_XMIT_SG = 64, + + NETDEV_XDP_ACT_MASK = 127, }; enum { From 3c249fe4de1608a9af56563606d4f6eb3a64a47f Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 9 Mar 2023 13:25:28 +0100 Subject: [PATCH 023/108] net: thunderx: take into account xdp_features setting tx/rx queues thunderx nic allows xdp just if enough hw queues are available for XDP. Take into account queues configuration setting xdp_features. Fixes: 66c0e13ad236 ("drivers: net: turn on XDP features") Signed-off-by: Lorenzo Bianconi Signed-off-by: Jakub Kicinski --- .../net/ethernet/cavium/thunder/nicvf_ethtool.c | 17 +++++++++++------ .../net/ethernet/cavium/thunder/nicvf_main.c | 4 +++- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c b/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c index e5c71f907852..d8d71bf97983 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c @@ -735,12 +735,17 @@ static int nicvf_set_channels(struct net_device *dev, if (channel->tx_count > nic->max_queues) return -EINVAL; - if (nic->xdp_prog && - ((channel->tx_count + channel->rx_count) > nic->max_queues)) { - netdev_err(nic->netdev, - "XDP mode, RXQs + TXQs > Max %d\n", - nic->max_queues); - return -EINVAL; + if (channel->tx_count + channel->rx_count > nic->max_queues) { + if (nic->xdp_prog) { + netdev_err(nic->netdev, + "XDP mode, RXQs + TXQs > Max %d\n", + nic->max_queues); + return -EINVAL; + } + + xdp_clear_features_flag(nic->netdev); + } else if (!pass1_silicon(nic->pdev)) { + xdp_set_features_flag(dev, NETDEV_XDP_ACT_BASIC); } if (if_up) diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index 8b25313c7f6b..eff350e0bc2a 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -2218,7 +2218,9 @@ static int nicvf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) netdev->netdev_ops = &nicvf_netdev_ops; netdev->watchdog_timeo = NICVF_TX_TIMEOUT; - netdev->xdp_features = NETDEV_XDP_ACT_BASIC; + if (!pass1_silicon(nic->pdev) && + nic->rx_queues + nic->tx_queues <= nic->max_queues) + netdev->xdp_features = NETDEV_XDP_ACT_BASIC; /* MTU range: 64 - 9200 */ netdev->min_mtu = NIC_HW_MIN_FRS; From 7aa6dc351b92abbfead9ebe25ce7a7ac0384ea6d Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 9 Mar 2023 13:25:29 +0100 Subject: [PATCH 024/108] net: ena: take into account xdp_features setting tx/rx queues ena nic allows xdp just if enough hw queues are available for XDP. Take into account queues configuration setting xdp_features. Fixes: 66c0e13ad236 ("drivers: net: turn on XDP features") Reviewed-by: Shay Agroskin Signed-off-by: Lorenzo Bianconi Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amazon/ena/ena_ethtool.c | 15 ++++++++++++--- drivers/net/ethernet/amazon/ena/ena_netdev.c | 6 ++++-- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_ethtool.c b/drivers/net/ethernet/amazon/ena/ena_ethtool.c index 8da79eedc057..1d4f2f4d10f2 100644 --- a/drivers/net/ethernet/amazon/ena/ena_ethtool.c +++ b/drivers/net/ethernet/amazon/ena/ena_ethtool.c @@ -850,11 +850,20 @@ static int ena_set_channels(struct net_device *netdev, struct ena_adapter *adapter = netdev_priv(netdev); u32 count = channels->combined_count; /* The check for max value is already done in ethtool */ - if (count < ENA_MIN_NUM_IO_QUEUES || - (ena_xdp_present(adapter) && - !ena_xdp_legal_queue_count(adapter, count))) + if (count < ENA_MIN_NUM_IO_QUEUES) return -EINVAL; + if (!ena_xdp_legal_queue_count(adapter, count)) { + if (ena_xdp_present(adapter)) + return -EINVAL; + + xdp_clear_features_flag(netdev); + } else { + xdp_set_features_flag(netdev, + NETDEV_XDP_ACT_BASIC | + NETDEV_XDP_ACT_REDIRECT); + } + return ena_update_queue_count(adapter, count); } diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index d3999db7c6a2..cbfe7f977270 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -4105,8 +4105,6 @@ static void ena_set_conf_feat_params(struct ena_adapter *adapter, /* Set offload features */ ena_set_dev_offloads(feat, netdev); - netdev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT; - adapter->max_mtu = feat->dev_attr.max_mtu; netdev->max_mtu = adapter->max_mtu; netdev->min_mtu = ENA_MIN_MTU; @@ -4393,6 +4391,10 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent) ena_config_debug_area(adapter); + if (ena_xdp_legal_queue_count(adapter, adapter->num_io_queues)) + netdev->xdp_features = NETDEV_XDP_ACT_BASIC | + NETDEV_XDP_ACT_REDIRECT; + memcpy(adapter->netdev->perm_addr, adapter->mac_addr, netdev->addr_len); netif_carrier_off(netdev); From fccca038f3003daa8f28a5e5d97efe50f04b8d9d Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 9 Mar 2023 13:25:30 +0100 Subject: [PATCH 025/108] veth: take into account device reconfiguration for xdp_features flag Take into account tx/rx queues reconfiguration setting device xdp_features flag. Moreover consider NETIF_F_GRO flag in order to enable ndo_xdp_xmit callback. Fixes: 66c0e13ad236 ("drivers: net: turn on XDP features") Signed-off-by: Lorenzo Bianconi Signed-off-by: Jakub Kicinski --- drivers/net/veth.c | 42 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 1bb54de7124d..293dc3b2c84a 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -1257,6 +1257,26 @@ static int veth_enable_range_safe(struct net_device *dev, int start, int end) return 0; } +static void veth_set_xdp_features(struct net_device *dev) +{ + struct veth_priv *priv = netdev_priv(dev); + struct net_device *peer; + + peer = rcu_dereference(priv->peer); + if (peer && peer->real_num_tx_queues <= dev->real_num_rx_queues) { + xdp_features_t val = NETDEV_XDP_ACT_BASIC | + NETDEV_XDP_ACT_REDIRECT | + NETDEV_XDP_ACT_RX_SG; + + if (priv->_xdp_prog || veth_gro_requested(dev)) + val |= NETDEV_XDP_ACT_NDO_XMIT | + NETDEV_XDP_ACT_NDO_XMIT_SG; + xdp_set_features_flag(dev, val); + } else { + xdp_clear_features_flag(dev); + } +} + static int veth_set_channels(struct net_device *dev, struct ethtool_channels *ch) { @@ -1323,6 +1343,12 @@ out: if (peer) netif_carrier_on(peer); } + + /* update XDP supported features */ + veth_set_xdp_features(dev); + if (peer) + veth_set_xdp_features(peer); + return err; revert: @@ -1489,7 +1515,10 @@ static int veth_set_features(struct net_device *dev, err = veth_napi_enable(dev); if (err) return err; + + xdp_features_set_redirect_target(dev, true); } else { + xdp_features_clear_redirect_target(dev); veth_napi_del(dev); } return 0; @@ -1570,10 +1599,15 @@ static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, peer->hw_features &= ~NETIF_F_GSO_SOFTWARE; peer->max_mtu = max_mtu; } + + xdp_features_set_redirect_target(dev, true); } if (old_prog) { if (!prog) { + if (!veth_gro_requested(dev)) + xdp_features_clear_redirect_target(dev); + if (dev->flags & IFF_UP) veth_disable_xdp(dev); @@ -1686,10 +1720,6 @@ static void veth_setup(struct net_device *dev) dev->hw_enc_features = VETH_FEATURES; dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; netif_set_tso_max_size(dev, GSO_MAX_SIZE); - - dev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | - NETDEV_XDP_ACT_NDO_XMIT | NETDEV_XDP_ACT_RX_SG | - NETDEV_XDP_ACT_NDO_XMIT_SG; } /* @@ -1857,6 +1887,10 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, goto err_queues; veth_disable_gro(dev); + /* update XDP supported features */ + veth_set_xdp_features(dev); + veth_set_xdp_features(peer); + return 0; err_queues: From 4d5ab0ad964df178beba031b89429a601893ff61 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 9 Mar 2023 13:25:31 +0100 Subject: [PATCH 026/108] net/mlx5e: take into account device reconfiguration for xdp_features flag Take into account LRO and GRO configuration setting device xdp_features flag. Consider channel rq_wq_type enabling rx scatter-gatter support in xdp_features flag and disable NETDEV_XDP_ACT_NDO_XMIT_SG since it is not supported yet by the driver. Moreover always enable NETDEV_XDP_ACT_NDO_XMIT as the ndo_xdp_xmit callback does not require to load a dummy xdp program on the NIC. Fixes: 66c0e13ad236 ("drivers: net: turn on XDP features") Co-developed-by: Tariq Toukan Signed-off-by: Tariq Toukan Signed-off-by: Lorenzo Bianconi Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 1 + .../ethernet/mellanox/mlx5/core/en_ethtool.c | 10 ++++- .../net/ethernet/mellanox/mlx5/core/en_main.c | 37 +++++++++++++------ .../net/ethernet/mellanox/mlx5/core/en_rep.c | 3 ++ 4 files changed, 39 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 88460b7796e5..4276c6eb6820 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -1243,6 +1243,7 @@ void mlx5e_build_nic_params(struct mlx5e_priv *priv, struct mlx5e_xsk *xsk, u16 void mlx5e_rx_dim_work(struct work_struct *work); void mlx5e_tx_dim_work(struct work_struct *work); +void mlx5e_set_xdp_feature(struct net_device *netdev); netdev_features_t mlx5e_features_check(struct sk_buff *skb, struct net_device *netdev, netdev_features_t features); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 7708acc9b2ab..79fd21ecb9cb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -1985,6 +1985,7 @@ static int set_pflag_rx_striding_rq(struct net_device *netdev, bool enable) struct mlx5e_priv *priv = netdev_priv(netdev); struct mlx5_core_dev *mdev = priv->mdev; struct mlx5e_params new_params; + int err; if (enable) { /* Checking the regular RQ here; mlx5e_validate_xsk_param called @@ -2005,7 +2006,14 @@ static int set_pflag_rx_striding_rq(struct net_device *netdev, bool enable) MLX5E_SET_PFLAG(&new_params, MLX5E_PFLAG_RX_STRIDING_RQ, enable); mlx5e_set_rq_type(mdev, &new_params); - return mlx5e_safe_switch_params(priv, &new_params, NULL, NULL, true); + err = mlx5e_safe_switch_params(priv, &new_params, NULL, NULL, true); + if (err) + return err; + + /* update XDP supported features */ + mlx5e_set_xdp_feature(netdev); + + return 0; } static int set_pflag_rx_no_csum_complete(struct net_device *netdev, bool enable) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 76a9c5194a70..51b5f3cca504 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4004,6 +4004,25 @@ static int mlx5e_handle_feature(struct net_device *netdev, return 0; } +void mlx5e_set_xdp_feature(struct net_device *netdev) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_params *params = &priv->channels.params; + xdp_features_t val; + + if (params->packet_merge.type != MLX5E_PACKET_MERGE_NONE) { + xdp_clear_features_flag(netdev); + return; + } + + val = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | + NETDEV_XDP_ACT_XSK_ZEROCOPY | + NETDEV_XDP_ACT_NDO_XMIT; + if (params->rq_wq_type == MLX5_WQ_TYPE_CYCLIC) + val |= NETDEV_XDP_ACT_RX_SG; + xdp_set_features_flag(netdev, val); +} + int mlx5e_set_features(struct net_device *netdev, netdev_features_t features) { netdev_features_t oper_features = features; @@ -4030,6 +4049,9 @@ int mlx5e_set_features(struct net_device *netdev, netdev_features_t features) return -EINVAL; } + /* update XDP supported features */ + mlx5e_set_xdp_feature(netdev); + return 0; } @@ -4761,13 +4783,6 @@ static int mlx5e_xdp_set(struct net_device *netdev, struct bpf_prog *prog) if (old_prog) bpf_prog_put(old_prog); - if (reset) { - if (prog) - xdp_features_set_redirect_target(netdev, true); - else - xdp_features_clear_redirect_target(netdev); - } - if (!test_bit(MLX5E_STATE_OPENED, &priv->state) || reset) goto unlock; @@ -5163,13 +5178,10 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev) netdev->features |= NETIF_F_HIGHDMA; netdev->features |= NETIF_F_HW_VLAN_STAG_FILTER; - netdev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | - NETDEV_XDP_ACT_XSK_ZEROCOPY | - NETDEV_XDP_ACT_RX_SG; - netdev->priv_flags |= IFF_UNICAST_FLT; netif_set_tso_max_size(netdev, GSO_MAX_SIZE); + mlx5e_set_xdp_feature(netdev); mlx5e_set_netdev_dev_addr(netdev); mlx5e_macsec_build_netdev(priv); mlx5e_ipsec_build_netdev(priv); @@ -5241,6 +5253,9 @@ static int mlx5e_nic_init(struct mlx5_core_dev *mdev, mlx5_core_err(mdev, "TLS initialization failed, %d\n", err); mlx5e_health_create_reporters(priv); + /* update XDP supported features */ + mlx5e_set_xdp_feature(netdev); + return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 9b9203443085..43fd12fb87b8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -747,6 +747,9 @@ static void mlx5e_build_rep_params(struct net_device *netdev) /* RQ */ mlx5e_build_rq_params(mdev, params); + /* update XDP supported features */ + mlx5e_set_xdp_feature(netdev); + /* CQ moderation params */ params->rx_dim_enabled = MLX5_CAP_GEN(mdev, cq_moderation); mlx5e_set_rx_cq_mode_params(params, cq_period_mode); From 481e96fc1307eb52c0c449608e629921ecbbaf15 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Thu, 9 Mar 2023 13:25:32 +0100 Subject: [PATCH 027/108] mvpp2: take care of xdp_features when reconfiguring queues XDP is supported only if enough queues are present, so when reconfiguring the queues set xdp_features accordingly. Fixes: 66c0e13ad236 ("drivers: net: turn on XDP features") Suggested-by: Lorenzo Bianconi Signed-off-by: Matteo Croce Signed-off-by: Lorenzo Bianconi Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 9b4ecbe4f36d..3ea00bc9b91c 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -4996,6 +4996,14 @@ static int mvpp2_bm_switch_buffers(struct mvpp2 *priv, bool percpu) for (i = 0; i < priv->port_count; i++) { port = priv->port_list[i]; + if (percpu && port->ntxqs >= num_possible_cpus() * 2) + xdp_set_features_flag(port->dev, + NETDEV_XDP_ACT_BASIC | + NETDEV_XDP_ACT_REDIRECT | + NETDEV_XDP_ACT_NDO_XMIT); + else + xdp_clear_features_flag(port->dev); + mvpp2_swf_bm_pool_init(port); if (status[i]) mvpp2_open(port->dev); @@ -6863,13 +6871,14 @@ static int mvpp2_port_probe(struct platform_device *pdev, if (!port->priv->percpu_pools) mvpp2_set_hw_csum(port, port->pool_long->id); + else if (port->ntxqs >= num_possible_cpus() * 2) + dev->xdp_features = NETDEV_XDP_ACT_BASIC | + NETDEV_XDP_ACT_REDIRECT | + NETDEV_XDP_ACT_NDO_XMIT; dev->vlan_features |= features; netif_set_tso_max_segs(dev, MVPP2_MAX_TSO_SEGS); - dev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | - NETDEV_XDP_ACT_NDO_XMIT; - dev->priv_flags |= IFF_UNICAST_FLT; /* MTU range: 68 - 9704 */ From b7a679ba7c652587b85294f4953f33ac0b756d40 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 9 Mar 2023 15:49:57 +0100 Subject: [PATCH 028/108] mptcp: fix possible deadlock in subflow_error_report Christoph reported a possible deadlock while the TCP stack destroys an unaccepted subflow due to an incoming reset: the MPTCP socket error path tries to acquire the msk-level socket lock while TCP still owns the listener socket accept queue spinlock, and the reverse dependency already exists in the TCP stack. Note that the above is actually a lockdep false positive, as the chain involves two separate sockets. A different per-socket lockdep key will address the issue, but such a change will be quite invasive. Instead, we can simply stop earlier the socket error handling for orphaned or unaccepted subflows, breaking the critical lockdep chain. Error handling in such a scenario is a no-op. Reported-and-tested-by: Christoph Paasch Fixes: 15cc10453398 ("mptcp: deliver ssk errors to msk") Cc: stable@vger.kernel.org Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/355 Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 4ae1a7304cf0..5070dc33675d 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1432,6 +1432,13 @@ static void subflow_error_report(struct sock *ssk) { struct sock *sk = mptcp_subflow_ctx(ssk)->conn; + /* bail early if this is a no-op, so that we avoid introducing a + * problematic lockdep dependency between TCP accept queue lock + * and msk socket spinlock + */ + if (!sk->sk_socket) + return; + mptcp_data_lock(sk); if (!sock_owned_by_user(sk)) __mptcp_error_report(sk); From 3a236aef280ed5122b2d47087eb514d0921ae033 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 9 Mar 2023 15:49:58 +0100 Subject: [PATCH 029/108] mptcp: refactor passive socket initialization After commit 30e51b923e43 ("mptcp: fix unreleased socket in accept queue") unaccepted msk sockets go throu complete shutdown, we don't need anymore to delay inserting the first subflow into the subflow lists. The reference counting deserve some extra care, as __mptcp_close() is unaware of the request socket linkage to the first subflow. Please note that this is more a refactoring than a fix but because this modification is needed to include other corrections, see the following commits. Then a Fixes tag has been added here to help the stable team. Fixes: 30e51b923e43 ("mptcp: fix unreleased socket in accept queue") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Tested-by: Christoph Paasch Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 17 ----------------- net/mptcp/subflow.c | 27 +++++++++++++++++++++------ 2 files changed, 21 insertions(+), 23 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 3ad9c46202fc..447641d34c2c 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -825,7 +825,6 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk) if (sk->sk_socket && !ssk->sk_socket) mptcp_sock_graft(ssk, sk->sk_socket); - mptcp_propagate_sndbuf((struct sock *)msk, ssk); mptcp_sockopt_sync_locked(msk, ssk); return true; } @@ -3708,22 +3707,6 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, lock_sock(newsk); - /* PM/worker can now acquire the first subflow socket - * lock without racing with listener queue cleanup, - * we can notify it, if needed. - * - * Even if remote has reset the initial subflow by now - * the refcnt is still at least one. - */ - subflow = mptcp_subflow_ctx(msk->first); - list_add(&subflow->node, &msk->conn_list); - sock_hold(msk->first); - if (mptcp_is_fully_established(newsk)) - mptcp_pm_fully_established(msk, msk->first, GFP_KERNEL); - - mptcp_rcv_space_init(msk, msk->first); - mptcp_propagate_sndbuf(newsk, msk->first); - /* set ssk->sk_socket of accept()ed flows to mptcp socket. * This is needed so NOSPACE flag can be set from tcp stack. */ diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 5070dc33675d..a631a5e6fc7b 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -397,6 +397,12 @@ void mptcp_subflow_reset(struct sock *ssk) struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = subflow->conn; + /* mptcp_mp_fail_no_response() can reach here on an already closed + * socket + */ + if (ssk->sk_state == TCP_CLOSE) + return; + /* must hold: tcp_done() could drop last reference on parent */ sock_hold(sk); @@ -750,6 +756,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, struct mptcp_options_received mp_opt; bool fallback, fallback_is_fatal; struct sock *new_msk = NULL; + struct mptcp_sock *owner; struct sock *child; pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn); @@ -824,6 +831,8 @@ create_child: ctx->setsockopt_seq = listener->setsockopt_seq; if (ctx->mp_capable) { + owner = mptcp_sk(new_msk); + /* this can't race with mptcp_close(), as the msk is * not yet exposted to user-space */ @@ -832,14 +841,14 @@ create_child: /* record the newly created socket as the first msk * subflow, but don't link it yet into conn_list */ - WRITE_ONCE(mptcp_sk(new_msk)->first, child); + WRITE_ONCE(owner->first, child); /* new mpc subflow takes ownership of the newly * created mptcp socket */ mptcp_sk(new_msk)->setsockopt_seq = ctx->setsockopt_seq; - mptcp_pm_new_connection(mptcp_sk(new_msk), child, 1); - mptcp_token_accept(subflow_req, mptcp_sk(new_msk)); + mptcp_pm_new_connection(owner, child, 1); + mptcp_token_accept(subflow_req, owner); ctx->conn = new_msk; new_msk = NULL; @@ -847,15 +856,21 @@ create_child: * uses the correct data */ mptcp_copy_inaddrs(ctx->conn, child); + mptcp_propagate_sndbuf(ctx->conn, child); + + mptcp_rcv_space_init(owner, child); + list_add(&ctx->node, &owner->conn_list); + sock_hold(child); /* with OoO packets we can reach here without ingress * mpc option */ - if (mp_opt.suboptions & OPTION_MPTCP_MPC_ACK) + if (mp_opt.suboptions & OPTION_MPTCP_MPC_ACK) { mptcp_subflow_fully_established(ctx, &mp_opt); + mptcp_pm_fully_established(owner, child, GFP_ATOMIC); + ctx->pm_notified = 1; + } } else if (ctx->mp_join) { - struct mptcp_sock *owner; - owner = subflow_req->msk; if (!owner) { subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); From b6985b9b82954caa53f862d6059d06c0526254f0 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 9 Mar 2023 15:49:59 +0100 Subject: [PATCH 030/108] mptcp: use the workqueue to destroy unaccepted sockets Christoph reported a UaF at token lookup time after having refactored the passive socket initialization part: BUG: KASAN: use-after-free in __token_bucket_busy+0x253/0x260 Read of size 4 at addr ffff88810698d5b0 by task syz-executor653/3198 CPU: 1 PID: 3198 Comm: syz-executor653 Not tainted 6.2.0-rc59af4eaa31c1f6c00c8f1e448ed99a45c66340dd5 #6 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x6e/0x91 print_report+0x16a/0x46f kasan_report+0xad/0x130 __token_bucket_busy+0x253/0x260 mptcp_token_new_connect+0x13d/0x490 mptcp_connect+0x4ed/0x860 __inet_stream_connect+0x80e/0xd90 tcp_sendmsg_fastopen+0x3ce/0x710 mptcp_sendmsg+0xff1/0x1a20 inet_sendmsg+0x11d/0x140 __sys_sendto+0x405/0x490 __x64_sys_sendto+0xdc/0x1b0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x72/0xdc We need to properly clean-up all the paired MPTCP-level resources and be sure to release the msk last, even when the unaccepted subflow is destroyed by the TCP internals via inet_child_forget(). We can re-use the existing MPTCP_WORK_CLOSE_SUBFLOW infra, explicitly checking that for the critical scenario: the closed subflow is the MPC one, the msk is not accepted and eventually going through full cleanup. With such change, __mptcp_destroy_sock() is always called on msk sockets, even on accepted ones. We don't need anymore to transiently drop one sk reference at msk clone time. Please note this commit depends on the parent one: mptcp: refactor passive socket initialization Fixes: 58b09919626b ("mptcp: create msk early") Cc: stable@vger.kernel.org Reported-and-tested-by: Christoph Paasch Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/347 Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 40 ++++++++++++++++++++++++++++++---------- net/mptcp/protocol.h | 5 ++++- net/mptcp/subflow.c | 17 ++++++++++++----- 3 files changed, 46 insertions(+), 16 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 447641d34c2c..2a2093d61835 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2342,7 +2342,6 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, goto out; } - sock_orphan(ssk); subflow->disposable = 1; /* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops @@ -2350,7 +2349,20 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, * reference owned by msk; */ if (!inet_csk(ssk)->icsk_ulp_ops) { + WARN_ON_ONCE(!sock_flag(ssk, SOCK_DEAD)); kfree_rcu(subflow, rcu); + } else if (msk->in_accept_queue && msk->first == ssk) { + /* if the first subflow moved to a close state, e.g. due to + * incoming reset and we reach here before inet_child_forget() + * the TCP stack could later try to close it via + * inet_csk_listen_stop(), or deliver it to the user space via + * accept(). + * We can't delete the subflow - or risk a double free - nor let + * the msk survive - or will be leaked in the non accept scenario: + * fallback and let TCP cope with the subflow cleanup. + */ + WARN_ON_ONCE(sock_flag(ssk, SOCK_DEAD)); + mptcp_subflow_drop_ctx(ssk); } else { /* otherwise tcp will dispose of the ssk and subflow ctx */ if (ssk->sk_state == TCP_LISTEN) { @@ -2398,9 +2410,10 @@ static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) return 0; } -static void __mptcp_close_subflow(struct mptcp_sock *msk) +static void __mptcp_close_subflow(struct sock *sk) { struct mptcp_subflow_context *subflow, *tmp; + struct mptcp_sock *msk = mptcp_sk(sk); might_sleep(); @@ -2414,7 +2427,15 @@ static void __mptcp_close_subflow(struct mptcp_sock *msk) if (!skb_queue_empty_lockless(&ssk->sk_receive_queue)) continue; - mptcp_close_ssk((struct sock *)msk, ssk, subflow); + mptcp_close_ssk(sk, ssk, subflow); + } + + /* if the MPC subflow has been closed before the msk is accepted, + * msk will never be accept-ed, close it now + */ + if (!msk->first && msk->in_accept_queue) { + sock_set_flag(sk, SOCK_DEAD); + inet_sk_state_store(sk, TCP_CLOSE); } } @@ -2623,6 +2644,9 @@ static void mptcp_worker(struct work_struct *work) __mptcp_check_send_data_fin(sk); mptcp_check_data_fin(sk); + if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) + __mptcp_close_subflow(sk); + /* There is no point in keeping around an orphaned sk timedout or * closed, but we need the msk around to reply to incoming DATA_FIN, * even if it is orphaned and in FIN_WAIT2 state @@ -2638,9 +2662,6 @@ static void mptcp_worker(struct work_struct *work) } } - if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) - __mptcp_close_subflow(msk); - if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) __mptcp_retrans(sk); @@ -3078,6 +3099,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk, msk->local_key = subflow_req->local_key; msk->token = subflow_req->token; msk->subflow = NULL; + msk->in_accept_queue = 1; WRITE_ONCE(msk->fully_established, false); if (mp_opt->suboptions & OPTION_MPTCP_CSUMREQD) WRITE_ONCE(msk->csum_enabled, true); @@ -3095,8 +3117,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk, security_inet_csk_clone(nsk, req); bh_unlock_sock(nsk); - /* keep a single reference */ - __sock_put(nsk); + /* note: the newly allocated socket refcount is 2 now */ return nsk; } @@ -3152,8 +3173,6 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, goto out; } - /* acquire the 2nd reference for the owning socket */ - sock_hold(new_mptcp_sock); newsk = new_mptcp_sock; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK); } else { @@ -3704,6 +3723,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, struct sock *newsk = newsock->sk; set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags); + msk->in_accept_queue = 0; lock_sock(newsk); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 61fd8eabfca2..3a2db1b862dd 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -295,7 +295,8 @@ struct mptcp_sock { u8 recvmsg_inq:1, cork:1, nodelay:1, - fastopening:1; + fastopening:1, + in_accept_queue:1; int connect_flags; struct work_struct work; struct sk_buff *ooo_last_skb; @@ -666,6 +667,8 @@ void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow); bool mptcp_subflow_active(struct mptcp_subflow_context *subflow); +void mptcp_subflow_drop_ctx(struct sock *ssk); + static inline void mptcp_subflow_tcp_fallback(struct sock *sk, struct mptcp_subflow_context *ctx) { diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index a631a5e6fc7b..932a3e0eb22d 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -699,9 +699,10 @@ static bool subflow_hmac_valid(const struct request_sock *req, static void mptcp_force_close(struct sock *sk) { - /* the msk is not yet exposed to user-space */ + /* the msk is not yet exposed to user-space, and refcount is 2 */ inet_sk_state_store(sk, TCP_CLOSE); sk_common_release(sk); + sock_put(sk); } static void subflow_ulp_fallback(struct sock *sk, @@ -717,7 +718,7 @@ static void subflow_ulp_fallback(struct sock *sk, mptcp_subflow_ops_undo_override(sk); } -static void subflow_drop_ctx(struct sock *ssk) +void mptcp_subflow_drop_ctx(struct sock *ssk) { struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk); @@ -823,7 +824,7 @@ create_child: if (new_msk) mptcp_copy_inaddrs(new_msk, child); - subflow_drop_ctx(child); + mptcp_subflow_drop_ctx(child); goto out; } @@ -914,7 +915,7 @@ out: return child; dispose_child: - subflow_drop_ctx(child); + mptcp_subflow_drop_ctx(child); tcp_rsk(req)->drop_req = true; inet_csk_prepare_for_destroy_sock(child); tcp_done(child); @@ -1866,7 +1867,6 @@ void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_s struct sock *sk = (struct sock *)msk; bool do_cancel_work; - sock_hold(sk); lock_sock_nested(sk, SINGLE_DEPTH_NESTING); next = msk->dl_next; msk->first = NULL; @@ -1954,6 +1954,13 @@ static void subflow_ulp_release(struct sock *ssk) * when the subflow is still unaccepted */ release = ctx->disposable || list_empty(&ctx->node); + + /* inet_child_forget() does not call sk_state_change(), + * explicitly trigger the socket close machinery + */ + if (!release && !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, + &mptcp_sk(sk)->flags)) + mptcp_schedule_work(sk); sock_put(sk); } From 0a3f4f1f9c27215e4ddcd312558342e57b93e518 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 9 Mar 2023 15:50:00 +0100 Subject: [PATCH 031/108] mptcp: fix UaF in listener shutdown As reported by Christoph after having refactored the passive socket initialization, the mptcp listener shutdown path is prone to an UaF issue. BUG: KASAN: use-after-free in _raw_spin_lock_bh+0x73/0xe0 Write of size 4 at addr ffff88810cb23098 by task syz-executor731/1266 CPU: 1 PID: 1266 Comm: syz-executor731 Not tainted 6.2.0-rc59af4eaa31c1f6c00c8f1e448ed99a45c66340dd5 #6 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x6e/0x91 print_report+0x16a/0x46f kasan_report+0xad/0x130 kasan_check_range+0x14a/0x1a0 _raw_spin_lock_bh+0x73/0xe0 subflow_error_report+0x6d/0x110 sk_error_report+0x3b/0x190 tcp_disconnect+0x138c/0x1aa0 inet_child_forget+0x6f/0x2e0 inet_csk_listen_stop+0x209/0x1060 __mptcp_close_ssk+0x52d/0x610 mptcp_destroy_common+0x165/0x640 mptcp_destroy+0x13/0x80 __mptcp_destroy_sock+0xe7/0x270 __mptcp_close+0x70e/0x9b0 mptcp_close+0x2b/0x150 inet_release+0xe9/0x1f0 __sock_release+0xd2/0x280 sock_close+0x15/0x20 __fput+0x252/0xa20 task_work_run+0x169/0x250 exit_to_user_mode_prepare+0x113/0x120 syscall_exit_to_user_mode+0x1d/0x40 do_syscall_64+0x48/0x90 entry_SYSCALL_64_after_hwframe+0x72/0xdc The msk grace period can legitly expire in between the last reference count dropped in mptcp_subflow_queue_clean() and the later eventual access in inet_csk_listen_stop() After the previous patch we don't need anymore special-casing msk listener socket cleanup: the mptcp worker will process each of the unaccepted msk sockets. Just drop the now unnecessary code. Please note this commit depends on the two parent ones: mptcp: refactor passive socket initialization mptcp: use the workqueue to destroy unaccepted sockets Fixes: 6aeed9045071 ("mptcp: fix race on unaccepted mptcp sockets") Cc: stable@vger.kernel.org Reported-and-tested-by: Christoph Paasch Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/346 Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 7 ++--- net/mptcp/protocol.h | 1 - net/mptcp/subflow.c | 72 -------------------------------------------- 3 files changed, 2 insertions(+), 78 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 2a2093d61835..60b23b2716c4 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2365,12 +2365,9 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, mptcp_subflow_drop_ctx(ssk); } else { /* otherwise tcp will dispose of the ssk and subflow ctx */ - if (ssk->sk_state == TCP_LISTEN) { - tcp_set_state(ssk, TCP_CLOSE); - mptcp_subflow_queue_clean(sk, ssk); - inet_csk_listen_stop(ssk); + if (ssk->sk_state == TCP_LISTEN) mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED); - } + __tcp_close(ssk, 0); /* close acquired an extra ref */ diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 3a2db1b862dd..339a6f072989 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -629,7 +629,6 @@ void mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow); void __mptcp_subflow_send_ack(struct sock *ssk); void mptcp_subflow_reset(struct sock *ssk); -void mptcp_subflow_queue_clean(struct sock *sk, struct sock *ssk); void mptcp_sock_graft(struct sock *sk, struct socket *parent); struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk); bool __mptcp_close(struct sock *sk, long timeout); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 932a3e0eb22d..9c57575df84c 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1826,78 +1826,6 @@ static void subflow_state_change(struct sock *sk) } } -void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk) -{ - struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue; - struct mptcp_sock *msk, *next, *head = NULL; - struct request_sock *req; - - /* build a list of all unaccepted mptcp sockets */ - spin_lock_bh(&queue->rskq_lock); - for (req = queue->rskq_accept_head; req; req = req->dl_next) { - struct mptcp_subflow_context *subflow; - struct sock *ssk = req->sk; - struct mptcp_sock *msk; - - if (!sk_is_mptcp(ssk)) - continue; - - subflow = mptcp_subflow_ctx(ssk); - if (!subflow || !subflow->conn) - continue; - - /* skip if already in list */ - msk = mptcp_sk(subflow->conn); - if (msk->dl_next || msk == head) - continue; - - msk->dl_next = head; - head = msk; - } - spin_unlock_bh(&queue->rskq_lock); - if (!head) - return; - - /* can't acquire the msk socket lock under the subflow one, - * or will cause ABBA deadlock - */ - release_sock(listener_ssk); - - for (msk = head; msk; msk = next) { - struct sock *sk = (struct sock *)msk; - bool do_cancel_work; - - lock_sock_nested(sk, SINGLE_DEPTH_NESTING); - next = msk->dl_next; - msk->first = NULL; - msk->dl_next = NULL; - - do_cancel_work = __mptcp_close(sk, 0); - release_sock(sk); - if (do_cancel_work) { - /* lockdep will report a false positive ABBA deadlock - * between cancel_work_sync and the listener socket. - * The involved locks belong to different sockets WRT - * the existing AB chain. - * Using a per socket key is problematic as key - * deregistration requires process context and must be - * performed at socket disposal time, in atomic - * context. - * Just tell lockdep to consider the listener socket - * released here. - */ - mutex_release(&listener_sk->sk_lock.dep_map, _RET_IP_); - mptcp_cancel_work(sk); - mutex_acquire(&listener_sk->sk_lock.dep_map, - SINGLE_DEPTH_NESTING, 0, _RET_IP_); - } - sock_put(sk); - } - - /* we are still under the listener msk socket lock */ - lock_sock_nested(listener_ssk, SINGLE_DEPTH_NESTING); -} - static int subflow_ulp_init(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); From 840742b7ed0e123e47af9c5c3902746f3d6b64a2 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Thu, 9 Mar 2023 15:50:01 +0100 Subject: [PATCH 032/108] selftests: mptcp: userspace pm: fix printed values In case of errors, the printed message had the expected and the seen value inverted. This patch simply correct the order: first the expected value, then the one that has been seen. Fixes: 10d4273411be ("selftests: mptcp: userspace: print error details if any") Cc: stable@vger.kernel.org Acked-by: Geliang Tang Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/userspace_pm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/mptcp/userspace_pm.sh b/tools/testing/selftests/net/mptcp/userspace_pm.sh index 66c5be25c13d..48e52f995a98 100755 --- a/tools/testing/selftests/net/mptcp/userspace_pm.sh +++ b/tools/testing/selftests/net/mptcp/userspace_pm.sh @@ -240,7 +240,7 @@ check_expected_one() fi stdbuf -o0 -e0 printf "\tExpected value for '%s': '%s', got '%s'.\n" \ - "${var}" "${!var}" "${!exp}" + "${var}" "${!exp}" "${!var}" return 1 } From 822467a48e938e661965d09df5fcac66f7291050 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 9 Mar 2023 15:50:02 +0100 Subject: [PATCH 033/108] mptcp: add ro_after_init for tcp{,v6}_prot_override Add __ro_after_init labels for the variables tcp_prot_override and tcpv6_prot_override, just like other variables adjacent to them, to indicate that they are initialised from the init hooks and no writes occur afterwards. Fixes: b19bc2945b40 ("mptcp: implement delegated actions") Cc: stable@vger.kernel.org Fixes: 51fa7f8ebf0e ("mptcp: mark ops structures as ro_after_init") Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 9c57575df84c..2aadc8733369 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -628,7 +628,7 @@ static struct request_sock_ops mptcp_subflow_v6_request_sock_ops __ro_after_init static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops __ro_after_init; static struct inet_connection_sock_af_ops subflow_v6_specific __ro_after_init; static struct inet_connection_sock_af_ops subflow_v6m_specific __ro_after_init; -static struct proto tcpv6_prot_override; +static struct proto tcpv6_prot_override __ro_after_init; static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb) { @@ -926,7 +926,7 @@ dispose_child: } static struct inet_connection_sock_af_ops subflow_specific __ro_after_init; -static struct proto tcp_prot_override; +static struct proto tcp_prot_override __ro_after_init; enum mapping_status { MAPPING_OK, From 3ba14528684f528566fb7d956bfbfb958b591d86 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Thu, 9 Mar 2023 15:50:03 +0100 Subject: [PATCH 034/108] mptcp: avoid setting TCP_CLOSE state twice tcp_set_state() is called from tcp_done() already. There is then no need to first set the state to TCP_CLOSE, then call tcp_done(). Fixes: d582484726c4 ("mptcp: fix fallback for MP_JOIN subflows") Cc: stable@vger.kernel.org Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/362 Acked-by: Paolo Abeni Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 2aadc8733369..a0041360ee9d 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -406,7 +406,6 @@ void mptcp_subflow_reset(struct sock *ssk) /* must hold: tcp_done() could drop last reference on parent */ sock_hold(sk); - tcp_set_state(ssk, TCP_CLOSE); tcp_send_active_reset(ssk, GFP_ATOMIC); tcp_done(ssk); if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) && From cee4034a3db1d30c3243dd51506a9d4ab1a849fa Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 9 Mar 2023 15:50:04 +0100 Subject: [PATCH 035/108] mptcp: fix lockdep false positive in mptcp_pm_nl_create_listen_socket() Christoph reports a lockdep splat in the mptcp_subflow_create_socket() error path, when such function is invoked by mptcp_pm_nl_create_listen_socket(). Such code path acquires two separates, nested socket lock, with the internal lock operation lacking the "nested" annotation. Adding that in sock_release() for mptcp's sake only could be confusing. Instead just add a new lockclass to the in-kernel msk socket, re-initializing the lockdep infra after the socket creation. Fixes: ad2171009d96 ("mptcp: fix locking for in-kernel listener creation") Cc: stable@vger.kernel.org Reported-by: Christoph Paasch Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/354 Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Tested-by: Christoph Paasch Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 56628b52d100..5c8dea49626c 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -997,9 +997,13 @@ out: return ret; } +static struct lock_class_key mptcp_slock_keys[2]; +static struct lock_class_key mptcp_keys[2]; + static int mptcp_pm_nl_create_listen_socket(struct sock *sk, struct mptcp_pm_addr_entry *entry) { + bool is_ipv6 = sk->sk_family == AF_INET6; int addrlen = sizeof(struct sockaddr_in); struct sockaddr_storage addr; struct socket *ssock; @@ -1016,6 +1020,18 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, if (!newsk) return -EINVAL; + /* The subflow socket lock is acquired in a nested to the msk one + * in several places, even by the TCP stack, and this msk is a kernel + * socket: lockdep complains. Instead of propagating the _nested + * modifiers in several places, re-init the lock class for the msk + * socket to an mptcp specific one. + */ + sock_lock_init_class_and_name(newsk, + is_ipv6 ? "mlock-AF_INET6" : "mlock-AF_INET", + &mptcp_slock_keys[is_ipv6], + is_ipv6 ? "msk_lock-AF_INET6" : "msk_lock-AF_INET", + &mptcp_keys[is_ipv6]); + lock_sock(newsk); ssock = __mptcp_nmpc_socket(mptcp_sk(newsk)); release_sock(newsk); From 131db499162274858bdbd7b5323a639da4aab86c Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Fri, 10 Mar 2023 07:13:56 -0800 Subject: [PATCH 036/108] bnxt_en: reset PHC frequency in free-running mode When using a PHC in shared between multiple hosts, the previous frequency value may not be reset and could lead to host being unable to compensate the offset with timecounter adjustments. To avoid such state reset the hardware frequency of PHC to zero on init. Some refactoring is needed to make code readable. Fixes: 85036aee1938 ("bnxt_en: Add a non-real time mode to access NIC clock") Signed-off-by: Vadim Fedorenko Reviewed-by: Pavan Chebbi Link: https://lore.kernel.org/r/20230310151356.678059-1-vadfed@meta.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 6 +- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 2 + drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c | 56 ++++++++++--------- 3 files changed, 35 insertions(+), 29 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 808236dc898b..e2e2c986c82b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -6990,11 +6990,9 @@ static int bnxt_hwrm_func_qcfg(struct bnxt *bp) if (flags & FUNC_QCFG_RESP_FLAGS_FW_DCBX_AGENT_ENABLED) bp->fw_cap |= BNXT_FW_CAP_DCBX_AGENT; } - if (BNXT_PF(bp) && (flags & FUNC_QCFG_RESP_FLAGS_MULTI_HOST)) { + if (BNXT_PF(bp) && (flags & FUNC_QCFG_RESP_FLAGS_MULTI_HOST)) bp->flags |= BNXT_FLAG_MULTI_HOST; - if (bp->fw_cap & BNXT_FW_CAP_PTP_RTC) - bp->fw_cap &= ~BNXT_FW_CAP_PTP_RTC; - } + if (flags & FUNC_QCFG_RESP_FLAGS_RING_MONITOR_ENABLED) bp->fw_cap |= BNXT_FW_CAP_RING_MONITOR; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index dcb09fbe4007..c0628ac1b798 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -2000,6 +2000,8 @@ struct bnxt { u32 fw_dbg_cap; #define BNXT_NEW_RM(bp) ((bp)->fw_cap & BNXT_FW_CAP_NEW_RM) +#define BNXT_PTP_USE_RTC(bp) (!BNXT_MH(bp) && \ + ((bp)->fw_cap & BNXT_FW_CAP_PTP_RTC)) u32 hwrm_spec_code; u16 hwrm_cmd_seq; u16 hwrm_cmd_kong_seq; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c index 4ec8bba18cdd..a3a3978a4d1c 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c @@ -63,7 +63,7 @@ static int bnxt_ptp_settime(struct ptp_clock_info *ptp_info, ptp_info); u64 ns = timespec64_to_ns(ts); - if (ptp->bp->fw_cap & BNXT_FW_CAP_PTP_RTC) + if (BNXT_PTP_USE_RTC(ptp->bp)) return bnxt_ptp_cfg_settime(ptp->bp, ns); spin_lock_bh(&ptp->ptp_lock); @@ -196,7 +196,7 @@ static int bnxt_ptp_adjtime(struct ptp_clock_info *ptp_info, s64 delta) struct bnxt_ptp_cfg *ptp = container_of(ptp_info, struct bnxt_ptp_cfg, ptp_info); - if (ptp->bp->fw_cap & BNXT_FW_CAP_PTP_RTC) + if (BNXT_PTP_USE_RTC(ptp->bp)) return bnxt_ptp_adjphc(ptp, delta); spin_lock_bh(&ptp->ptp_lock); @@ -205,34 +205,39 @@ static int bnxt_ptp_adjtime(struct ptp_clock_info *ptp_info, s64 delta) return 0; } +static int bnxt_ptp_adjfine_rtc(struct bnxt *bp, long scaled_ppm) +{ + s32 ppb = scaled_ppm_to_ppb(scaled_ppm); + struct hwrm_port_mac_cfg_input *req; + int rc; + + rc = hwrm_req_init(bp, req, HWRM_PORT_MAC_CFG); + if (rc) + return rc; + + req->ptp_freq_adj_ppb = cpu_to_le32(ppb); + req->enables = cpu_to_le32(PORT_MAC_CFG_REQ_ENABLES_PTP_FREQ_ADJ_PPB); + rc = hwrm_req_send(bp, req); + if (rc) + netdev_err(bp->dev, + "ptp adjfine failed. rc = %d\n", rc); + return rc; +} + static int bnxt_ptp_adjfine(struct ptp_clock_info *ptp_info, long scaled_ppm) { struct bnxt_ptp_cfg *ptp = container_of(ptp_info, struct bnxt_ptp_cfg, ptp_info); - struct hwrm_port_mac_cfg_input *req; struct bnxt *bp = ptp->bp; - int rc = 0; - if (!(ptp->bp->fw_cap & BNXT_FW_CAP_PTP_RTC)) { - spin_lock_bh(&ptp->ptp_lock); - timecounter_read(&ptp->tc); - ptp->cc.mult = adjust_by_scaled_ppm(ptp->cmult, scaled_ppm); - spin_unlock_bh(&ptp->ptp_lock); - } else { - s32 ppb = scaled_ppm_to_ppb(scaled_ppm); + if (BNXT_PTP_USE_RTC(bp)) + return bnxt_ptp_adjfine_rtc(bp, scaled_ppm); - rc = hwrm_req_init(bp, req, HWRM_PORT_MAC_CFG); - if (rc) - return rc; - - req->ptp_freq_adj_ppb = cpu_to_le32(ppb); - req->enables = cpu_to_le32(PORT_MAC_CFG_REQ_ENABLES_PTP_FREQ_ADJ_PPB); - rc = hwrm_req_send(ptp->bp, req); - if (rc) - netdev_err(ptp->bp->dev, - "ptp adjfine failed. rc = %d\n", rc); - } - return rc; + spin_lock_bh(&ptp->ptp_lock); + timecounter_read(&ptp->tc); + ptp->cc.mult = adjust_by_scaled_ppm(ptp->cmult, scaled_ppm); + spin_unlock_bh(&ptp->ptp_lock); + return 0; } void bnxt_ptp_pps_event(struct bnxt *bp, u32 data1, u32 data2) @@ -879,7 +884,7 @@ int bnxt_ptp_init_rtc(struct bnxt *bp, bool phc_cfg) u64 ns; int rc; - if (!bp->ptp_cfg || !(bp->fw_cap & BNXT_FW_CAP_PTP_RTC)) + if (!bp->ptp_cfg || !BNXT_PTP_USE_RTC(bp)) return -ENODEV; if (!phc_cfg) { @@ -932,13 +937,14 @@ int bnxt_ptp_init(struct bnxt *bp, bool phc_cfg) atomic_set(&ptp->tx_avail, BNXT_MAX_TX_TS); spin_lock_init(&ptp->ptp_lock); - if (bp->fw_cap & BNXT_FW_CAP_PTP_RTC) { + if (BNXT_PTP_USE_RTC(bp)) { bnxt_ptp_timecounter_init(bp, false); rc = bnxt_ptp_init_rtc(bp, phc_cfg); if (rc) goto out; } else { bnxt_ptp_timecounter_init(bp, true); + bnxt_ptp_adjfine_rtc(bp, 0); } ptp->ptp_info = bnxt_ptp_caps; From 22a825c541d775c1dbe7b2402786025acad6727b Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Wed, 8 Mar 2023 16:17:12 +0800 Subject: [PATCH 037/108] net/smc: fix NULL sndbuf_desc in smc_cdc_tx_handler() When performing a stress test on SMC-R by rmmod mlx5_ib driver during the wrk/nginx test, we found that there is a probability of triggering a panic while terminating all link groups. This issue dues to the race between smc_smcr_terminate_all() and smc_buf_create(). smc_smcr_terminate_all smc_buf_create /* init */ conn->sndbuf_desc = NULL; ... __smc_lgr_terminate smc_conn_kill smc_close_abort smc_cdc_get_slot_and_msg_send __softirqentry_text_start smc_wr_tx_process_cqe smc_cdc_tx_handler READ(conn->sndbuf_desc->len); /* panic dues to NULL sndbuf_desc */ conn->sndbuf_desc = xxx; This patch tries to fix the issue by always to check the sndbuf_desc before send any cdc msg, to make sure that no null pointer is seen during cqe processing. Fixes: 0b29ec643613 ("net/smc: immediate termination for SMCR link groups") Signed-off-by: D. Wythe Reviewed-by: Tony Lu Reviewed-by: Wenjia Zhang Link: https://lore.kernel.org/r/1678263432-17329-1-git-send-email-alibuda@linux.alibaba.com Signed-off-by: Jakub Kicinski --- net/smc/smc_cdc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 53f63bfbaf5f..89105e95b452 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -114,6 +114,9 @@ int smc_cdc_msg_send(struct smc_connection *conn, union smc_host_cursor cfed; int rc; + if (unlikely(!READ_ONCE(conn->sndbuf_desc))) + return -ENOBUFS; + smc_cdc_add_pending_send(conn, pend); conn->tx_cdc_seq++; From 1a9dc5610ef89d807acdcfbff93a558f341a44da Mon Sep 17 00:00:00 2001 From: Daniil Tatianin Date: Thu, 9 Mar 2023 23:15:56 +0300 Subject: [PATCH 038/108] qed/qed_dev: guard against a possible division by zero Previously we would divide total_left_rate by zero if num_vports happened to be 1 because non_requested_count is calculated as num_vports - req_count. Guard against this by validating num_vports at the beginning and returning an error otherwise. Found by Linux Verification Center (linuxtesting.org) with the SVACE static analysis tool. Fixes: bcd197c81f63 ("qed: Add vport WFQ configuration APIs") Signed-off-by: Daniil Tatianin Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230309201556.191392-1-d-tatianin@yandex-team.ru Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/qlogic/qed/qed_dev.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c index d61cd32ec3b6..86a93cac2647 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c @@ -5083,6 +5083,11 @@ static int qed_init_wfq_param(struct qed_hwfn *p_hwfn, num_vports = p_hwfn->qm_info.num_vports; + if (num_vports < 2) { + DP_NOTICE(p_hwfn, "Unexpected num_vports: %d\n", num_vports); + return -EINVAL; + } + /* Accounting for the vports which are configured for WFQ explicitly */ for (i = 0; i < num_vports; i++) { u32 tmp_speed; From feb03fd11c5616f3a47e4714d2f9917d0f1a2edd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ar=C4=B1n=C3=A7=20=C3=9CNAL?= Date: Fri, 10 Mar 2023 10:33:37 +0300 Subject: [PATCH 039/108] net: dsa: mt7530: remove now incorrect comment regarding port 5 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove now incorrect comment regarding port 5 as GMAC5. This is supposed to be supported since commit 38f790a80560 ("net: dsa: mt7530: Add support for port 5") under mt7530_setup_port5(). Fixes: 38f790a80560 ("net: dsa: mt7530: Add support for port 5") Signed-off-by: Arınç ÜNAL Link: https://lore.kernel.org/r/20230310073338.5836-1-arinc.unal@arinc9.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mt7530.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index a508402c4ecb..b1a79460df0e 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -2201,7 +2201,7 @@ mt7530_setup(struct dsa_switch *ds) mt7530_pll_setup(priv); - /* Enable Port 6 only; P5 as GMAC5 which currently is not supported */ + /* Enable port 6 */ val = mt7530_read(priv, MT7530_MHWTRAP); val &= ~MHWTRAP_P6_DIS & ~MHWTRAP_PHY_ACCESS; val |= MHWTRAP_MANUAL; From 0b086d76e7b011772b0ac214c6e5fd5816eff2df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ar=C4=B1n=C3=A7=20=C3=9CNAL?= Date: Fri, 10 Mar 2023 10:33:38 +0300 Subject: [PATCH 040/108] net: dsa: mt7530: set PLL frequency and trgmii only when trgmii is used MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As my testing on the MCM MT7530 switch on MT7621 SoC shows, setting the PLL frequency does not affect MII modes other than trgmii on port 5 and port 6. So the assumption is that the operation here called "setting the PLL frequency" actually sets the frequency of the TRGMII TX clock. Make it so that it and the rest of the trgmii setup run only when the trgmii mode is used. Tested rgmii and trgmii modes of port 6 on MCM MT7530 on MT7621AT Unielec U7621-06 and standalone MT7530 on MT7623NI Bananapi BPI-R2. Fixes: b8f126a8d543 ("net-next: dsa: add dsa support for Mediatek MT7530 switch") Tested-by: Arınç ÜNAL Signed-off-by: Arınç ÜNAL Link: https://lore.kernel.org/r/20230310073338.5836-2-arinc.unal@arinc9.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mt7530.c | 56 ++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index b1a79460df0e..c2d81b7a429d 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -430,8 +430,6 @@ mt7530_pad_clk_setup(struct dsa_switch *ds, phy_interface_t interface) switch (interface) { case PHY_INTERFACE_MODE_RGMII: trgint = 0; - /* PLL frequency: 125MHz */ - ncpo1 = 0x0c80; break; case PHY_INTERFACE_MODE_TRGMII: trgint = 1; @@ -462,38 +460,40 @@ mt7530_pad_clk_setup(struct dsa_switch *ds, phy_interface_t interface) mt7530_rmw(priv, MT7530_P6ECR, P6_INTF_MODE_MASK, P6_INTF_MODE(trgint)); - /* Lower Tx Driving for TRGMII path */ - for (i = 0 ; i < NUM_TRGMII_CTRL ; i++) - mt7530_write(priv, MT7530_TRGMII_TD_ODT(i), - TD_DM_DRVP(8) | TD_DM_DRVN(8)); + if (trgint) { + /* Lower Tx Driving for TRGMII path */ + for (i = 0 ; i < NUM_TRGMII_CTRL ; i++) + mt7530_write(priv, MT7530_TRGMII_TD_ODT(i), + TD_DM_DRVP(8) | TD_DM_DRVN(8)); - /* Disable MT7530 core and TRGMII Tx clocks */ - core_clear(priv, CORE_TRGMII_GSW_CLK_CG, - REG_GSWCK_EN | REG_TRGMIICK_EN); + /* Disable MT7530 core and TRGMII Tx clocks */ + core_clear(priv, CORE_TRGMII_GSW_CLK_CG, + REG_GSWCK_EN | REG_TRGMIICK_EN); - /* Setup the MT7530 TRGMII Tx Clock */ - core_write(priv, CORE_PLL_GROUP5, RG_LCDDS_PCW_NCPO1(ncpo1)); - core_write(priv, CORE_PLL_GROUP6, RG_LCDDS_PCW_NCPO0(0)); - core_write(priv, CORE_PLL_GROUP10, RG_LCDDS_SSC_DELTA(ssc_delta)); - core_write(priv, CORE_PLL_GROUP11, RG_LCDDS_SSC_DELTA1(ssc_delta)); - core_write(priv, CORE_PLL_GROUP4, - RG_SYSPLL_DDSFBK_EN | RG_SYSPLL_BIAS_EN | - RG_SYSPLL_BIAS_LPF_EN); - core_write(priv, CORE_PLL_GROUP2, - RG_SYSPLL_EN_NORMAL | RG_SYSPLL_VODEN | - RG_SYSPLL_POSDIV(1)); - core_write(priv, CORE_PLL_GROUP7, - RG_LCDDS_PCW_NCPO_CHG | RG_LCCDS_C(3) | - RG_LCDDS_PWDB | RG_LCDDS_ISO_EN); + /* Setup the MT7530 TRGMII Tx Clock */ + core_write(priv, CORE_PLL_GROUP5, RG_LCDDS_PCW_NCPO1(ncpo1)); + core_write(priv, CORE_PLL_GROUP6, RG_LCDDS_PCW_NCPO0(0)); + core_write(priv, CORE_PLL_GROUP10, RG_LCDDS_SSC_DELTA(ssc_delta)); + core_write(priv, CORE_PLL_GROUP11, RG_LCDDS_SSC_DELTA1(ssc_delta)); + core_write(priv, CORE_PLL_GROUP4, + RG_SYSPLL_DDSFBK_EN | RG_SYSPLL_BIAS_EN | + RG_SYSPLL_BIAS_LPF_EN); + core_write(priv, CORE_PLL_GROUP2, + RG_SYSPLL_EN_NORMAL | RG_SYSPLL_VODEN | + RG_SYSPLL_POSDIV(1)); + core_write(priv, CORE_PLL_GROUP7, + RG_LCDDS_PCW_NCPO_CHG | RG_LCCDS_C(3) | + RG_LCDDS_PWDB | RG_LCDDS_ISO_EN); - /* Enable MT7530 core and TRGMII Tx clocks */ - core_set(priv, CORE_TRGMII_GSW_CLK_CG, - REG_GSWCK_EN | REG_TRGMIICK_EN); - - if (!trgint) + /* Enable MT7530 core and TRGMII Tx clocks */ + core_set(priv, CORE_TRGMII_GSW_CLK_CG, + REG_GSWCK_EN | REG_TRGMIICK_EN); + } else { for (i = 0 ; i < NUM_TRGMII_CTRL; i++) mt7530_rmw(priv, MT7530_TRGMII_RD(i), RD_TAP_MASK, RD_TAP(16)); + } + return 0; } From 512dd354718b98c60d4ff6017ff8c9f66c10d03f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 10 Mar 2023 13:37:09 -0600 Subject: [PATCH 041/108] net: ipa: fix a surprising number of bad offsets A recent commit eliminated a hack that adjusted the offset used for many GSI registers. It became possible because we now specify all GSI register offsets explicitly for every version of IPA. Unfortunately, a large number of register offsets were *not* updated as they should have been in that commit. For IPA v4.5+, the offset for every GSI register *except* the two inter-EE interrupt masking registers were supposed to have been reduced by 0xd000. Tested-by: Luca Weiss Tested-by: Dmitry Baryshkov # SM8350-HDK Fixes: 59b12b1d27f3 ("net: ipa: kill gsi->virt_raw") Signed-off-by: Alex Elder Link: https://lore.kernel.org/r/20230310193709.1477102-1-elder@linaro.org Signed-off-by: Jakub Kicinski --- drivers/net/ipa/reg/gsi_reg-v4.5.c | 56 +++++++++++++++--------------- drivers/net/ipa/reg/gsi_reg-v4.9.c | 44 +++++++++++------------ 2 files changed, 50 insertions(+), 50 deletions(-) diff --git a/drivers/net/ipa/reg/gsi_reg-v4.5.c b/drivers/net/ipa/reg/gsi_reg-v4.5.c index 648b51b88d4e..2900e5c3ff88 100644 --- a/drivers/net/ipa/reg/gsi_reg-v4.5.c +++ b/drivers/net/ipa/reg/gsi_reg-v4.5.c @@ -137,17 +137,17 @@ REG_STRIDE(EV_CH_E_SCRATCH_1, ev_ch_e_scratch_1, 0x0001004c + 0x4000 * GSI_EE_AP, 0x80); REG_STRIDE(CH_C_DOORBELL_0, ch_c_doorbell_0, - 0x0001e000 + 0x4000 * GSI_EE_AP, 0x08); + 0x00011000 + 0x4000 * GSI_EE_AP, 0x08); REG_STRIDE(EV_CH_E_DOORBELL_0, ev_ch_e_doorbell_0, - 0x0001e100 + 0x4000 * GSI_EE_AP, 0x08); + 0x00011100 + 0x4000 * GSI_EE_AP, 0x08); static const u32 reg_gsi_status_fmask[] = { [ENABLED] = BIT(0), /* Bits 1-31 reserved */ }; -REG_FIELDS(GSI_STATUS, gsi_status, 0x0001f000 + 0x4000 * GSI_EE_AP); +REG_FIELDS(GSI_STATUS, gsi_status, 0x00012000 + 0x4000 * GSI_EE_AP); static const u32 reg_ch_cmd_fmask[] = { [CH_CHID] = GENMASK(7, 0), @@ -155,7 +155,7 @@ static const u32 reg_ch_cmd_fmask[] = { [CH_OPCODE] = GENMASK(31, 24), }; -REG_FIELDS(CH_CMD, ch_cmd, 0x0001f008 + 0x4000 * GSI_EE_AP); +REG_FIELDS(CH_CMD, ch_cmd, 0x00012008 + 0x4000 * GSI_EE_AP); static const u32 reg_ev_ch_cmd_fmask[] = { [EV_CHID] = GENMASK(7, 0), @@ -163,7 +163,7 @@ static const u32 reg_ev_ch_cmd_fmask[] = { [EV_OPCODE] = GENMASK(31, 24), }; -REG_FIELDS(EV_CH_CMD, ev_ch_cmd, 0x0001f010 + 0x4000 * GSI_EE_AP); +REG_FIELDS(EV_CH_CMD, ev_ch_cmd, 0x00012010 + 0x4000 * GSI_EE_AP); static const u32 reg_generic_cmd_fmask[] = { [GENERIC_OPCODE] = GENMASK(4, 0), @@ -172,7 +172,7 @@ static const u32 reg_generic_cmd_fmask[] = { /* Bits 14-31 reserved */ }; -REG_FIELDS(GENERIC_CMD, generic_cmd, 0x0001f018 + 0x4000 * GSI_EE_AP); +REG_FIELDS(GENERIC_CMD, generic_cmd, 0x00012018 + 0x4000 * GSI_EE_AP); static const u32 reg_hw_param_2_fmask[] = { [IRAM_SIZE] = GENMASK(2, 0), @@ -188,58 +188,58 @@ static const u32 reg_hw_param_2_fmask[] = { [GSI_USE_INTER_EE] = BIT(31), }; -REG_FIELDS(HW_PARAM_2, hw_param_2, 0x0001f040 + 0x4000 * GSI_EE_AP); +REG_FIELDS(HW_PARAM_2, hw_param_2, 0x00012040 + 0x4000 * GSI_EE_AP); -REG(CNTXT_TYPE_IRQ, cntxt_type_irq, 0x0001f080 + 0x4000 * GSI_EE_AP); +REG(CNTXT_TYPE_IRQ, cntxt_type_irq, 0x00012080 + 0x4000 * GSI_EE_AP); -REG(CNTXT_TYPE_IRQ_MSK, cntxt_type_irq_msk, 0x0001f088 + 0x4000 * GSI_EE_AP); +REG(CNTXT_TYPE_IRQ_MSK, cntxt_type_irq_msk, 0x00012088 + 0x4000 * GSI_EE_AP); -REG(CNTXT_SRC_CH_IRQ, cntxt_src_ch_irq, 0x0001f090 + 0x4000 * GSI_EE_AP); +REG(CNTXT_SRC_CH_IRQ, cntxt_src_ch_irq, 0x00012090 + 0x4000 * GSI_EE_AP); -REG(CNTXT_SRC_EV_CH_IRQ, cntxt_src_ev_ch_irq, 0x0001f094 + 0x4000 * GSI_EE_AP); +REG(CNTXT_SRC_EV_CH_IRQ, cntxt_src_ev_ch_irq, 0x00012094 + 0x4000 * GSI_EE_AP); REG(CNTXT_SRC_CH_IRQ_MSK, cntxt_src_ch_irq_msk, - 0x0001f098 + 0x4000 * GSI_EE_AP); + 0x00012098 + 0x4000 * GSI_EE_AP); REG(CNTXT_SRC_EV_CH_IRQ_MSK, cntxt_src_ev_ch_irq_msk, - 0x0001f09c + 0x4000 * GSI_EE_AP); + 0x0001209c + 0x4000 * GSI_EE_AP); REG(CNTXT_SRC_CH_IRQ_CLR, cntxt_src_ch_irq_clr, - 0x0001f0a0 + 0x4000 * GSI_EE_AP); + 0x000120a0 + 0x4000 * GSI_EE_AP); REG(CNTXT_SRC_EV_CH_IRQ_CLR, cntxt_src_ev_ch_irq_clr, - 0x0001f0a4 + 0x4000 * GSI_EE_AP); + 0x000120a4 + 0x4000 * GSI_EE_AP); -REG(CNTXT_SRC_IEOB_IRQ, cntxt_src_ieob_irq, 0x0001f0b0 + 0x4000 * GSI_EE_AP); +REG(CNTXT_SRC_IEOB_IRQ, cntxt_src_ieob_irq, 0x000120b0 + 0x4000 * GSI_EE_AP); REG(CNTXT_SRC_IEOB_IRQ_MSK, cntxt_src_ieob_irq_msk, - 0x0001f0b8 + 0x4000 * GSI_EE_AP); + 0x000120b8 + 0x4000 * GSI_EE_AP); REG(CNTXT_SRC_IEOB_IRQ_CLR, cntxt_src_ieob_irq_clr, - 0x0001f0c0 + 0x4000 * GSI_EE_AP); + 0x000120c0 + 0x4000 * GSI_EE_AP); -REG(CNTXT_GLOB_IRQ_STTS, cntxt_glob_irq_stts, 0x0001f100 + 0x4000 * GSI_EE_AP); +REG(CNTXT_GLOB_IRQ_STTS, cntxt_glob_irq_stts, 0x00012100 + 0x4000 * GSI_EE_AP); -REG(CNTXT_GLOB_IRQ_EN, cntxt_glob_irq_en, 0x0001f108 + 0x4000 * GSI_EE_AP); +REG(CNTXT_GLOB_IRQ_EN, cntxt_glob_irq_en, 0x00012108 + 0x4000 * GSI_EE_AP); -REG(CNTXT_GLOB_IRQ_CLR, cntxt_glob_irq_clr, 0x0001f110 + 0x4000 * GSI_EE_AP); +REG(CNTXT_GLOB_IRQ_CLR, cntxt_glob_irq_clr, 0x00012110 + 0x4000 * GSI_EE_AP); -REG(CNTXT_GSI_IRQ_STTS, cntxt_gsi_irq_stts, 0x0001f118 + 0x4000 * GSI_EE_AP); +REG(CNTXT_GSI_IRQ_STTS, cntxt_gsi_irq_stts, 0x00012118 + 0x4000 * GSI_EE_AP); -REG(CNTXT_GSI_IRQ_EN, cntxt_gsi_irq_en, 0x0001f120 + 0x4000 * GSI_EE_AP); +REG(CNTXT_GSI_IRQ_EN, cntxt_gsi_irq_en, 0x00012120 + 0x4000 * GSI_EE_AP); -REG(CNTXT_GSI_IRQ_CLR, cntxt_gsi_irq_clr, 0x0001f128 + 0x4000 * GSI_EE_AP); +REG(CNTXT_GSI_IRQ_CLR, cntxt_gsi_irq_clr, 0x00012128 + 0x4000 * GSI_EE_AP); static const u32 reg_cntxt_intset_fmask[] = { [INTYPE] = BIT(0) /* Bits 1-31 reserved */ }; -REG_FIELDS(CNTXT_INTSET, cntxt_intset, 0x0001f180 + 0x4000 * GSI_EE_AP); +REG_FIELDS(CNTXT_INTSET, cntxt_intset, 0x00012180 + 0x4000 * GSI_EE_AP); -REG_FIELDS(ERROR_LOG, error_log, 0x0001f200 + 0x4000 * GSI_EE_AP); +REG_FIELDS(ERROR_LOG, error_log, 0x00012200 + 0x4000 * GSI_EE_AP); -REG(ERROR_LOG_CLR, error_log_clr, 0x0001f210 + 0x4000 * GSI_EE_AP); +REG(ERROR_LOG_CLR, error_log_clr, 0x00012210 + 0x4000 * GSI_EE_AP); static const u32 reg_cntxt_scratch_0_fmask[] = { [INTER_EE_RESULT] = GENMASK(2, 0), @@ -248,7 +248,7 @@ static const u32 reg_cntxt_scratch_0_fmask[] = { /* Bits 8-31 reserved */ }; -REG_FIELDS(CNTXT_SCRATCH_0, cntxt_scratch_0, 0x0001f400 + 0x4000 * GSI_EE_AP); +REG_FIELDS(CNTXT_SCRATCH_0, cntxt_scratch_0, 0x00012400 + 0x4000 * GSI_EE_AP); static const struct reg *reg_array[] = { [INTER_EE_SRC_CH_IRQ_MSK] = ®_inter_ee_src_ch_irq_msk, diff --git a/drivers/net/ipa/reg/gsi_reg-v4.9.c b/drivers/net/ipa/reg/gsi_reg-v4.9.c index 4bf45d264d6b..8b5d95425a76 100644 --- a/drivers/net/ipa/reg/gsi_reg-v4.9.c +++ b/drivers/net/ipa/reg/gsi_reg-v4.9.c @@ -27,7 +27,7 @@ static const u32 reg_ch_c_cntxt_0_fmask[] = { }; REG_STRIDE_FIELDS(CH_C_CNTXT_0, ch_c_cntxt_0, - 0x0001c000 + 0x4000 * GSI_EE_AP, 0x80); + 0x0000f000 + 0x4000 * GSI_EE_AP, 0x80); static const u32 reg_ch_c_cntxt_1_fmask[] = { [CH_R_LENGTH] = GENMASK(19, 0), @@ -35,11 +35,11 @@ static const u32 reg_ch_c_cntxt_1_fmask[] = { }; REG_STRIDE_FIELDS(CH_C_CNTXT_1, ch_c_cntxt_1, - 0x0001c004 + 0x4000 * GSI_EE_AP, 0x80); + 0x0000f004 + 0x4000 * GSI_EE_AP, 0x80); -REG_STRIDE(CH_C_CNTXT_2, ch_c_cntxt_2, 0x0001c008 + 0x4000 * GSI_EE_AP, 0x80); +REG_STRIDE(CH_C_CNTXT_2, ch_c_cntxt_2, 0x0000f008 + 0x4000 * GSI_EE_AP, 0x80); -REG_STRIDE(CH_C_CNTXT_3, ch_c_cntxt_3, 0x0001c00c + 0x4000 * GSI_EE_AP, 0x80); +REG_STRIDE(CH_C_CNTXT_3, ch_c_cntxt_3, 0x0000f00c + 0x4000 * GSI_EE_AP, 0x80); static const u32 reg_ch_c_qos_fmask[] = { [WRR_WEIGHT] = GENMASK(3, 0), @@ -53,7 +53,7 @@ static const u32 reg_ch_c_qos_fmask[] = { /* Bits 25-31 reserved */ }; -REG_STRIDE_FIELDS(CH_C_QOS, ch_c_qos, 0x0001c05c + 0x4000 * GSI_EE_AP, 0x80); +REG_STRIDE_FIELDS(CH_C_QOS, ch_c_qos, 0x0000f05c + 0x4000 * GSI_EE_AP, 0x80); static const u32 reg_error_log_fmask[] = { [ERR_ARG3] = GENMASK(3, 0), @@ -67,16 +67,16 @@ static const u32 reg_error_log_fmask[] = { }; REG_STRIDE(CH_C_SCRATCH_0, ch_c_scratch_0, - 0x0001c060 + 0x4000 * GSI_EE_AP, 0x80); + 0x0000f060 + 0x4000 * GSI_EE_AP, 0x80); REG_STRIDE(CH_C_SCRATCH_1, ch_c_scratch_1, - 0x0001c064 + 0x4000 * GSI_EE_AP, 0x80); + 0x0000f064 + 0x4000 * GSI_EE_AP, 0x80); REG_STRIDE(CH_C_SCRATCH_2, ch_c_scratch_2, - 0x0001c068 + 0x4000 * GSI_EE_AP, 0x80); + 0x0000f068 + 0x4000 * GSI_EE_AP, 0x80); REG_STRIDE(CH_C_SCRATCH_3, ch_c_scratch_3, - 0x0001c06c + 0x4000 * GSI_EE_AP, 0x80); + 0x0000f06c + 0x4000 * GSI_EE_AP, 0x80); static const u32 reg_ev_ch_e_cntxt_0_fmask[] = { [EV_CHTYPE] = GENMASK(3, 0), @@ -89,23 +89,23 @@ static const u32 reg_ev_ch_e_cntxt_0_fmask[] = { }; REG_STRIDE_FIELDS(EV_CH_E_CNTXT_0, ev_ch_e_cntxt_0, - 0x0001d000 + 0x4000 * GSI_EE_AP, 0x80); + 0x00010000 + 0x4000 * GSI_EE_AP, 0x80); static const u32 reg_ev_ch_e_cntxt_1_fmask[] = { [R_LENGTH] = GENMASK(15, 0), }; REG_STRIDE_FIELDS(EV_CH_E_CNTXT_1, ev_ch_e_cntxt_1, - 0x0001d004 + 0x4000 * GSI_EE_AP, 0x80); + 0x00010004 + 0x4000 * GSI_EE_AP, 0x80); REG_STRIDE(EV_CH_E_CNTXT_2, ev_ch_e_cntxt_2, - 0x0001d008 + 0x4000 * GSI_EE_AP, 0x80); + 0x00010008 + 0x4000 * GSI_EE_AP, 0x80); REG_STRIDE(EV_CH_E_CNTXT_3, ev_ch_e_cntxt_3, - 0x0001d00c + 0x4000 * GSI_EE_AP, 0x80); + 0x0001000c + 0x4000 * GSI_EE_AP, 0x80); REG_STRIDE(EV_CH_E_CNTXT_4, ev_ch_e_cntxt_4, - 0x0001d010 + 0x4000 * GSI_EE_AP, 0x80); + 0x00010010 + 0x4000 * GSI_EE_AP, 0x80); static const u32 reg_ev_ch_e_cntxt_8_fmask[] = { [EV_MODT] = GENMASK(15, 0), @@ -114,28 +114,28 @@ static const u32 reg_ev_ch_e_cntxt_8_fmask[] = { }; REG_STRIDE_FIELDS(EV_CH_E_CNTXT_8, ev_ch_e_cntxt_8, - 0x0001d020 + 0x4000 * GSI_EE_AP, 0x80); + 0x00010020 + 0x4000 * GSI_EE_AP, 0x80); REG_STRIDE(EV_CH_E_CNTXT_9, ev_ch_e_cntxt_9, - 0x0001d024 + 0x4000 * GSI_EE_AP, 0x80); + 0x00010024 + 0x4000 * GSI_EE_AP, 0x80); REG_STRIDE(EV_CH_E_CNTXT_10, ev_ch_e_cntxt_10, - 0x0001d028 + 0x4000 * GSI_EE_AP, 0x80); + 0x00010028 + 0x4000 * GSI_EE_AP, 0x80); REG_STRIDE(EV_CH_E_CNTXT_11, ev_ch_e_cntxt_11, - 0x0001d02c + 0x4000 * GSI_EE_AP, 0x80); + 0x0001002c + 0x4000 * GSI_EE_AP, 0x80); REG_STRIDE(EV_CH_E_CNTXT_12, ev_ch_e_cntxt_12, - 0x0001d030 + 0x4000 * GSI_EE_AP, 0x80); + 0x00010030 + 0x4000 * GSI_EE_AP, 0x80); REG_STRIDE(EV_CH_E_CNTXT_13, ev_ch_e_cntxt_13, - 0x0001d034 + 0x4000 * GSI_EE_AP, 0x80); + 0x00010034 + 0x4000 * GSI_EE_AP, 0x80); REG_STRIDE(EV_CH_E_SCRATCH_0, ev_ch_e_scratch_0, - 0x0001d048 + 0x4000 * GSI_EE_AP, 0x80); + 0x00010048 + 0x4000 * GSI_EE_AP, 0x80); REG_STRIDE(EV_CH_E_SCRATCH_1, ev_ch_e_scratch_1, - 0x0001d04c + 0x4000 * GSI_EE_AP, 0x80); + 0x0001004c + 0x4000 * GSI_EE_AP, 0x80); REG_STRIDE(CH_C_DOORBELL_0, ch_c_doorbell_0, 0x00011000 + 0x4000 * GSI_EE_AP, 0x08); From 28e8cabe80f3e6e3c98121576eda898eeb20f1b1 Mon Sep 17 00:00:00 2001 From: Kristian Overskeid Date: Tue, 7 Mar 2023 14:32:29 +0100 Subject: [PATCH 042/108] net: hsr: Don't log netdev_err message on unknown prp dst node If no frames has been exchanged with a node for HSR_NODE_FORGET_TIME, the node will be deleted from the node_db list. If a frame is sent to the node after it is deleted, a netdev_err message for each slave interface is produced. This should not happen with dan nodes because of supervision frames, but can happen often with san nodes, which clutters the kernel log. Since the hsr protocol does not support sans, this is only relevant for the prp protocol. Signed-off-by: Kristian Overskeid Signed-off-by: David S. Miller --- net/hsr/hsr_framereg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 00db74d96583..865eda39d601 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -415,7 +415,7 @@ void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb, node_dst = find_node_by_addr_A(&port->hsr->node_db, eth_hdr(skb)->h_dest); if (!node_dst) { - if (net_ratelimit()) + if (net_ratelimit() && port->hsr->prot_version != PRP_V1) netdev_err(skb->dev, "%s: Unknown node\n", __func__); return; } From 248401cb2c4612d83eb0c352ee8103b78b8eb365 Mon Sep 17 00:00:00 2001 From: Dave Ertman Date: Fri, 10 Mar 2023 11:48:33 -0800 Subject: [PATCH 043/108] ice: avoid bonding causing auxiliary plug/unplug under RTNL lock RDMA is not supported in ice on a PF that has been added to a bonded interface. To enforce this, when an interface enters a bond, we unplug the auxiliary device that supports RDMA functionality. This unplug currently happens in the context of handling the netdev bonding event. This event is sent to the ice driver under RTNL context. This is causing a deadlock where the RDMA driver is waiting for the RTNL lock to complete the removal. Defer the unplugging/re-plugging of the auxiliary device to the service task so that it is not performed under the RTNL lock context. Cc: stable@vger.kernel.org # 6.1.x Reported-by: Jaroslav Pulchart Link: https://lore.kernel.org/netdev/CAK8fFZ6A_Gphw_3-QMGKEFQk=sfCw1Qmq0TVZK3rtAi7vb621A@mail.gmail.com/ Fixes: 5cb1ebdbc434 ("ice: Fix race condition during interface enslave") Fixes: 4eace75e0853 ("RDMA/irdma: Report the correct link speed") Signed-off-by: Dave Ertman Tested-by: Arpana Arland (A Contingent worker at Intel) Signed-off-by: Tony Nguyen Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/20230310194833.3074601-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice.h | 14 +++++--------- drivers/net/ethernet/intel/ice/ice_main.c | 19 ++++++++----------- 2 files changed, 13 insertions(+), 20 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index b0e29e342401..e809249500e1 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -509,6 +509,7 @@ enum ice_pf_flags { ICE_FLAG_VF_VLAN_PRUNING, ICE_FLAG_LINK_LENIENT_MODE_ENA, ICE_FLAG_PLUG_AUX_DEV, + ICE_FLAG_UNPLUG_AUX_DEV, ICE_FLAG_MTU_CHANGED, ICE_FLAG_GNSS, /* GNSS successfully initialized */ ICE_PF_FLAGS_NBITS /* must be last */ @@ -955,16 +956,11 @@ static inline void ice_set_rdma_cap(struct ice_pf *pf) */ static inline void ice_clear_rdma_cap(struct ice_pf *pf) { - /* We can directly unplug aux device here only if the flag bit - * ICE_FLAG_PLUG_AUX_DEV is not set because ice_unplug_aux_dev() - * could race with ice_plug_aux_dev() called from - * ice_service_task(). In this case we only clear that bit now and - * aux device will be unplugged later once ice_plug_aux_device() - * called from ice_service_task() finishes (see ice_service_task()). + /* defer unplug to service task to avoid RTNL lock and + * clear PLUG bit so that pending plugs don't interfere */ - if (!test_and_clear_bit(ICE_FLAG_PLUG_AUX_DEV, pf->flags)) - ice_unplug_aux_dev(pf); - + clear_bit(ICE_FLAG_PLUG_AUX_DEV, pf->flags); + set_bit(ICE_FLAG_UNPLUG_AUX_DEV, pf->flags); clear_bit(ICE_FLAG_RDMA_ENA, pf->flags); } #endif /* _ICE_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 567694bf098b..c233464b8f6b 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -2316,18 +2316,15 @@ static void ice_service_task(struct work_struct *work) } } - if (test_bit(ICE_FLAG_PLUG_AUX_DEV, pf->flags)) { - /* Plug aux device per request */ - ice_plug_aux_dev(pf); + /* unplug aux dev per request, if an unplug request came in + * while processing a plug request, this will handle it + */ + if (test_and_clear_bit(ICE_FLAG_UNPLUG_AUX_DEV, pf->flags)) + ice_unplug_aux_dev(pf); - /* Mark plugging as done but check whether unplug was - * requested during ice_plug_aux_dev() call - * (e.g. from ice_clear_rdma_cap()) and if so then - * plug aux device. - */ - if (!test_and_clear_bit(ICE_FLAG_PLUG_AUX_DEV, pf->flags)) - ice_unplug_aux_dev(pf); - } + /* Plug aux device per request */ + if (test_and_clear_bit(ICE_FLAG_PLUG_AUX_DEV, pf->flags)) + ice_plug_aux_dev(pf); if (test_and_clear_bit(ICE_FLAG_MTU_CHANGED, pf->flags)) { struct iidc_event *event; From 4b397c06cb987935b1b097336532aa6b4210e091 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 10 Mar 2023 19:11:09 +0000 Subject: [PATCH 044/108] net: tunnels: annotate lockless accesses to dev->needed_headroom IP tunnels can apparently update dev->needed_headroom in their xmit path. This patch takes care of three tunnels xmit, and also the core LL_RESERVED_SPACE() and LL_RESERVED_SPACE_EXTRA() helpers. More changes might be needed for completeness. BUG: KCSAN: data-race in ip_tunnel_xmit / ip_tunnel_xmit read to 0xffff88815b9da0ec of 2 bytes by task 888 on cpu 1: ip_tunnel_xmit+0x1270/0x1730 net/ipv4/ip_tunnel.c:803 __gre_xmit net/ipv4/ip_gre.c:469 [inline] ipgre_xmit+0x516/0x570 net/ipv4/ip_gre.c:661 __netdev_start_xmit include/linux/netdevice.h:4881 [inline] netdev_start_xmit include/linux/netdevice.h:4895 [inline] xmit_one net/core/dev.c:3580 [inline] dev_hard_start_xmit+0x127/0x400 net/core/dev.c:3596 __dev_queue_xmit+0x1007/0x1eb0 net/core/dev.c:4246 dev_queue_xmit include/linux/netdevice.h:3051 [inline] neigh_direct_output+0x17/0x20 net/core/neighbour.c:1623 neigh_output include/net/neighbour.h:546 [inline] ip_finish_output2+0x740/0x840 net/ipv4/ip_output.c:228 ip_finish_output+0xf4/0x240 net/ipv4/ip_output.c:316 NF_HOOK_COND include/linux/netfilter.h:291 [inline] ip_output+0xe5/0x1b0 net/ipv4/ip_output.c:430 dst_output include/net/dst.h:444 [inline] ip_local_out+0x64/0x80 net/ipv4/ip_output.c:126 iptunnel_xmit+0x34a/0x4b0 net/ipv4/ip_tunnel_core.c:82 ip_tunnel_xmit+0x1451/0x1730 net/ipv4/ip_tunnel.c:813 __gre_xmit net/ipv4/ip_gre.c:469 [inline] ipgre_xmit+0x516/0x570 net/ipv4/ip_gre.c:661 __netdev_start_xmit include/linux/netdevice.h:4881 [inline] netdev_start_xmit include/linux/netdevice.h:4895 [inline] xmit_one net/core/dev.c:3580 [inline] dev_hard_start_xmit+0x127/0x400 net/core/dev.c:3596 __dev_queue_xmit+0x1007/0x1eb0 net/core/dev.c:4246 dev_queue_xmit include/linux/netdevice.h:3051 [inline] neigh_direct_output+0x17/0x20 net/core/neighbour.c:1623 neigh_output include/net/neighbour.h:546 [inline] ip_finish_output2+0x740/0x840 net/ipv4/ip_output.c:228 ip_finish_output+0xf4/0x240 net/ipv4/ip_output.c:316 NF_HOOK_COND include/linux/netfilter.h:291 [inline] ip_output+0xe5/0x1b0 net/ipv4/ip_output.c:430 dst_output include/net/dst.h:444 [inline] ip_local_out+0x64/0x80 net/ipv4/ip_output.c:126 iptunnel_xmit+0x34a/0x4b0 net/ipv4/ip_tunnel_core.c:82 ip_tunnel_xmit+0x1451/0x1730 net/ipv4/ip_tunnel.c:813 __gre_xmit net/ipv4/ip_gre.c:469 [inline] ipgre_xmit+0x516/0x570 net/ipv4/ip_gre.c:661 __netdev_start_xmit include/linux/netdevice.h:4881 [inline] netdev_start_xmit include/linux/netdevice.h:4895 [inline] xmit_one net/core/dev.c:3580 [inline] dev_hard_start_xmit+0x127/0x400 net/core/dev.c:3596 __dev_queue_xmit+0x1007/0x1eb0 net/core/dev.c:4246 dev_queue_xmit include/linux/netdevice.h:3051 [inline] neigh_direct_output+0x17/0x20 net/core/neighbour.c:1623 neigh_output include/net/neighbour.h:546 [inline] ip_finish_output2+0x740/0x840 net/ipv4/ip_output.c:228 ip_finish_output+0xf4/0x240 net/ipv4/ip_output.c:316 NF_HOOK_COND include/linux/netfilter.h:291 [inline] ip_output+0xe5/0x1b0 net/ipv4/ip_output.c:430 dst_output include/net/dst.h:444 [inline] ip_local_out+0x64/0x80 net/ipv4/ip_output.c:126 iptunnel_xmit+0x34a/0x4b0 net/ipv4/ip_tunnel_core.c:82 ip_tunnel_xmit+0x1451/0x1730 net/ipv4/ip_tunnel.c:813 __gre_xmit net/ipv4/ip_gre.c:469 [inline] ipgre_xmit+0x516/0x570 net/ipv4/ip_gre.c:661 __netdev_start_xmit include/linux/netdevice.h:4881 [inline] netdev_start_xmit include/linux/netdevice.h:4895 [inline] xmit_one net/core/dev.c:3580 [inline] dev_hard_start_xmit+0x127/0x400 net/core/dev.c:3596 __dev_queue_xmit+0x1007/0x1eb0 net/core/dev.c:4246 dev_queue_xmit include/linux/netdevice.h:3051 [inline] neigh_direct_output+0x17/0x20 net/core/neighbour.c:1623 neigh_output include/net/neighbour.h:546 [inline] ip_finish_output2+0x740/0x840 net/ipv4/ip_output.c:228 ip_finish_output+0xf4/0x240 net/ipv4/ip_output.c:316 NF_HOOK_COND include/linux/netfilter.h:291 [inline] ip_output+0xe5/0x1b0 net/ipv4/ip_output.c:430 dst_output include/net/dst.h:444 [inline] ip_local_out+0x64/0x80 net/ipv4/ip_output.c:126 iptunnel_xmit+0x34a/0x4b0 net/ipv4/ip_tunnel_core.c:82 ip_tunnel_xmit+0x1451/0x1730 net/ipv4/ip_tunnel.c:813 __gre_xmit net/ipv4/ip_gre.c:469 [inline] ipgre_xmit+0x516/0x570 net/ipv4/ip_gre.c:661 __netdev_start_xmit include/linux/netdevice.h:4881 [inline] netdev_start_xmit include/linux/netdevice.h:4895 [inline] xmit_one net/core/dev.c:3580 [inline] dev_hard_start_xmit+0x127/0x400 net/core/dev.c:3596 __dev_queue_xmit+0x1007/0x1eb0 net/core/dev.c:4246 dev_queue_xmit include/linux/netdevice.h:3051 [inline] neigh_direct_output+0x17/0x20 net/core/neighbour.c:1623 neigh_output include/net/neighbour.h:546 [inline] ip_finish_output2+0x740/0x840 net/ipv4/ip_output.c:228 ip_finish_output+0xf4/0x240 net/ipv4/ip_output.c:316 NF_HOOK_COND include/linux/netfilter.h:291 [inline] ip_output+0xe5/0x1b0 net/ipv4/ip_output.c:430 dst_output include/net/dst.h:444 [inline] ip_local_out+0x64/0x80 net/ipv4/ip_output.c:126 iptunnel_xmit+0x34a/0x4b0 net/ipv4/ip_tunnel_core.c:82 ip_tunnel_xmit+0x1451/0x1730 net/ipv4/ip_tunnel.c:813 __gre_xmit net/ipv4/ip_gre.c:469 [inline] ipgre_xmit+0x516/0x570 net/ipv4/ip_gre.c:661 __netdev_start_xmit include/linux/netdevice.h:4881 [inline] netdev_start_xmit include/linux/netdevice.h:4895 [inline] xmit_one net/core/dev.c:3580 [inline] dev_hard_start_xmit+0x127/0x400 net/core/dev.c:3596 __dev_queue_xmit+0x1007/0x1eb0 net/core/dev.c:4246 dev_queue_xmit include/linux/netdevice.h:3051 [inline] neigh_direct_output+0x17/0x20 net/core/neighbour.c:1623 neigh_output include/net/neighbour.h:546 [inline] ip_finish_output2+0x740/0x840 net/ipv4/ip_output.c:228 ip_finish_output+0xf4/0x240 net/ipv4/ip_output.c:316 NF_HOOK_COND include/linux/netfilter.h:291 [inline] ip_output+0xe5/0x1b0 net/ipv4/ip_output.c:430 dst_output include/net/dst.h:444 [inline] ip_local_out+0x64/0x80 net/ipv4/ip_output.c:126 iptunnel_xmit+0x34a/0x4b0 net/ipv4/ip_tunnel_core.c:82 ip_tunnel_xmit+0x1451/0x1730 net/ipv4/ip_tunnel.c:813 __gre_xmit net/ipv4/ip_gre.c:469 [inline] ipgre_xmit+0x516/0x570 net/ipv4/ip_gre.c:661 __netdev_start_xmit include/linux/netdevice.h:4881 [inline] netdev_start_xmit include/linux/netdevice.h:4895 [inline] xmit_one net/core/dev.c:3580 [inline] dev_hard_start_xmit+0x127/0x400 net/core/dev.c:3596 __dev_queue_xmit+0x1007/0x1eb0 net/core/dev.c:4246 write to 0xffff88815b9da0ec of 2 bytes by task 2379 on cpu 0: ip_tunnel_xmit+0x1294/0x1730 net/ipv4/ip_tunnel.c:804 __gre_xmit net/ipv4/ip_gre.c:469 [inline] ipgre_xmit+0x516/0x570 net/ipv4/ip_gre.c:661 __netdev_start_xmit include/linux/netdevice.h:4881 [inline] netdev_start_xmit include/linux/netdevice.h:4895 [inline] xmit_one net/core/dev.c:3580 [inline] dev_hard_start_xmit+0x127/0x400 net/core/dev.c:3596 __dev_queue_xmit+0x1007/0x1eb0 net/core/dev.c:4246 dev_queue_xmit include/linux/netdevice.h:3051 [inline] neigh_direct_output+0x17/0x20 net/core/neighbour.c:1623 neigh_output include/net/neighbour.h:546 [inline] ip6_finish_output2+0x9bc/0xc50 net/ipv6/ip6_output.c:134 __ip6_finish_output net/ipv6/ip6_output.c:195 [inline] ip6_finish_output+0x39a/0x4e0 net/ipv6/ip6_output.c:206 NF_HOOK_COND include/linux/netfilter.h:291 [inline] ip6_output+0xeb/0x220 net/ipv6/ip6_output.c:227 dst_output include/net/dst.h:444 [inline] NF_HOOK include/linux/netfilter.h:302 [inline] mld_sendpack+0x438/0x6a0 net/ipv6/mcast.c:1820 mld_send_cr net/ipv6/mcast.c:2121 [inline] mld_ifc_work+0x519/0x7b0 net/ipv6/mcast.c:2653 process_one_work+0x3e6/0x750 kernel/workqueue.c:2390 worker_thread+0x5f2/0xa10 kernel/workqueue.c:2537 kthread+0x1ac/0x1e0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308 value changed: 0x0dd4 -> 0x0e14 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 2379 Comm: kworker/0:0 Not tainted 6.3.0-rc1-syzkaller-00002-g8ca09d5fa354-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/02/2023 Workqueue: mld mld_ifc_work Fixes: 8eb30be0352d ("ipv6: Create ip6_tnl_xmit") Reported-by: syzbot Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20230310191109.2384387-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 6 ++++-- net/ipv4/ip_tunnel.c | 12 ++++++------ net/ipv6/ip6_tunnel.c | 4 ++-- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6a14b7b11766..470085b121d3 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -297,9 +297,11 @@ struct hh_cache { * relationship HH alignment <= LL alignment. */ #define LL_RESERVED_SPACE(dev) \ - ((((dev)->hard_header_len+(dev)->needed_headroom)&~(HH_DATA_MOD - 1)) + HH_DATA_MOD) + ((((dev)->hard_header_len + READ_ONCE((dev)->needed_headroom)) \ + & ~(HH_DATA_MOD - 1)) + HH_DATA_MOD) #define LL_RESERVED_SPACE_EXTRA(dev,extra) \ - ((((dev)->hard_header_len+(dev)->needed_headroom+(extra))&~(HH_DATA_MOD - 1)) + HH_DATA_MOD) + ((((dev)->hard_header_len + READ_ONCE((dev)->needed_headroom) + (extra)) \ + & ~(HH_DATA_MOD - 1)) + HH_DATA_MOD) struct header_ops { int (*create) (struct sk_buff *skb, struct net_device *dev, diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index de90b09dfe78..2541083d49ad 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -614,10 +614,10 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; - if (headroom > dev->needed_headroom) - dev->needed_headroom = headroom; + if (headroom > READ_ONCE(dev->needed_headroom)) + WRITE_ONCE(dev->needed_headroom, headroom); - if (skb_cow_head(skb, dev->needed_headroom)) { + if (skb_cow_head(skb, READ_ONCE(dev->needed_headroom))) { ip_rt_put(rt); goto tx_dropped; } @@ -800,10 +800,10 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr) + rt->dst.header_len + ip_encap_hlen(&tunnel->encap); - if (max_headroom > dev->needed_headroom) - dev->needed_headroom = max_headroom; + if (max_headroom > READ_ONCE(dev->needed_headroom)) + WRITE_ONCE(dev->needed_headroom, max_headroom); - if (skb_cow_head(skb, dev->needed_headroom)) { + if (skb_cow_head(skb, READ_ONCE(dev->needed_headroom))) { ip_rt_put(rt); DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 47b6607a1370..5e80e517f071 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1240,8 +1240,8 @@ route_lookup: */ max_headroom = LL_RESERVED_SPACE(dst->dev) + sizeof(struct ipv6hdr) + dst->header_len + t->hlen; - if (max_headroom > dev->needed_headroom) - dev->needed_headroom = max_headroom; + if (max_headroom > READ_ONCE(dev->needed_headroom)) + WRITE_ONCE(dev->needed_headroom, max_headroom); err = ip6_tnl_encap(skb, t, &proto, fl6); if (err) From c22c3bbf351e4ce905f082649cffa1ff893ea8c1 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 11 Mar 2023 19:34:45 +0100 Subject: [PATCH 045/108] net: phy: smsc: bail out in lan87xx_read_status if genphy_read_status fails If genphy_read_status fails then further access to the PHY may result in unpredictable behavior. To prevent this bail out immediately if genphy_read_status fails. Fixes: 4223dbffed9f ("net: phy: smsc: Re-enable EDPD mode for LAN87xx") Signed-off-by: Heiner Kallweit Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/026aa4f2-36f5-1c10-ab9f-cdb17dda6ac4@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/smsc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/smsc.c b/drivers/net/phy/smsc.c index 00d9eff91dcf..df2c5435c5c4 100644 --- a/drivers/net/phy/smsc.c +++ b/drivers/net/phy/smsc.c @@ -199,8 +199,11 @@ static int lan95xx_config_aneg_ext(struct phy_device *phydev) static int lan87xx_read_status(struct phy_device *phydev) { struct smsc_phy_priv *priv = phydev->priv; + int err; - int err = genphy_read_status(phydev); + err = genphy_read_status(phydev); + if (err) + return err; if (!phydev->link && priv->energy_enable && phydev->irq == PHY_POLL) { /* Disable EDPD to wake up PHY */ From d9ba9934285514f1f95d96326a82398a22dc77f2 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 11 Mar 2023 19:19:03 -0800 Subject: [PATCH 046/108] tcp: Fix bind() conflict check for dual-stack wildcard address. Paul Holzinger reported [0] that commit 5456262d2baa ("net: Fix incorrect address comparison when searching for a bind2 bucket") introduced a bind() regression. Paul also gave a nice repro that calls two types of bind() on the same port, both of which now succeed, but the second call should fail: bind(fd1, ::, port) + bind(fd2, 127.0.0.1, port) The cited commit added address family tests in three functions to fix the uninit-value KMSAN report. [1] However, the test added to inet_bind2_bucket_match_addr_any() removed a necessary conflict check; the dual-stack wildcard address no longer conflicts with an IPv4 non-wildcard address. If tb->family is AF_INET6 and sk->sk_family is AF_INET in inet_bind2_bucket_match_addr_any(), we still need to check if tb has the dual-stack wildcard address. Note that the IPv4 wildcard address does not conflict with IPv6 non-wildcard addresses. [0]: https://lore.kernel.org/netdev/e21bf153-80b0-9ec0-15ba-e04a4ad42c34@redhat.com/ [1]: https://lore.kernel.org/netdev/CAG_fn=Ud3zSW7AZWXc+asfMhZVL5ETnvuY44Pmyv4NPv-ijN-A@mail.gmail.com/ Fixes: 5456262d2baa ("net: Fix incorrect address comparison when searching for a bind2 bucket") Signed-off-by: Kuniyuki Iwashima Reported-by: Paul Holzinger Link: https://lore.kernel.org/netdev/CAG_fn=Ud3zSW7AZWXc+asfMhZVL5ETnvuY44Pmyv4NPv-ijN-A@mail.gmail.com/ Reviewed-by: Eric Dumazet Tested-by: Paul Holzinger Reviewed-by: Martin KaFai Lau Signed-off-by: Jakub Kicinski --- net/ipv4/inet_hashtables.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index e41fdc38ce19..6edae3886885 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -828,8 +828,14 @@ bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const #if IS_ENABLED(CONFIG_IPV6) struct in6_addr addr_any = {}; - if (sk->sk_family != tb->family) + if (sk->sk_family != tb->family) { + if (sk->sk_family == AF_INET) + return net_eq(ib2_net(tb), net) && tb->port == port && + tb->l3mdev == l3mdev && + ipv6_addr_equal(&tb->v6_rcv_saddr, &addr_any); + return false; + } if (sk->sk_family == AF_INET6) return net_eq(ib2_net(tb), net) && tb->port == port && From 13715acf8ab5b32a6d7e42686fceeb66df114185 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 11 Mar 2023 19:19:04 -0800 Subject: [PATCH 047/108] selftest: Add test for bind() conflicts. The test checks if (IPv4, IPv6) address pair properly conflict or not. * IPv4 * 0.0.0.0 * 127.0.0.1 * IPv6 * :: * ::1 If the IPv6 address is [::], the second bind() always fails. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/.gitignore | 1 + tools/testing/selftests/net/Makefile | 1 + tools/testing/selftests/net/bind_wildcard.c | 114 ++++++++++++++++++++ 3 files changed, 116 insertions(+) create mode 100644 tools/testing/selftests/net/bind_wildcard.c diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index a6911cae368c..80f06aa62034 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only bind_bhash bind_timewait +bind_wildcard csum cmsg_sender diag_uid diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 6cd8993454d7..80fbfe0330f6 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -80,6 +80,7 @@ TEST_GEN_FILES += sctp_hello TEST_GEN_FILES += csum TEST_GEN_FILES += nat6to4.o TEST_GEN_FILES += ip_local_port_range +TEST_GEN_FILES += bind_wildcard TEST_FILES := settings diff --git a/tools/testing/selftests/net/bind_wildcard.c b/tools/testing/selftests/net/bind_wildcard.c new file mode 100644 index 000000000000..58edfc15d28b --- /dev/null +++ b/tools/testing/selftests/net/bind_wildcard.c @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright Amazon.com Inc. or its affiliates. */ + +#include +#include + +#include "../kselftest_harness.h" + +FIXTURE(bind_wildcard) +{ + struct sockaddr_in addr4; + struct sockaddr_in6 addr6; + int expected_errno; +}; + +FIXTURE_VARIANT(bind_wildcard) +{ + const __u32 addr4_const; + const struct in6_addr *addr6_const; +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_any) +{ + .addr4_const = INADDR_ANY, + .addr6_const = &in6addr_any, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_local) +{ + .addr4_const = INADDR_ANY, + .addr6_const = &in6addr_loopback, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_any) +{ + .addr4_const = INADDR_LOOPBACK, + .addr6_const = &in6addr_any, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_local) +{ + .addr4_const = INADDR_LOOPBACK, + .addr6_const = &in6addr_loopback, +}; + +FIXTURE_SETUP(bind_wildcard) +{ + self->addr4.sin_family = AF_INET; + self->addr4.sin_port = htons(0); + self->addr4.sin_addr.s_addr = htonl(variant->addr4_const); + + self->addr6.sin6_family = AF_INET6; + self->addr6.sin6_port = htons(0); + self->addr6.sin6_addr = *variant->addr6_const; + + if (variant->addr6_const == &in6addr_any) + self->expected_errno = EADDRINUSE; + else + self->expected_errno = 0; +} + +FIXTURE_TEARDOWN(bind_wildcard) +{ +} + +void bind_sockets(struct __test_metadata *_metadata, + FIXTURE_DATA(bind_wildcard) *self, + struct sockaddr *addr1, socklen_t addrlen1, + struct sockaddr *addr2, socklen_t addrlen2) +{ + int fd[2]; + int ret; + + fd[0] = socket(addr1->sa_family, SOCK_STREAM, 0); + ASSERT_GT(fd[0], 0); + + ret = bind(fd[0], addr1, addrlen1); + ASSERT_EQ(ret, 0); + + ret = getsockname(fd[0], addr1, &addrlen1); + ASSERT_EQ(ret, 0); + + ((struct sockaddr_in *)addr2)->sin_port = ((struct sockaddr_in *)addr1)->sin_port; + + fd[1] = socket(addr2->sa_family, SOCK_STREAM, 0); + ASSERT_GT(fd[1], 0); + + ret = bind(fd[1], addr2, addrlen2); + if (self->expected_errno) { + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, self->expected_errno); + } else { + ASSERT_EQ(ret, 0); + } + + close(fd[1]); + close(fd[0]); +} + +TEST_F(bind_wildcard, v4_v6) +{ + bind_sockets(_metadata, self, + (struct sockaddr *)&self->addr4, sizeof(self->addr6), + (struct sockaddr *)&self->addr6, sizeof(self->addr6)); +} + +TEST_F(bind_wildcard, v6_v4) +{ + bind_sockets(_metadata, self, + (struct sockaddr *)&self->addr6, sizeof(self->addr6), + (struct sockaddr *)&self->addr4, sizeof(self->addr4)); +} + +TEST_HARNESS_MAIN From 5000fe6c27827a61d8250a7e4a1d26c3298ef4f6 Mon Sep 17 00:00:00 2001 From: Zheng Wang Date: Mon, 13 Mar 2023 00:08:37 +0800 Subject: [PATCH 048/108] nfc: st-nci: Fix use after free bug in ndlc_remove due to race condition This bug influences both st_nci_i2c_remove and st_nci_spi_remove. Take st_nci_i2c_remove as an example. In st_nci_i2c_probe, it called ndlc_probe and bound &ndlc->sm_work with llt_ndlc_sm_work. When it calls ndlc_recv or timeout handler, it will finally call schedule_work to start the work. When we call st_nci_i2c_remove to remove the driver, there may be a sequence as follows: Fix it by finishing the work before cleanup in ndlc_remove CPU0 CPU1 |llt_ndlc_sm_work st_nci_i2c_remove | ndlc_remove | st_nci_remove | nci_free_device| kfree(ndev) | //free ndlc->ndev | |llt_ndlc_rcv_queue |nci_recv_frame |//use ndlc->ndev Fixes: 35630df68d60 ("NFC: st21nfcb: Add driver for STMicroelectronics ST21NFCB NFC chip") Signed-off-by: Zheng Wang Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20230312160837.2040857-1-zyytlz.wz@163.com Signed-off-by: Jakub Kicinski --- drivers/nfc/st-nci/ndlc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/nfc/st-nci/ndlc.c b/drivers/nfc/st-nci/ndlc.c index 755460a73c0d..d2aa9f766738 100644 --- a/drivers/nfc/st-nci/ndlc.c +++ b/drivers/nfc/st-nci/ndlc.c @@ -282,13 +282,15 @@ EXPORT_SYMBOL(ndlc_probe); void ndlc_remove(struct llt_ndlc *ndlc) { - st_nci_remove(ndlc->ndev); - /* cancel timers */ del_timer_sync(&ndlc->t1_timer); del_timer_sync(&ndlc->t2_timer); ndlc->t2_active = false; ndlc->t1_active = false; + /* cancel work */ + cancel_work_sync(&ndlc->sm_work); + + st_nci_remove(ndlc->ndev); skb_queue_purge(&ndlc->rcv_q); skb_queue_purge(&ndlc->send_q); From 5ce76fe1eead179c058d9151ee1f4088cfdc1c6b Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 14 Mar 2023 00:08:40 +0100 Subject: [PATCH 049/108] veth: rely on rtnl_dereference() instead of on rcu_dereference() in veth_set_xdp_features() Fix the following kernel warning in veth_set_xdp_features routine relying on rtnl_dereference() instead of on rcu_dereference(): ============================= WARNING: suspicious RCU usage 6.3.0-rc1-00144-g064d70527aaa #149 Not tainted ----------------------------- drivers/net/veth.c:1265 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 1 lock held by ip/135: (net/core/rtnetlink.c:6172) stack backtrace: CPU: 1 PID: 135 Comm: ip Not tainted 6.3.0-rc1-00144-g064d70527aaa #149 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 Call Trace: dump_stack_lvl (lib/dump_stack.c:107) lockdep_rcu_suspicious (include/linux/context_tracking.h:152) veth_set_xdp_features (drivers/net/veth.c:1265 (discriminator 9)) veth_newlink (drivers/net/veth.c:1892) ? veth_set_features (drivers/net/veth.c:1774) ? kasan_save_stack (mm/kasan/common.c:47) ? kasan_save_stack (mm/kasan/common.c:46) ? kasan_set_track (mm/kasan/common.c:52) ? alloc_netdev_mqs (include/linux/slab.h:737) ? rcu_read_lock_sched_held (kernel/rcu/update.c:125) ? trace_kmalloc (include/trace/events/kmem.h:54) ? __xdp_rxq_info_reg (net/core/xdp.c:188) ? alloc_netdev_mqs (net/core/dev.c:10657) ? rtnl_create_link (net/core/rtnetlink.c:3312) rtnl_newlink_create (net/core/rtnetlink.c:3440) ? rtnl_link_get_net_capable.constprop.0 (net/core/rtnetlink.c:3391) __rtnl_newlink (net/core/rtnetlink.c:3657) ? lock_downgrade (kernel/locking/lockdep.c:5321) ? rtnl_link_unregister (net/core/rtnetlink.c:3487) rtnl_newlink (net/core/rtnetlink.c:3671) rtnetlink_rcv_msg (net/core/rtnetlink.c:6174) ? rtnl_link_fill (net/core/rtnetlink.c:6070) ? mark_usage (kernel/locking/lockdep.c:4914) ? mark_usage (kernel/locking/lockdep.c:4914) netlink_rcv_skb (net/netlink/af_netlink.c:2574) ? rtnl_link_fill (net/core/rtnetlink.c:6070) ? netlink_ack (net/netlink/af_netlink.c:2551) ? lock_acquire (kernel/locking/lockdep.c:467) ? net_generic (include/linux/rcupdate.h:805) ? netlink_deliver_tap (include/linux/rcupdate.h:805) netlink_unicast (net/netlink/af_netlink.c:1340) ? netlink_attachskb (net/netlink/af_netlink.c:1350) netlink_sendmsg (net/netlink/af_netlink.c:1942) ? netlink_unicast (net/netlink/af_netlink.c:1861) ? netlink_unicast (net/netlink/af_netlink.c:1861) sock_sendmsg (net/socket.c:727) ____sys_sendmsg (net/socket.c:2501) ? kernel_sendmsg (net/socket.c:2448) ? __copy_msghdr (net/socket.c:2428) ___sys_sendmsg (net/socket.c:2557) ? mark_usage (kernel/locking/lockdep.c:4914) ? do_recvmmsg (net/socket.c:2544) ? lock_acquire (kernel/locking/lockdep.c:467) ? find_held_lock (kernel/locking/lockdep.c:5159) ? __lock_release (kernel/locking/lockdep.c:5345) ? __might_fault (mm/memory.c:5625) ? lock_downgrade (kernel/locking/lockdep.c:5321) ? __fget_light (include/linux/atomic/atomic-arch-fallback.h:227) __sys_sendmsg (include/linux/file.h:31) ? __sys_sendmsg_sock (net/socket.c:2572) ? rseq_get_rseq_cs (kernel/rseq.c:275) ? lockdep_hardirqs_on_prepare.part.0 (kernel/locking/lockdep.c:4263) do_syscall_64 (arch/x86/entry/common.c:50) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120) RIP: 0033:0x7f0d1aadeb17 Code: 0f 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b9 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 89 54 24 1c 48 89 74 24 10 Fixes: fccca038f300 ("veth: take into account device reconfiguration for xdp_features flag") Suggested-by: Eric Dumazet Reported-by: Matthieu Baerts Link: https://lore.kernel.org/netdev/cover.1678364612.git.lorenzo@kernel.org/T/#me4c9d8e985ec7ebee981cfdb5bc5ec651ef4035d Signed-off-by: Lorenzo Bianconi Reported-by: syzbot+c3d0d9c42d59ff644ea6@syzkaller.appspotmail.com Reviewed-by: Eric Dumazet Tested-by: Matthieu Baerts Link: https://lore.kernel.org/r/dfd6a9a7d85e9113063165e1f47b466b90ad7b8a.1678748579.git.lorenzo@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/veth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 293dc3b2c84a..4da74ac27f9a 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -1262,7 +1262,7 @@ static void veth_set_xdp_features(struct net_device *dev) struct veth_priv *priv = netdev_priv(dev); struct net_device *peer; - peer = rcu_dereference(priv->peer); + peer = rtnl_dereference(priv->peer); if (peer && peer->real_num_tx_queues <= dev->real_num_rx_queues) { xdp_features_t val = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | From 35c356924fe3669dfbb1185607ce3b37f70bfa80 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 13 Mar 2023 18:21:24 +0100 Subject: [PATCH 050/108] mlxsw: spectrum: Fix incorrect parsing depth after reload Spectrum ASICs have a configurable limit on how deep into the packet they parse. By default, the limit is 96 bytes. There are several cases where this parsing depth is not enough and there is a need to increase it. For example, timestamping of PTP packets and a FIB multipath hash policy that requires hashing on inner fields. The driver therefore maintains a reference count that reflects the number of consumers that require an increased parsing depth. During reload_down() the parsing depth reference count does not necessarily drop to zero, but the parsing depth itself is restored to the default during reload_up() when the firmware is reset. It is therefore possible to end up in situations where the driver thinks that the parsing depth was increased (reference count is non-zero), when it is not. Fix by making sure that all the consumers that increase the parsing depth reference count also decrease it during reload_down(). Specifically, make sure that when the routing code is de-initialized it drops the reference count if it was increased because of a FIB multipath hash policy that requires hashing on inner fields. Add a warning if the reference count is not zero after the driver was de-initialized and explicitly reset it to zero during initialization for good measures. Fixes: 2d91f0803b84 ("mlxsw: spectrum: Add infrastructure for parsing configuration") Reported-by: Maksym Yaremchuk Signed-off-by: Ido Schimmel Signed-off-by: Petr Machata Link: https://lore.kernel.org/r/9c35e1b3e6c1d8f319a2449d14e2b86373f3b3ba.1678727526.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 2 ++ .../net/ethernet/mellanox/mlxsw/spectrum_router.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index a8f94b7544ee..02a327744a61 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -2937,6 +2937,7 @@ static int mlxsw_sp_netdevice_event(struct notifier_block *unused, static void mlxsw_sp_parsing_init(struct mlxsw_sp *mlxsw_sp) { + refcount_set(&mlxsw_sp->parsing.parsing_depth_ref, 0); mlxsw_sp->parsing.parsing_depth = MLXSW_SP_DEFAULT_PARSING_DEPTH; mlxsw_sp->parsing.vxlan_udp_dport = MLXSW_SP_DEFAULT_VXLAN_UDP_DPORT; mutex_init(&mlxsw_sp->parsing.lock); @@ -2945,6 +2946,7 @@ static void mlxsw_sp_parsing_init(struct mlxsw_sp *mlxsw_sp) static void mlxsw_sp_parsing_fini(struct mlxsw_sp *mlxsw_sp) { mutex_destroy(&mlxsw_sp->parsing.lock); + WARN_ON_ONCE(refcount_read(&mlxsw_sp->parsing.parsing_depth_ref)); } struct mlxsw_sp_ipv6_addr_node { diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 09e32778b012..4a73e2fe95ef 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -10381,11 +10381,23 @@ err_reg_write: old_inc_parsing_depth); return err; } + +static void mlxsw_sp_mp_hash_fini(struct mlxsw_sp *mlxsw_sp) +{ + bool old_inc_parsing_depth = mlxsw_sp->router->inc_parsing_depth; + + mlxsw_sp_mp_hash_parsing_depth_adjust(mlxsw_sp, old_inc_parsing_depth, + false); +} #else static int mlxsw_sp_mp_hash_init(struct mlxsw_sp *mlxsw_sp) { return 0; } + +static void mlxsw_sp_mp_hash_fini(struct mlxsw_sp *mlxsw_sp) +{ +} #endif static int mlxsw_sp_dscp_init(struct mlxsw_sp *mlxsw_sp) @@ -10615,6 +10627,7 @@ err_register_inet6addr_notifier: err_register_inetaddr_notifier: mlxsw_core_flush_owq(); err_dscp_init: + mlxsw_sp_mp_hash_fini(mlxsw_sp); err_mp_hash_init: mlxsw_sp_neigh_fini(mlxsw_sp); err_neigh_init: @@ -10655,6 +10668,7 @@ void mlxsw_sp_router_fini(struct mlxsw_sp *mlxsw_sp) unregister_inet6addr_notifier(&mlxsw_sp->router->inet6addr_nb); unregister_inetaddr_notifier(&mlxsw_sp->router->inetaddr_nb); mlxsw_core_flush_owq(); + mlxsw_sp_mp_hash_fini(mlxsw_sp); mlxsw_sp_neigh_fini(mlxsw_sp); mlxsw_sp_lb_rif_fini(mlxsw_sp); mlxsw_sp_vrs_fini(mlxsw_sp); From 13085e1b5cab8ad802904d72e6a6dae85ae0cd20 Mon Sep 17 00:00:00 2001 From: Wenjia Zhang Date: Mon, 13 Mar 2023 11:08:28 +0100 Subject: [PATCH 051/108] net/smc: fix deadlock triggered by cancel_delayed_work_syn() The following LOCKDEP was detected: Workqueue: events smc_lgr_free_work [smc] WARNING: possible circular locking dependency detected 6.1.0-20221027.rc2.git8.56bc5b569087.300.fc36.s390x+debug #1 Not tainted ------------------------------------------------------ kworker/3:0/176251 is trying to acquire lock: 00000000f1467148 ((wq_completion)smc_tx_wq-00000000#2){+.+.}-{0:0}, at: __flush_workqueue+0x7a/0x4f0 but task is already holding lock: 0000037fffe97dc8 ((work_completion)(&(&lgr->free_work)->work)){+.+.}-{0:0}, at: process_one_work+0x232/0x730 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #4 ((work_completion)(&(&lgr->free_work)->work)){+.+.}-{0:0}: __lock_acquire+0x58e/0xbd8 lock_acquire.part.0+0xe2/0x248 lock_acquire+0xac/0x1c8 __flush_work+0x76/0xf0 __cancel_work_timer+0x170/0x220 __smc_lgr_terminate.part.0+0x34/0x1c0 [smc] smc_connect_rdma+0x15e/0x418 [smc] __smc_connect+0x234/0x480 [smc] smc_connect+0x1d6/0x230 [smc] __sys_connect+0x90/0xc0 __do_sys_socketcall+0x186/0x370 __do_syscall+0x1da/0x208 system_call+0x82/0xb0 -> #3 (smc_client_lgr_pending){+.+.}-{3:3}: __lock_acquire+0x58e/0xbd8 lock_acquire.part.0+0xe2/0x248 lock_acquire+0xac/0x1c8 __mutex_lock+0x96/0x8e8 mutex_lock_nested+0x32/0x40 smc_connect_rdma+0xa4/0x418 [smc] __smc_connect+0x234/0x480 [smc] smc_connect+0x1d6/0x230 [smc] __sys_connect+0x90/0xc0 __do_sys_socketcall+0x186/0x370 __do_syscall+0x1da/0x208 system_call+0x82/0xb0 -> #2 (sk_lock-AF_SMC){+.+.}-{0:0}: __lock_acquire+0x58e/0xbd8 lock_acquire.part.0+0xe2/0x248 lock_acquire+0xac/0x1c8 lock_sock_nested+0x46/0xa8 smc_tx_work+0x34/0x50 [smc] process_one_work+0x30c/0x730 worker_thread+0x62/0x420 kthread+0x138/0x150 __ret_from_fork+0x3c/0x58 ret_from_fork+0xa/0x40 -> #1 ((work_completion)(&(&smc->conn.tx_work)->work)){+.+.}-{0:0}: __lock_acquire+0x58e/0xbd8 lock_acquire.part.0+0xe2/0x248 lock_acquire+0xac/0x1c8 process_one_work+0x2bc/0x730 worker_thread+0x62/0x420 kthread+0x138/0x150 __ret_from_fork+0x3c/0x58 ret_from_fork+0xa/0x40 -> #0 ((wq_completion)smc_tx_wq-00000000#2){+.+.}-{0:0}: check_prev_add+0xd8/0xe88 validate_chain+0x70c/0xb20 __lock_acquire+0x58e/0xbd8 lock_acquire.part.0+0xe2/0x248 lock_acquire+0xac/0x1c8 __flush_workqueue+0xaa/0x4f0 drain_workqueue+0xaa/0x158 destroy_workqueue+0x44/0x2d8 smc_lgr_free+0x9e/0xf8 [smc] process_one_work+0x30c/0x730 worker_thread+0x62/0x420 kthread+0x138/0x150 __ret_from_fork+0x3c/0x58 ret_from_fork+0xa/0x40 other info that might help us debug this: Chain exists of: (wq_completion)smc_tx_wq-00000000#2 --> smc_client_lgr_pending --> (work_completion)(&(&lgr->free_work)->work) Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock((work_completion)(&(&lgr->free_work)->work)); lock(smc_client_lgr_pending); lock((work_completion) (&(&lgr->free_work)->work)); lock((wq_completion)smc_tx_wq-00000000#2); *** DEADLOCK *** 2 locks held by kworker/3:0/176251: #0: 0000000080183548 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0x232/0x730 #1: 0000037fffe97dc8 ((work_completion) (&(&lgr->free_work)->work)){+.+.}-{0:0}, at: process_one_work+0x232/0x730 stack backtrace: CPU: 3 PID: 176251 Comm: kworker/3:0 Not tainted Hardware name: IBM 8561 T01 701 (z/VM 7.2.0) Call Trace: [<000000002983c3e4>] dump_stack_lvl+0xac/0x100 [<0000000028b477ae>] check_noncircular+0x13e/0x160 [<0000000028b48808>] check_prev_add+0xd8/0xe88 [<0000000028b49cc4>] validate_chain+0x70c/0xb20 [<0000000028b4bd26>] __lock_acquire+0x58e/0xbd8 [<0000000028b4cf6a>] lock_acquire.part.0+0xe2/0x248 [<0000000028b4d17c>] lock_acquire+0xac/0x1c8 [<0000000028addaaa>] __flush_workqueue+0xaa/0x4f0 [<0000000028addf9a>] drain_workqueue+0xaa/0x158 [<0000000028ae303c>] destroy_workqueue+0x44/0x2d8 [<000003ff8029af26>] smc_lgr_free+0x9e/0xf8 [smc] [<0000000028adf3d4>] process_one_work+0x30c/0x730 [<0000000028adf85a>] worker_thread+0x62/0x420 [<0000000028aeac50>] kthread+0x138/0x150 [<0000000028a63914>] __ret_from_fork+0x3c/0x58 [<00000000298503da>] ret_from_fork+0xa/0x40 INFO: lockdep is turned off. =================================================================== This deadlock occurs because cancel_delayed_work_sync() waits for the work(&lgr->free_work) to finish, while the &lgr->free_work waits for the work(lgr->tx_wq), which needs the sk_lock-AF_SMC, that is already used under the mutex_lock. The solution is to use cancel_delayed_work() instead, which kills off a pending work. Fixes: a52bcc919b14 ("net/smc: improve termination processing") Signed-off-by: Wenjia Zhang Reviewed-by: Jan Karcher Reviewed-by: Karsten Graul Reviewed-by: Tony Lu Signed-off-by: David S. Miller --- net/smc/smc_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index d52060b2680c..454356771cda 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1464,7 +1464,7 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) if (lgr->terminating) return; /* lgr already terminating */ /* cancel free_work sync, will terminate when lgr->freeing is set */ - cancel_delayed_work_sync(&lgr->free_work); + cancel_delayed_work(&lgr->free_work); lgr->terminating = 1; /* kill remaining link group connections */ From 9d876d3ef27fa84355597ad269939772192356d8 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Mon, 13 Mar 2023 11:08:29 +0100 Subject: [PATCH 052/108] net/smc: Fix device de-init sequence CLC message initialization was not properly reversed in error handling path. Reported-and-suggested-by: Alexander Gordeev Signed-off-by: Stefan Raspl Signed-off-by: Wenjia Zhang Reviewed-by: Tony Lu Signed-off-by: David S. Miller --- net/smc/af_smc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index ff6dd86bdc9f..c6b4a62276f6 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -3501,6 +3501,7 @@ out_pnet: out_nl: smc_nl_exit(); out_ism: + smc_clc_exit(); smc_ism_exit(); out_pernet_subsys_stat: unregister_pernet_subsys(&smc_net_stat_ops); From d8b228318935044dafe3a5bc07ee71a1f1424b8d Mon Sep 17 00:00:00 2001 From: Szymon Heidrich Date: Mon, 13 Mar 2023 23:00:45 +0100 Subject: [PATCH 053/108] net: usb: smsc75xx: Limit packet length to skb->len Packet length retrieved from skb data may be larger than the actual socket buffer length (up to 9026 bytes). In such case the cloned skb passed up the network stack will leak kernel memory contents. Fixes: d0cad871703b ("smsc75xx: SMSC LAN75xx USB gigabit ethernet adapter driver") Signed-off-by: Szymon Heidrich Signed-off-by: David S. Miller --- drivers/net/usb/smsc75xx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/usb/smsc75xx.c b/drivers/net/usb/smsc75xx.c index 95de452ff4da..db34f8d1d605 100644 --- a/drivers/net/usb/smsc75xx.c +++ b/drivers/net/usb/smsc75xx.c @@ -2212,7 +2212,8 @@ static int smsc75xx_rx_fixup(struct usbnet *dev, struct sk_buff *skb) dev->net->stats.rx_frame_errors++; } else { /* MAX_SINGLE_PACKET_SIZE + 4(CRC) + 2(COE) + 4(Vlan) */ - if (unlikely(size > (MAX_SINGLE_PACKET_SIZE + ETH_HLEN + 12))) { + if (unlikely(size > (MAX_SINGLE_PACKET_SIZE + ETH_HLEN + 12) || + size > skb->len)) { netif_dbg(dev, rx_err, dev->net, "size err rx_cmd_a=0x%08x\n", rx_cmd_a); From 611e2dabb4b3243d176739fd6a5a34d007fa3f86 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Tue, 14 Mar 2023 00:34:26 +0000 Subject: [PATCH 054/108] net: ethernet: mtk_eth_soc: reset PCS state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reset the internal PCS state machine when changing interface mode. This prevents confusing the state machine when changing interface modes, e.g. from SGMII to 2500Base-X or vice-versa. Fixes: 7e538372694b ("net: ethernet: mediatek: Re-add support SGMII") Reviewed-by: Russell King (Oracle) Tested-by: Bjørn Mork Signed-off-by: Daniel Golle Signed-off-by: David S. Miller --- drivers/net/ethernet/mediatek/mtk_eth_soc.h | 4 ++++ drivers/net/ethernet/mediatek/mtk_sgmii.c | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.h b/drivers/net/ethernet/mediatek/mtk_eth_soc.h index b65de174c3d9..084a6badef6d 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.h +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h @@ -542,6 +542,10 @@ #define SGMII_SEND_AN_ERROR_EN BIT(11) #define SGMII_IF_MODE_MASK GENMASK(5, 1) +/* Register to reset SGMII design */ +#define SGMII_RESERVED_0 0x34 +#define SGMII_SW_RESET BIT(0) + /* Register to set SGMII speed, ANA RG_ Control Signals III*/ #define SGMSYS_ANA_RG_CS3 0x2028 #define RG_PHY_SPEED_MASK (BIT(2) | BIT(3)) diff --git a/drivers/net/ethernet/mediatek/mtk_sgmii.c b/drivers/net/ethernet/mediatek/mtk_sgmii.c index bb00de1003ac..612f65bb0345 100644 --- a/drivers/net/ethernet/mediatek/mtk_sgmii.c +++ b/drivers/net/ethernet/mediatek/mtk_sgmii.c @@ -88,6 +88,10 @@ static int mtk_pcs_config(struct phylink_pcs *pcs, unsigned int mode, regmap_update_bits(mpcs->regmap, SGMSYS_QPHY_PWR_STATE_CTRL, SGMII_PHYA_PWD, SGMII_PHYA_PWD); + /* Reset SGMII PCS state */ + regmap_update_bits(mpcs->regmap, SGMII_RESERVED_0, + SGMII_SW_RESET, SGMII_SW_RESET); + if (interface == PHY_INTERFACE_MODE_2500BASEX) rgc3 = RG_PHY_SPEED_3_125G; else From 6e933a804c7db8be64f367f33e63cd7dcc302ebb Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Tue, 14 Mar 2023 00:34:45 +0000 Subject: [PATCH 055/108] net: ethernet: mtk_eth_soc: only write values if needed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only restart auto-negotiation and write link timer if actually necessary. This prevents losing the link in case of minor changes. Fixes: 7e538372694b ("net: ethernet: mediatek: Re-add support SGMII") Reviewed-by: Russell King (Oracle) Tested-by: Bjørn Mork Signed-off-by: Daniel Golle Signed-off-by: David S. Miller --- drivers/net/ethernet/mediatek/mtk_sgmii.c | 24 +++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_sgmii.c b/drivers/net/ethernet/mediatek/mtk_sgmii.c index 612f65bb0345..83976dc86887 100644 --- a/drivers/net/ethernet/mediatek/mtk_sgmii.c +++ b/drivers/net/ethernet/mediatek/mtk_sgmii.c @@ -38,20 +38,16 @@ static int mtk_pcs_config(struct phylink_pcs *pcs, unsigned int mode, const unsigned long *advertising, bool permit_pause_to_mac) { + bool mode_changed = false, changed, use_an; struct mtk_pcs *mpcs = pcs_to_mtk_pcs(pcs); unsigned int rgc3, sgm_mode, bmcr; int advertise, link_timer; - bool changed, use_an; advertise = phylink_mii_c22_pcs_encode_advertisement(interface, advertising); if (advertise < 0) return advertise; - link_timer = phylink_get_link_timer_ns(interface); - if (link_timer < 0) - return link_timer; - /* Clearing IF_MODE_BIT0 switches the PCS to BASE-X mode, and * we assume that fixes it's speed at bitrate = line rate (in * other words, 1000Mbps or 2500Mbps). @@ -77,13 +73,16 @@ static int mtk_pcs_config(struct phylink_pcs *pcs, unsigned int mode, } if (use_an) { - /* FIXME: Do we need to set AN_RESTART here? */ - bmcr = SGMII_AN_RESTART | SGMII_AN_ENABLE; + bmcr = SGMII_AN_ENABLE; } else { bmcr = 0; } if (mpcs->interface != interface) { + link_timer = phylink_get_link_timer_ns(interface); + if (link_timer < 0) + return link_timer; + /* PHYA power down */ regmap_update_bits(mpcs->regmap, SGMSYS_QPHY_PWR_STATE_CTRL, SGMII_PHYA_PWD, SGMII_PHYA_PWD); @@ -101,16 +100,17 @@ static int mtk_pcs_config(struct phylink_pcs *pcs, unsigned int mode, regmap_update_bits(mpcs->regmap, mpcs->ana_rgc3, RG_PHY_SPEED_3_125G, rgc3); + /* Setup the link timer */ + regmap_write(mpcs->regmap, SGMSYS_PCS_LINK_TIMER, link_timer / 2 / 8); + mpcs->interface = interface; + mode_changed = true; } /* Update the advertisement, noting whether it has changed */ regmap_update_bits_check(mpcs->regmap, SGMSYS_PCS_ADVERTISE, SGMII_ADVERTISE, advertise, &changed); - /* Setup the link timer and QPHY power up inside SGMIISYS */ - regmap_write(mpcs->regmap, SGMSYS_PCS_LINK_TIMER, link_timer / 2 / 8); - /* Update the sgmsys mode register */ regmap_update_bits(mpcs->regmap, SGMSYS_SGMII_MODE, SGMII_REMOTE_FAULT_DIS | SGMII_SPEED_DUPLEX_AN | @@ -118,7 +118,7 @@ static int mtk_pcs_config(struct phylink_pcs *pcs, unsigned int mode, /* Update the BMCR */ regmap_update_bits(mpcs->regmap, SGMSYS_PCS_CONTROL_1, - SGMII_AN_RESTART | SGMII_AN_ENABLE, bmcr); + SGMII_AN_ENABLE, bmcr); /* Release PHYA power down state * Only removing bit SGMII_PHYA_PWD isn't enough. @@ -132,7 +132,7 @@ static int mtk_pcs_config(struct phylink_pcs *pcs, unsigned int mode, usleep_range(50, 100); regmap_write(mpcs->regmap, SGMSYS_QPHY_PWR_STATE_CTRL, 0); - return changed; + return changed || mode_changed; } static void mtk_pcs_restart_an(struct phylink_pcs *pcs) From 37beabe9a891b92174cd1aafbfa881fe9e05aa87 Mon Sep 17 00:00:00 2001 From: Emeel Hakim Date: Wed, 8 Feb 2023 14:25:54 +0200 Subject: [PATCH 056/108] net/mlx5e: Fix macsec ASO context alignment Currently mlx5e_macsec_umr struct does not satisfy hardware memory alignment requirement. Hence the result of querying advanced steering operation (ASO) is not copied to the memory region as expected. Fix by satisfying hardware memory alignment requirement and move context to be first field in struct for better readability. Fixes: 1f53da676439 ("net/mlx5e: Create advanced steering operation (ASO) object for MACsec") Signed-off-by: Emeel Hakim Reviewed-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c index 08d0929e8260..8af53178e40d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c @@ -89,8 +89,8 @@ struct mlx5e_macsec_rx_sc { }; struct mlx5e_macsec_umr { + u8 __aligned(64) ctx[MLX5_ST_SZ_BYTES(macsec_aso)]; dma_addr_t dma_addr; - u8 ctx[MLX5_ST_SZ_BYTES(macsec_aso)]; u32 mkey; }; From 9a92fe1db9e57ea94388a1d768e8ee42af858377 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Fri, 12 Mar 2021 07:21:29 -0600 Subject: [PATCH 057/108] net/mlx5e: Don't cache tunnel offloads capability When mlx5e attaches again after device health recovery, the device capabilities might have changed by the eswitch manager. For example in one flow when ECPF changes the eswitch mode between legacy and switchdev, it updates the flow table tunnel capability. The cached value is only used in one place, so just check the capability there instead. Fixes: 5bef709d76a2 ("net/mlx5: Enable host PF HCA after eswitch is initialized") Signed-off-by: Parav Pandit Signed-off-by: Daniel Jurgens Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 1 - drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 4 +--- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c | 1 - 4 files changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 4276c6eb6820..4a19ef4a9811 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -313,7 +313,6 @@ struct mlx5e_params { } channel; } mqprio; bool rx_cqe_compress_def; - bool tunneled_offload_en; struct dim_cq_moder rx_cq_moderation; struct dim_cq_moder tx_cq_moderation; struct mlx5e_packet_merge_param packet_merge; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 51b5f3cca504..56fc2aebb9ee 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4979,8 +4979,6 @@ void mlx5e_build_nic_params(struct mlx5e_priv *priv, struct mlx5e_xsk *xsk, u16 /* TX inline */ mlx5_query_min_inline(mdev, ¶ms->tx_min_inline_mode); - params->tunneled_offload_en = mlx5_tunnel_inner_ft_supported(mdev); - /* AF_XDP */ params->xsk = xsk; @@ -5285,7 +5283,7 @@ static int mlx5e_init_nic_rx(struct mlx5e_priv *priv) } features = MLX5E_RX_RES_FEATURE_PTP; - if (priv->channels.params.tunneled_offload_en) + if (mlx5_tunnel_inner_ft_supported(mdev)) features |= MLX5E_RX_RES_FEATURE_INNER_FT; err = mlx5e_rx_res_init(priv->rx_res, priv->mdev, features, priv->max_nch, priv->drop_rq.rqn, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 43fd12fb87b8..8ff654b4e9e1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -755,7 +755,6 @@ static void mlx5e_build_rep_params(struct net_device *netdev) mlx5e_set_rx_cq_mode_params(params, cq_period_mode); params->mqprio.num_tc = 1; - params->tunneled_offload_en = false; if (rep->vport != MLX5_VPORT_UPLINK) params->vlan_strip_disable = true; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c index c2a4f86bc890..baa7ef812313 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c @@ -70,7 +70,6 @@ static void mlx5i_build_nic_params(struct mlx5_core_dev *mdev, params->packet_merge.type = MLX5E_PACKET_MERGE_NONE; params->hard_mtu = MLX5_IB_GRH_BYTES + MLX5_IPOIB_HARD_LEN; - params->tunneled_offload_en = false; /* CQE compression is not supported for IPoIB */ params->rx_cqe_compress_def = false; From ba5d8f72b82cc197355c9340ef89dab813815865 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 24 Jun 2021 18:22:57 +0300 Subject: [PATCH 058/108] net/mlx5: Fix setting ec_function bit in MANAGE_PAGES When ECPF is a page supplier, reclaim pages missed to honor the ec_function bit provided by the firmware. It always used the ec_function to true during driver unload flow for ECPF. This is incorrect. Honor the ec_function bit provided by device during page allocation request event. Fixes: d6945242f45d ("net/mlx5: Hold pages RB tree per VF") Signed-off-by: Parav Pandit Signed-off-by: Daniel Jurgens Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/pagealloc.c | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c index 64d4e7125e9b..95dc67fb3001 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c @@ -82,6 +82,16 @@ static u16 func_id_to_type(struct mlx5_core_dev *dev, u16 func_id, bool ec_funct return func_id <= mlx5_core_max_vfs(dev) ? MLX5_VF : MLX5_SF; } +static u32 mlx5_get_ec_function(u32 function) +{ + return function >> 16; +} + +static u32 mlx5_get_func_id(u32 function) +{ + return function & 0xffff; +} + static struct rb_root *page_root_per_function(struct mlx5_core_dev *dev, u32 function) { struct rb_root *root; @@ -665,20 +675,22 @@ static int optimal_reclaimed_pages(void) } static int mlx5_reclaim_root_pages(struct mlx5_core_dev *dev, - struct rb_root *root, u16 func_id) + struct rb_root *root, u32 function) { u64 recl_pages_to_jiffies = msecs_to_jiffies(mlx5_tout_ms(dev, RECLAIM_PAGES)); unsigned long end = jiffies + recl_pages_to_jiffies; while (!RB_EMPTY_ROOT(root)) { + u32 ec_function = mlx5_get_ec_function(function); + u32 function_id = mlx5_get_func_id(function); int nclaimed; int err; - err = reclaim_pages(dev, func_id, optimal_reclaimed_pages(), - &nclaimed, false, mlx5_core_is_ecpf(dev)); + err = reclaim_pages(dev, function_id, optimal_reclaimed_pages(), + &nclaimed, false, ec_function); if (err) { - mlx5_core_warn(dev, "failed reclaiming pages (%d) for func id 0x%x\n", - err, func_id); + mlx5_core_warn(dev, "reclaim_pages err (%d) func_id=0x%x ec_func=0x%x\n", + err, function_id, ec_function); return err; } From 7ba930fc25def6fd736abcdfa224272948a65cf7 Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Thu, 20 Oct 2022 00:13:50 +0300 Subject: [PATCH 059/108] net/mlx5: Disable eswitch before waiting for VF pages The offending commit changed the ordering of moving to legacy mode and waiting for the VF pages. Moving to legacy mode is important in bluefield, because it sends the host driver into error state, and frees its pages. Without this transition we end up waiting 2 minutes for pages that aren't coming before carrying on with the unload process. Fixes: f019679ea5f2 ("net/mlx5: E-switch, Remove dependency between sriov and eswitch mode") Signed-off-by: Daniel Jurgens Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 540840e80493..f36a3aa4b5c8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1364,8 +1364,8 @@ static void mlx5_unload(struct mlx5_core_dev *dev) { mlx5_devlink_traps_unregister(priv_to_devlink(dev)); mlx5_sf_dev_table_destroy(dev); - mlx5_sriov_detach(dev); mlx5_eswitch_disable(dev->priv.eswitch); + mlx5_sriov_detach(dev); mlx5_lag_remove_mdev(dev); mlx5_ec_cleanup(dev); mlx5_sf_hw_table_destroy(dev); From 1313d78ac0c1cfcff7bdece8da54b080e71487c4 Mon Sep 17 00:00:00 2001 From: Maor Dickman Date: Tue, 7 Feb 2023 15:07:00 +0200 Subject: [PATCH 060/108] net/mlx5: E-switch, Fix wrong usage of source port rewrite in split rules In few cases, rules with mirror use case are split to two FTEs, one which do the mirror action and forward to second FTE which do the rest of the rule actions and the second redirect action. In case of mirror rules which do split and forward to ovs internal port or VF stack devices, source port rewrite should be used in the second FTE but it is wrongly also set in the first FTE which break the offload. Fix this issue by removing the wrong check if source port rewrite is needed to be used on the first FTE of the split and instead return EOPNOTSUPP which will block offload of rules which mirror to ovs internal port or VF stack devices which isn't supported. Fixes: 10742efc20a4 ("net/mlx5e: VF tunnel TX traffic offloading") Fixes: a508728a4c8b ("net/mlx5e: VF tunnel RX traffic offloading") Signed-off-by: Maor Dickman Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index d766a64b1823..22075943bb58 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -723,11 +723,11 @@ mlx5_eswitch_add_fwd_rule(struct mlx5_eswitch *esw, flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; for (i = 0; i < esw_attr->split_count; i++) { - if (esw_is_indir_table(esw, attr)) - err = esw_setup_indir_table(dest, &flow_act, esw, attr, false, &i); - else if (esw_is_chain_src_port_rewrite(esw, esw_attr)) - err = esw_setup_chain_src_port_rewrite(dest, &flow_act, esw, chains, attr, - &i); + if (esw_attr->dests[i].flags & MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE) + /* Source port rewrite (forward to ovs internal port or statck device) isn't + * supported in the rule of split action. + */ + err = -EOPNOTSUPP; else esw_setup_vport_dest(dest, &flow_act, esw, esw_attr, i, i, false); From 28d3815a629cbdee660dd1c9de28d77cb3d77917 Mon Sep 17 00:00:00 2001 From: Maor Dickman Date: Wed, 8 Feb 2023 11:37:41 +0200 Subject: [PATCH 061/108] net/mlx5: E-switch, Fix missing set of split_count when forward to ovs internal port Rules with mirror actions are split to two FTEs when the actions after the mirror action contains pedit, vlan push/pop or ct. Forward to ovs internal port adds implicit header rewrite (pedit) but missing trigger to do split. Fix by setting split_count when forwarding to ovs internal port which will trigger split in mirror rules. Fixes: 27484f7170ed ("net/mlx5e: Offload tc rules that redirect to ovs internal port") Signed-off-by: Maor Dickman Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 70b8d2dfa751..90944bf271ce 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -4304,6 +4304,7 @@ int mlx5e_set_fwd_to_int_port_actions(struct mlx5e_priv *priv, esw_attr->dest_int_port = dest_int_port; esw_attr->dests[out_index].flags |= MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE; + esw_attr->split_count = out_index; /* Forward to root fdb for matching against the new source vport */ attr->dest_chain = 0; From c9668f0b1d28570327dbba189f2c61f6f9e43ae7 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Sun, 12 Feb 2023 11:01:43 +0200 Subject: [PATCH 062/108] net/mlx5e: Fix cleanup null-ptr deref on encap lock During module is unloaded while a peer tc flow is still offloaded, first the peer uplink rep profile is changed to a nic profile, and so neigh encap lock is destroyed. Next during unload, the VF reps netdevs are unregistered which causes the original non-peer tc flow to be deleted, which deletes the peer flow. The peer flow deletion detaches the encap entry and try to take the already destroyed encap lock, causing the below trace. Fix this by clearing peer flows during tc eswitch cleanup (mlx5e_tc_esw_cleanup()). Relevant trace: [ 4316.837128] BUG: kernel NULL pointer dereference, address: 00000000000001d8 [ 4316.842239] RIP: 0010:__mutex_lock+0xb5/0xc40 [ 4316.851897] Call Trace: [ 4316.852481] [ 4316.857214] mlx5e_rep_neigh_entry_release+0x93/0x790 [mlx5_core] [ 4316.858258] mlx5e_rep_encap_entry_detach+0xa7/0xf0 [mlx5_core] [ 4316.859134] mlx5e_encap_dealloc+0xa3/0xf0 [mlx5_core] [ 4316.859867] clean_encap_dests.part.0+0x5c/0xe0 [mlx5_core] [ 4316.860605] mlx5e_tc_del_fdb_flow+0x32a/0x810 [mlx5_core] [ 4316.862609] __mlx5e_tc_del_fdb_peer_flow+0x1a2/0x250 [mlx5_core] [ 4316.863394] mlx5e_tc_del_flow+0x(/0x630 [mlx5_core] [ 4316.864090] mlx5e_flow_put+0x5f/0x100 [mlx5_core] [ 4316.864771] mlx5e_delete_flower+0x4de/0xa40 [mlx5_core] [ 4316.865486] tc_setup_cb_reoffload+0x20/0x80 [ 4316.865905] fl_reoffload+0x47c/0x510 [cls_flower] [ 4316.869181] tcf_block_playback_offloads+0x91/0x1d0 [ 4316.869649] tcf_block_unbind+0xe7/0x1b0 [ 4316.870049] tcf_block_offload_cmd.isra.0+0x1ee/0x270 [ 4316.879266] tcf_block_offload_unbind+0x61/0xa0 [ 4316.879711] __tcf_block_put+0xa4/0x310 Fixes: 04de7dda7394 ("net/mlx5e: Infrastructure for duplicated offloading of TC flows") Fixes: 1418ddd96afd ("net/mlx5e: Duplicate offloaded TC eswitch rules under uplink LAG") Signed-off-by: Paul Blakey Reviewed-by: Chris Mi Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 90944bf271ce..cc35cbc9934d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -5464,6 +5464,16 @@ err_tun_mapping: void mlx5e_tc_esw_cleanup(struct mlx5_rep_uplink_priv *uplink_priv) { + struct mlx5e_rep_priv *rpriv; + struct mlx5_eswitch *esw; + struct mlx5e_priv *priv; + + rpriv = container_of(uplink_priv, struct mlx5e_rep_priv, uplink_priv); + priv = netdev_priv(rpriv->netdev); + esw = priv->mdev->priv.eswitch; + + mlx5e_tc_clean_fdb_peer_flows(esw); + mlx5e_tc_tun_cleanup(uplink_priv->encap); mapping_destroy(uplink_priv->tunnel_enc_opts_mapping); From dd64572490c3d7aab04083db8791fab157a941ed Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Tue, 24 Jan 2023 17:34:32 +0200 Subject: [PATCH 063/108] net/mlx5e: kTLS, Fix missing error unwind on unsupported cipher type Do proper error unwinding when adding an unsupported TX/RX cipher type. Move the switch case prior to key creation so there's less to unwind, and change the goto label name to describe the action performed instead of what failed. Fixes: 4960c414db35 ("net/mlx5e: Support 256 bit keys with kTLS device offload") Signed-off-by: Gal Pressman Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- .../mellanox/mlx5/core/en_accel/ktls_rx.c | 24 ++++++++++--------- .../mellanox/mlx5/core/en_accel/ktls_tx.c | 22 +++++++++-------- 2 files changed, 25 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c index 4be770443b0c..9b597cb24598 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c @@ -621,15 +621,6 @@ int mlx5e_ktls_add_rx(struct net_device *netdev, struct sock *sk, if (unlikely(!priv_rx)) return -ENOMEM; - dek = mlx5_ktls_create_key(priv->tls->dek_pool, crypto_info); - if (IS_ERR(dek)) { - err = PTR_ERR(dek); - goto err_create_key; - } - priv_rx->dek = dek; - - INIT_LIST_HEAD(&priv_rx->list); - spin_lock_init(&priv_rx->lock); switch (crypto_info->cipher_type) { case TLS_CIPHER_AES_GCM_128: priv_rx->crypto_info.crypto_info_128 = @@ -642,9 +633,20 @@ int mlx5e_ktls_add_rx(struct net_device *netdev, struct sock *sk, default: WARN_ONCE(1, "Unsupported cipher type %u\n", crypto_info->cipher_type); - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + goto err_cipher_type; } + dek = mlx5_ktls_create_key(priv->tls->dek_pool, crypto_info); + if (IS_ERR(dek)) { + err = PTR_ERR(dek); + goto err_cipher_type; + } + priv_rx->dek = dek; + + INIT_LIST_HEAD(&priv_rx->list); + spin_lock_init(&priv_rx->lock); + rxq = mlx5e_ktls_sk_get_rxq(sk); priv_rx->rxq = rxq; priv_rx->sk = sk; @@ -677,7 +679,7 @@ err_post_wqes: mlx5e_tir_destroy(&priv_rx->tir); err_create_tir: mlx5_ktls_destroy_key(priv->tls->dek_pool, priv_rx->dek); -err_create_key: +err_cipher_type: kfree(priv_rx); return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c index 60b3e08a1028..0e4c0a093293 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c @@ -469,14 +469,6 @@ int mlx5e_ktls_add_tx(struct net_device *netdev, struct sock *sk, if (IS_ERR(priv_tx)) return PTR_ERR(priv_tx); - dek = mlx5_ktls_create_key(priv->tls->dek_pool, crypto_info); - if (IS_ERR(dek)) { - err = PTR_ERR(dek); - goto err_create_key; - } - priv_tx->dek = dek; - - priv_tx->expected_seq = start_offload_tcp_sn; switch (crypto_info->cipher_type) { case TLS_CIPHER_AES_GCM_128: priv_tx->crypto_info.crypto_info_128 = @@ -489,8 +481,18 @@ int mlx5e_ktls_add_tx(struct net_device *netdev, struct sock *sk, default: WARN_ONCE(1, "Unsupported cipher type %u\n", crypto_info->cipher_type); - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + goto err_pool_push; } + + dek = mlx5_ktls_create_key(priv->tls->dek_pool, crypto_info); + if (IS_ERR(dek)) { + err = PTR_ERR(dek); + goto err_pool_push; + } + + priv_tx->dek = dek; + priv_tx->expected_seq = start_offload_tcp_sn; priv_tx->tx_ctx = tls_offload_ctx_tx(tls_ctx); mlx5e_set_ktls_tx_priv_ctx(tls_ctx, priv_tx); @@ -500,7 +502,7 @@ int mlx5e_ktls_add_tx(struct net_device *netdev, struct sock *sk, return 0; -err_create_key: +err_pool_push: pool_push(pool, priv_tx); return err; } From 031a163f2c476adcb2c01e27a7d323e66174ac11 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 28 Feb 2023 10:36:19 +0200 Subject: [PATCH 064/108] net/mlx5: Set BREAK_FW_WAIT flag first when removing driver Currently, BREAK_FW_WAIT flag is set after syncing with fw_reset. However, fw_reset can call mlx5_load_one() which is waiting for fw init bit and BREAK_FW_WAIT flag is intended to stop. e.g.: the driver might wait on a loop it should exit. Fix it by setting the flag before syncing with fw_reset. Fixes: 8324a02c342a ("net/mlx5: Add exit route when waiting for FW") Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index f36a3aa4b5c8..f1de152a6113 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1789,11 +1789,11 @@ static void remove_one(struct pci_dev *pdev) struct mlx5_core_dev *dev = pci_get_drvdata(pdev); struct devlink *devlink = priv_to_devlink(dev); + set_bit(MLX5_BREAK_FW_WAIT, &dev->intf_state); /* mlx5_drain_fw_reset() is using devlink APIs. Hence, we must drain * fw_reset before unregistering the devlink. */ mlx5_drain_fw_reset(dev); - set_bit(MLX5_BREAK_FW_WAIT, &dev->intf_state); devlink_unregister(devlink); mlx5_sriov_disable(pdev); mlx5_crdump_disable(dev); From 78dee7befd56987283c13877b834c0aa97ad51b9 Mon Sep 17 00:00:00 2001 From: Adham Faris Date: Mon, 23 Jan 2023 10:09:01 +0200 Subject: [PATCH 065/108] net/mlx5e: Lower maximum allowed MTU in XSK to match XDP prerequisites XSK redirecting XDP programs require linearity, hence applies restrictions on the MTU. For PAGE_SIZE=4K, MTU shouldn't exceed 3498. Features that contradict with XDP such HW-LRO and HW-GRO are enforced by the driver in advance, during XSK params validation, except for MTU, which was not enforced before this patch. This has been spotted during test scenario described below: Attaching xdpsock program (PAGE_SIZE=4K), with MTU < 3498, detaching XDP program, changing the MTU to arbitrary value in the range [3499, 3754], attaching XDP program again, which ended up with failure since MTU is > 3498. This commit lowers the XSK MTU limitation to be aligned with XDP MTU limitation, since XSK socket is meaningless without XDP program. Signed-off-by: Adham Faris Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 56fc2aebb9ee..a7f2ab22cc40 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4169,13 +4169,17 @@ static bool mlx5e_xsk_validate_mtu(struct net_device *netdev, struct xsk_buff_pool *xsk_pool = mlx5e_xsk_get_pool(&chs->params, chs->params.xsk, ix); struct mlx5e_xsk_param xsk; + int max_xdp_mtu; if (!xsk_pool) continue; mlx5e_build_xsk_param(xsk_pool, &xsk); + max_xdp_mtu = mlx5e_xdp_max_mtu(new_params, &xsk); - if (!mlx5e_validate_xsk_param(new_params, &xsk, mdev)) { + /* Validate XSK params and XDP MTU in advance */ + if (!mlx5e_validate_xsk_param(new_params, &xsk, mdev) || + new_params->sw_mtu > max_xdp_mtu) { u32 hr = mlx5e_get_linear_rq_headroom(new_params, &xsk); int max_mtu_frame, max_mtu_page, max_mtu; @@ -4185,9 +4189,9 @@ static bool mlx5e_xsk_validate_mtu(struct net_device *netdev, */ max_mtu_frame = MLX5E_HW2SW_MTU(new_params, xsk.chunk_size - hr); max_mtu_page = MLX5E_HW2SW_MTU(new_params, SKB_MAX_HEAD(0)); - max_mtu = min(max_mtu_frame, max_mtu_page); + max_mtu = min3(max_mtu_frame, max_mtu_page, max_xdp_mtu); - netdev_err(netdev, "MTU %d is too big for an XSK running on channel %u. Try MTU <= %d\n", + netdev_err(netdev, "MTU %d is too big for an XSK running on channel %u or its redirection XDP program. Try MTU <= %d\n", new_params->sw_mtu, ix, max_mtu); return false; } From d1a0075ad6b693c2bd41e7aedb8a4f3b74b6999c Mon Sep 17 00:00:00 2001 From: Oz Shlomo Date: Thu, 16 Feb 2023 12:34:21 +0000 Subject: [PATCH 066/108] net/sched: TC, fix raw counter initialization Freed counters may be reused by fs core. As such, raw counters may not be initialized to zero. Cache the counter values when the action stats object is initialized to have a proper base value for calculating the difference from the previous query. Fixes: 2b68d659a704 ("net/mlx5e: TC, support per action stats") Signed-off-by: Oz Shlomo Reviewed-by: Paul Blakey Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/tc/act_stats.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act_stats.c index 626cb7470fa5..07c1895a2b23 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act_stats.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act_stats.c @@ -64,6 +64,7 @@ mlx5e_tc_act_stats_add(struct mlx5e_tc_act_stats_handle *handle, { struct mlx5e_tc_act_stats *act_stats, *old_act_stats; struct rhashtable *ht = &handle->ht; + u64 lastused; int err = 0; act_stats = kvzalloc(sizeof(*act_stats), GFP_KERNEL); @@ -73,6 +74,10 @@ mlx5e_tc_act_stats_add(struct mlx5e_tc_act_stats_handle *handle, act_stats->tc_act_cookie = act_cookie; act_stats->counter = counter; + mlx5_fc_query_cached_raw(counter, + &act_stats->lastbytes, + &act_stats->lastpackets, &lastused); + rcu_read_lock(); old_act_stats = rhashtable_lookup_get_insert_fast(ht, &act_stats->hash, From 1166add424dae14ccdb64a6eefbf26766c9d0ef2 Mon Sep 17 00:00:00 2001 From: Oz Shlomo Date: Tue, 21 Feb 2023 14:46:56 +0000 Subject: [PATCH 067/108] net/mlx5e: TC, fix missing error code Missing error code when mlx5e_tc_act_stats_create fails Fixes: d13674b1d14c ("net/mlx5e: TC, map tc action cookie to a hw counter") Reported-by: Dan Carpenter Signed-off-by: Oz Shlomo Reviewed-by: Paul Blakey Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index cc35cbc9934d..d2e191ee0704 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -5305,8 +5305,10 @@ int mlx5e_tc_nic_init(struct mlx5e_priv *priv) mlx5e_tc_debugfs_init(tc, mlx5e_fs_get_debugfs_root(priv->fs)); tc->action_stats_handle = mlx5e_tc_act_stats_create(); - if (IS_ERR(tc->action_stats_handle)) + if (IS_ERR(tc->action_stats_handle)) { + err = PTR_ERR(tc->action_stats_handle); goto err_act_stats; + } return 0; @@ -5441,8 +5443,10 @@ int mlx5e_tc_esw_init(struct mlx5_rep_uplink_priv *uplink_priv) } uplink_priv->action_stats_handle = mlx5e_tc_act_stats_create(); - if (IS_ERR(uplink_priv->action_stats_handle)) + if (IS_ERR(uplink_priv->action_stats_handle)) { + err = PTR_ERR(uplink_priv->action_stats_handle); goto err_action_counter; + } return 0; From b23bf10cca59b2955fa5c2b5e4f753962b4f88ca Mon Sep 17 00:00:00 2001 From: Oz Shlomo Date: Tue, 21 Feb 2023 15:24:39 +0000 Subject: [PATCH 068/108] net/mlx5e: TC, fix cloned flow attribute Currently the cloned flow attr resets the original tc action cookies count. Fix that by resetting the cloned flow attribute. Fixes: cca7eac13856 ("net/mlx5e: TC, store tc action cookies per attr") Signed-off-by: Oz Shlomo Reviewed-by: Paul Blakey Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index d2e191ee0704..6bfed633343a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -3752,7 +3752,7 @@ mlx5e_clone_flow_attr_for_post_act(struct mlx5_flow_attr *attr, parse_attr->filter_dev = attr->parse_attr->filter_dev; attr2->action = 0; attr2->counter = NULL; - attr->tc_act_cookies_count = 0; + attr2->tc_act_cookies_count = 0; attr2->flags = 0; attr2->parse_attr = parse_attr; attr2->dest_chain = 0; From c7b7c64ab5821352db0b3fbaa92773e5a60bfaa7 Mon Sep 17 00:00:00 2001 From: Oz Shlomo Date: Wed, 22 Feb 2023 10:03:36 +0000 Subject: [PATCH 069/108] net/mlx5e: TC, Remove error message log print The cited commit attempts to update the hw stats when dumping tc actions. However, the driver may be called to update the stats of a police action that may not be in hardware. In such cases the driver will fail to lookup the police action object and will output an error message both to extack and dmesg. The dmesg error is confusing as it may not indicate an actual error. Remove the dmesg error. Fixes: 2b68d659a704 ("net/mlx5e: TC, support per action stats") Signed-off-by: Oz Shlomo Reviewed-by: Paul Blakey Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/police.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/police.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/police.c index c4378afdec09..1bd1c94fb977 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/police.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/police.c @@ -178,7 +178,6 @@ tc_act_police_stats(struct mlx5e_priv *priv, meter = mlx5e_tc_meter_get(priv->mdev, ¶ms); if (IS_ERR(meter)) { NL_SET_ERR_MSG_MOD(fl_act->extack, "Failed to get flow meter"); - mlx5_core_err(priv->mdev, "Failed to get flow meter %d\n", params.index); return PTR_ERR(meter); } From 7c10131803e45269ddc6c817f19ed649110f3cae Mon Sep 17 00:00:00 2001 From: Shawn Bohrer Date: Tue, 14 Mar 2023 10:33:51 -0500 Subject: [PATCH 070/108] veth: Fix use after free in XDP_REDIRECT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 718a18a0c8a6 ("veth: Rework veth_xdp_rcv_skb in order to accept non-linear skb") introduced a bug where it tried to use pskb_expand_head() if the headroom was less than XDP_PACKET_HEADROOM. This however uses kmalloc to expand the head, which will later allow consume_skb() to free the skb while is it still in use by AF_XDP. Previously if the headroom was less than XDP_PACKET_HEADROOM we continued on to allocate a new skb from pages so this restores that behavior. BUG: KASAN: use-after-free in __xsk_rcv+0x18d/0x2c0 Read of size 78 at addr ffff888976250154 by task napi/iconduit-g/148640 CPU: 5 PID: 148640 Comm: napi/iconduit-g Kdump: loaded Tainted: G O 6.1.4-cloudflare-kasan-2023.1.2 #1 Hardware name: Quanta Computer Inc. QuantaPlex T41S-2U/S2S-MB, BIOS S2S_3B10.03 06/21/2018 Call Trace: dump_stack_lvl+0x34/0x48 print_report+0x170/0x473 ? __xsk_rcv+0x18d/0x2c0 kasan_report+0xad/0x130 ? __xsk_rcv+0x18d/0x2c0 kasan_check_range+0x149/0x1a0 memcpy+0x20/0x60 __xsk_rcv+0x18d/0x2c0 __xsk_map_redirect+0x1f3/0x490 ? veth_xdp_rcv_skb+0x89c/0x1ba0 [veth] xdp_do_redirect+0x5ca/0xd60 veth_xdp_rcv_skb+0x935/0x1ba0 [veth] ? __netif_receive_skb_list_core+0x671/0x920 ? veth_xdp+0x670/0x670 [veth] veth_xdp_rcv+0x304/0xa20 [veth] ? do_xdp_generic+0x150/0x150 ? veth_xdp_rcv_one+0xde0/0xde0 [veth] ? _raw_spin_lock_bh+0xe0/0xe0 ? newidle_balance+0x887/0xe30 ? __perf_event_task_sched_in+0xdb/0x800 veth_poll+0x139/0x571 [veth] ? veth_xdp_rcv+0xa20/0xa20 [veth] ? _raw_spin_unlock+0x39/0x70 ? finish_task_switch.isra.0+0x17e/0x7d0 ? __switch_to+0x5cf/0x1070 ? __schedule+0x95b/0x2640 ? io_schedule_timeout+0x160/0x160 __napi_poll+0xa1/0x440 napi_threaded_poll+0x3d1/0x460 ? __napi_poll+0x440/0x440 ? __kthread_parkme+0xc6/0x1f0 ? __napi_poll+0x440/0x440 kthread+0x2a2/0x340 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x22/0x30 Freed by task 148640: kasan_save_stack+0x23/0x50 kasan_set_track+0x21/0x30 kasan_save_free_info+0x2a/0x40 ____kasan_slab_free+0x169/0x1d0 slab_free_freelist_hook+0xd2/0x190 __kmem_cache_free+0x1a1/0x2f0 skb_release_data+0x449/0x600 consume_skb+0x9f/0x1c0 veth_xdp_rcv_skb+0x89c/0x1ba0 [veth] veth_xdp_rcv+0x304/0xa20 [veth] veth_poll+0x139/0x571 [veth] __napi_poll+0xa1/0x440 napi_threaded_poll+0x3d1/0x460 kthread+0x2a2/0x340 ret_from_fork+0x22/0x30 The buggy address belongs to the object at ffff888976250000 which belongs to the cache kmalloc-2k of size 2048 The buggy address is located 340 bytes inside of 2048-byte region [ffff888976250000, ffff888976250800) The buggy address belongs to the physical page: page:00000000ae18262a refcount:2 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x976250 head:00000000ae18262a order:3 compound_mapcount:0 compound_pincount:0 flags: 0x2ffff800010200(slab|head|node=0|zone=2|lastcpupid=0x1ffff) raw: 002ffff800010200 0000000000000000 dead000000000122 ffff88810004cf00 raw: 0000000000000000 0000000080080008 00000002ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888976250000: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff888976250080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb > ffff888976250100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff888976250180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff888976250200: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Fixes: 718a18a0c8a6 ("veth: Rework veth_xdp_rcv_skb in order to accept non-linear skb") Signed-off-by: Shawn Bohrer Acked-by: Lorenzo Bianconi Acked-by: Toshiaki Makita Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20230314153351.2201328-1-sbohrer@cloudflare.com Signed-off-by: Jakub Kicinski --- drivers/net/veth.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 4da74ac27f9a..a30a66cace14 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -708,7 +708,8 @@ static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq, u32 frame_sz; if (skb_shared(skb) || skb_head_is_locked(skb) || - skb_shinfo(skb)->nr_frags) { + skb_shinfo(skb)->nr_frags || + skb_headroom(skb) < XDP_PACKET_HEADROOM) { u32 size, len, max_head_size, off; struct sk_buff *nskb; struct page *page; @@ -773,9 +774,6 @@ static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq, consume_skb(skb); skb = nskb; - } else if (skb_headroom(skb) < XDP_PACKET_HEADROOM && - pskb_expand_head(skb, VETH_XDP_HEADROOM, 0, GFP_ATOMIC)) { - goto drop; } /* SKB "head" area always have tailroom for skb_shared_info */ From cd356010ce4c69ac7e1a40586112df24d22c6a4b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 14 Mar 2023 17:30:25 +0200 Subject: [PATCH 071/108] net: phy: mscc: fix deadlock in phy_ethtool_{get,set}_wol() Since the blamed commit, phy_ethtool_get_wol() and phy_ethtool_set_wol() acquire phydev->lock, but the mscc phy driver implementations, vsc85xx_wol_get() and vsc85xx_wol_set(), acquire the same lock as well, resulting in a deadlock. $ ip link set swp3 down ============================================ WARNING: possible recursive locking detected mscc_felix 0000:00:00.5 swp3: Link is Down -------------------------------------------- ip/375 is trying to acquire lock: ffff3d7e82e987a8 (&dev->lock){+.+.}-{4:4}, at: vsc85xx_wol_get+0x2c/0xf4 but task is already holding lock: ffff3d7e82e987a8 (&dev->lock){+.+.}-{4:4}, at: phy_ethtool_get_wol+0x3c/0x6c other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&dev->lock); lock(&dev->lock); *** DEADLOCK *** May be due to missing lock nesting notation 2 locks held by ip/375: #0: ffffd43b2a955788 (rtnl_mutex){+.+.}-{4:4}, at: rtnetlink_rcv_msg+0x144/0x58c #1: ffff3d7e82e987a8 (&dev->lock){+.+.}-{4:4}, at: phy_ethtool_get_wol+0x3c/0x6c Call trace: __mutex_lock+0x98/0x454 mutex_lock_nested+0x2c/0x38 vsc85xx_wol_get+0x2c/0xf4 phy_ethtool_get_wol+0x50/0x6c phy_suspend+0x84/0xcc phy_state_machine+0x1b8/0x27c phy_stop+0x70/0x154 phylink_stop+0x34/0xc0 dsa_port_disable_rt+0x2c/0xa4 dsa_slave_close+0x38/0xec __dev_close_many+0xc8/0x16c __dev_change_flags+0xdc/0x218 dev_change_flags+0x24/0x6c do_setlink+0x234/0xea4 __rtnl_newlink+0x46c/0x878 rtnl_newlink+0x50/0x7c rtnetlink_rcv_msg+0x16c/0x58c Removing the mutex_lock(&phydev->lock) calls from the driver restores the functionality. Fixes: 2f987d486610 ("net: phy: Add locks to ethtool functions") Signed-off-by: Vladimir Oltean Reviewed-by: Simon Horman Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20230314153025.2372970-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/mscc/mscc_main.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/drivers/net/phy/mscc/mscc_main.c b/drivers/net/phy/mscc/mscc_main.c index 8a13b1ad9a33..62bf99e45af1 100644 --- a/drivers/net/phy/mscc/mscc_main.c +++ b/drivers/net/phy/mscc/mscc_main.c @@ -280,12 +280,9 @@ static int vsc85xx_wol_set(struct phy_device *phydev, u16 pwd[3] = {0, 0, 0}; struct ethtool_wolinfo *wol_conf = wol; - mutex_lock(&phydev->lock); rc = phy_select_page(phydev, MSCC_PHY_PAGE_EXTENDED_2); - if (rc < 0) { - rc = phy_restore_page(phydev, rc, rc); - goto out_unlock; - } + if (rc < 0) + return phy_restore_page(phydev, rc, rc); if (wol->wolopts & WAKE_MAGIC) { /* Store the device address for the magic packet */ @@ -323,7 +320,7 @@ static int vsc85xx_wol_set(struct phy_device *phydev, rc = phy_restore_page(phydev, rc, rc > 0 ? 0 : rc); if (rc < 0) - goto out_unlock; + return rc; if (wol->wolopts & WAKE_MAGIC) { /* Enable the WOL interrupt */ @@ -331,22 +328,19 @@ static int vsc85xx_wol_set(struct phy_device *phydev, reg_val |= MII_VSC85XX_INT_MASK_WOL; rc = phy_write(phydev, MII_VSC85XX_INT_MASK, reg_val); if (rc) - goto out_unlock; + return rc; } else { /* Disable the WOL interrupt */ reg_val = phy_read(phydev, MII_VSC85XX_INT_MASK); reg_val &= (~MII_VSC85XX_INT_MASK_WOL); rc = phy_write(phydev, MII_VSC85XX_INT_MASK, reg_val); if (rc) - goto out_unlock; + return rc; } /* Clear WOL iterrupt status */ reg_val = phy_read(phydev, MII_VSC85XX_INT_STATUS); -out_unlock: - mutex_unlock(&phydev->lock); - - return rc; + return 0; } static void vsc85xx_wol_get(struct phy_device *phydev, @@ -358,10 +352,9 @@ static void vsc85xx_wol_get(struct phy_device *phydev, u16 pwd[3] = {0, 0, 0}; struct ethtool_wolinfo *wol_conf = wol; - mutex_lock(&phydev->lock); rc = phy_select_page(phydev, MSCC_PHY_PAGE_EXTENDED_2); if (rc < 0) - goto out_unlock; + goto out_restore_page; reg_val = __phy_read(phydev, MSCC_PHY_WOL_MAC_CONTROL); if (reg_val & SECURE_ON_ENABLE) @@ -377,9 +370,8 @@ static void vsc85xx_wol_get(struct phy_device *phydev, } } -out_unlock: +out_restore_page: phy_restore_page(phydev, rc, rc > 0 ? 0 : rc); - mutex_unlock(&phydev->lock); } #if IS_ENABLED(CONFIG_OF_MDIO) From 077706165717686a2a6a71405fef036cd5b37ae0 Mon Sep 17 00:00:00 2001 From: Arseniy Krasnov Date: Tue, 14 Mar 2023 14:05:48 +0300 Subject: [PATCH 072/108] virtio/vsock: don't use skbuff state to account credit 'skb->len' can vary when we partially read the data, this complicates the calculation of credit to be updated in 'virtio_transport_inc_rx_pkt()/ virtio_transport_dec_rx_pkt()'. Also in 'virtio_transport_dec_rx_pkt()' we were miscalculating the credit since 'skb->len' was redundant. For these reasons, let's replace the use of skbuff state to calculate new 'rx_bytes'/'fwd_cnt' values with explicit value as input argument. This makes code more simple, because it is not needed to change skbuff state before each call to update 'rx_bytes'/'fwd_cnt'. Fixes: 71dc9ec9ac7d ("virtio/vsock: replace virtio_vsock_pkt with sk_buff") Signed-off-by: Arseniy Krasnov Reviewed-by: Stefano Garzarella Acked-by: Bobby Eshleman Signed-off-by: David S. Miller --- net/vmw_vsock/virtio_transport_common.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index a1581c77cf84..618680fd9906 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -241,21 +241,18 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, } static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, - struct sk_buff *skb) + u32 len) { - if (vvs->rx_bytes + skb->len > vvs->buf_alloc) + if (vvs->rx_bytes + len > vvs->buf_alloc) return false; - vvs->rx_bytes += skb->len; + vvs->rx_bytes += len; return true; } static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs, - struct sk_buff *skb) + u32 len) { - int len; - - len = skb_headroom(skb) - sizeof(struct virtio_vsock_hdr) - skb->len; vvs->rx_bytes -= len; vvs->fwd_cnt += len; } @@ -388,7 +385,9 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, skb_pull(skb, bytes); if (skb->len == 0) { - virtio_transport_dec_rx_pkt(vvs, skb); + u32 pkt_len = le32_to_cpu(virtio_vsock_hdr(skb)->len); + + virtio_transport_dec_rx_pkt(vvs, pkt_len); consume_skb(skb); } else { __skb_queue_head(&vvs->rx_queue, skb); @@ -437,17 +436,17 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, while (!msg_ready) { struct virtio_vsock_hdr *hdr; + size_t pkt_len; skb = __skb_dequeue(&vvs->rx_queue); if (!skb) break; hdr = virtio_vsock_hdr(skb); + pkt_len = (size_t)le32_to_cpu(hdr->len); if (dequeued_len >= 0) { - size_t pkt_len; size_t bytes_to_copy; - pkt_len = (size_t)le32_to_cpu(hdr->len); bytes_to_copy = min(user_buf_len, pkt_len); if (bytes_to_copy) { @@ -484,7 +483,7 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, msg->msg_flags |= MSG_EOR; } - virtio_transport_dec_rx_pkt(vvs, skb); + virtio_transport_dec_rx_pkt(vvs, pkt_len); kfree_skb(skb); } @@ -1040,7 +1039,7 @@ virtio_transport_recv_enqueue(struct vsock_sock *vsk, spin_lock_bh(&vvs->rx_lock); - can_enqueue = virtio_transport_inc_rx_pkt(vvs, skb); + can_enqueue = virtio_transport_inc_rx_pkt(vvs, len); if (!can_enqueue) { free_pkt = true; goto out; From 6825e6b4f8e53799d83bc39ca6ec5baed4e2adde Mon Sep 17 00:00:00 2001 From: Arseniy Krasnov Date: Tue, 14 Mar 2023 14:06:53 +0300 Subject: [PATCH 073/108] virtio/vsock: remove redundant 'skb_pull()' call Since we now no longer use 'skb->len' to update credit, there is no sense to update skbuff state, because it is used only once after dequeue to copy data and then will be released. Fixes: 71dc9ec9ac7d ("virtio/vsock: replace virtio_vsock_pkt with sk_buff") Signed-off-by: Arseniy Krasnov Reviewed-by: Stefano Garzarella Acked-by: Bobby Eshleman Signed-off-by: David S. Miller --- net/vmw_vsock/virtio_transport_common.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 618680fd9906..9a411475e201 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -465,7 +465,6 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, dequeued_len = err; } else { user_buf_len -= bytes_to_copy; - skb_pull(skb, bytes_to_copy); } spin_lock_bh(&vvs->rx_lock); From 8daaf39f7f6ef53a11817f6a11ec104016c3545f Mon Sep 17 00:00:00 2001 From: Arseniy Krasnov Date: Tue, 14 Mar 2023 14:08:20 +0300 Subject: [PATCH 074/108] virtio/vsock: don't drop skbuff on copy failure This returns behaviour of SOCK_STREAM read as before skbuff usage. When copying to user fails current skbuff won't be dropped, but returned to sockets's queue. Technically instead of 'skb_dequeue()', 'skb_peek()' is called and when skbuff becomes empty, it is removed from queue by '__skb_unlink()'. Fixes: 71dc9ec9ac7d ("virtio/vsock: replace virtio_vsock_pkt with sk_buff") Signed-off-by: Arseniy Krasnov Reviewed-by: Stefano Garzarella Acked-by: Bobby Eshleman Signed-off-by: David S. Miller --- net/vmw_vsock/virtio_transport_common.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 9a411475e201..6564192e7f20 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -364,7 +364,7 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, spin_lock_bh(&vvs->rx_lock); while (total < len && !skb_queue_empty(&vvs->rx_queue)) { - skb = __skb_dequeue(&vvs->rx_queue); + skb = skb_peek(&vvs->rx_queue); bytes = len - total; if (bytes > skb->len) @@ -388,9 +388,8 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, u32 pkt_len = le32_to_cpu(virtio_vsock_hdr(skb)->len); virtio_transport_dec_rx_pkt(vvs, pkt_len); + __skb_unlink(skb, &vvs->rx_queue); consume_skb(skb); - } else { - __skb_queue_head(&vvs->rx_queue, skb); } } From 7e699d2a4e8104d304e921ac5e0a0c73f0f7b623 Mon Sep 17 00:00:00 2001 From: Arseniy Krasnov Date: Tue, 14 Mar 2023 14:09:27 +0300 Subject: [PATCH 075/108] test/vsock: copy to user failure test This adds SOCK_STREAM and SOCK_SEQPACKET tests for invalid buffer case. It tries to read data to NULL buffer (data already presents in socket's queue), then uses valid buffer. For SOCK_STREAM second read must return data, because skbuff is not dropped, but for SOCK_SEQPACKET skbuff will be dropped by kernel, and 'recv()' will return EAGAIN. Signed-off-by: Arseniy Krasnov Reviewed-by: Stefano Garzarella Signed-off-by: David S. Miller --- tools/testing/vsock/vsock_test.c | 118 +++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index 67e9f9df3a8c..3de10dbb50f5 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -860,6 +860,114 @@ static void test_stream_poll_rcvlowat_client(const struct test_opts *opts) close(fd); } +#define INV_BUF_TEST_DATA_LEN 512 + +static void test_inv_buf_client(const struct test_opts *opts, bool stream) +{ + unsigned char data[INV_BUF_TEST_DATA_LEN] = {0}; + ssize_t ret; + int fd; + + if (stream) + fd = vsock_stream_connect(opts->peer_cid, 1234); + else + fd = vsock_seqpacket_connect(opts->peer_cid, 1234); + + if (fd < 0) { + perror("connect"); + exit(EXIT_FAILURE); + } + + control_expectln("SENDDONE"); + + /* Use invalid buffer here. */ + ret = recv(fd, NULL, sizeof(data), 0); + if (ret != -1) { + fprintf(stderr, "expected recv(2) failure, got %zi\n", ret); + exit(EXIT_FAILURE); + } + + if (errno != ENOMEM) { + fprintf(stderr, "unexpected recv(2) errno %d\n", errno); + exit(EXIT_FAILURE); + } + + ret = recv(fd, data, sizeof(data), MSG_DONTWAIT); + + if (stream) { + /* For SOCK_STREAM we must continue reading. */ + if (ret != sizeof(data)) { + fprintf(stderr, "expected recv(2) success, got %zi\n", ret); + exit(EXIT_FAILURE); + } + /* Don't check errno in case of success. */ + } else { + /* For SOCK_SEQPACKET socket's queue must be empty. */ + if (ret != -1) { + fprintf(stderr, "expected recv(2) failure, got %zi\n", ret); + exit(EXIT_FAILURE); + } + + if (errno != EAGAIN) { + fprintf(stderr, "unexpected recv(2) errno %d\n", errno); + exit(EXIT_FAILURE); + } + } + + control_writeln("DONE"); + + close(fd); +} + +static void test_inv_buf_server(const struct test_opts *opts, bool stream) +{ + unsigned char data[INV_BUF_TEST_DATA_LEN] = {0}; + ssize_t res; + int fd; + + if (stream) + fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL); + else + fd = vsock_seqpacket_accept(VMADDR_CID_ANY, 1234, NULL); + + if (fd < 0) { + perror("accept"); + exit(EXIT_FAILURE); + } + + res = send(fd, data, sizeof(data), 0); + if (res != sizeof(data)) { + fprintf(stderr, "unexpected send(2) result %zi\n", res); + exit(EXIT_FAILURE); + } + + control_writeln("SENDDONE"); + + control_expectln("DONE"); + + close(fd); +} + +static void test_stream_inv_buf_client(const struct test_opts *opts) +{ + test_inv_buf_client(opts, true); +} + +static void test_stream_inv_buf_server(const struct test_opts *opts) +{ + test_inv_buf_server(opts, true); +} + +static void test_seqpacket_inv_buf_client(const struct test_opts *opts) +{ + test_inv_buf_client(opts, false); +} + +static void test_seqpacket_inv_buf_server(const struct test_opts *opts) +{ + test_inv_buf_server(opts, false); +} + static struct test_case test_cases[] = { { .name = "SOCK_STREAM connection reset", @@ -920,6 +1028,16 @@ static struct test_case test_cases[] = { .run_client = test_seqpacket_bigmsg_client, .run_server = test_seqpacket_bigmsg_server, }, + { + .name = "SOCK_STREAM test invalid buffer", + .run_client = test_stream_inv_buf_client, + .run_server = test_stream_inv_buf_server, + }, + { + .name = "SOCK_SEQPACKET test invalid buffer", + .run_client = test_seqpacket_inv_buf_client, + .run_server = test_seqpacket_inv_buf_server, + }, {}, }; From b830c9642386867863ac64295185f896ff2928ac Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Tue, 14 Mar 2023 10:45:43 -0700 Subject: [PATCH 076/108] ice: xsk: disable txq irq before flushing hw ice_qp_dis() intends to stop a given queue pair that is a target of xsk pool attach/detach. One of the steps is to disable interrupts on these queues. It currently is broken in a way that txq irq is turned off *after* HW flush which in turn takes no effect. ice_qp_dis(): -> ice_qvec_dis_irq() --> disable rxq irq --> flush hw -> ice_vsi_stop_tx_ring() -->disable txq irq Below splat can be triggered by following steps: - start xdpsock WITHOUT loading xdp prog - run xdp_rxq_info with XDP_TX action on this interface - start traffic - terminate xdpsock [ 256.312485] BUG: kernel NULL pointer dereference, address: 0000000000000018 [ 256.319560] #PF: supervisor read access in kernel mode [ 256.324775] #PF: error_code(0x0000) - not-present page [ 256.329994] PGD 0 P4D 0 [ 256.332574] Oops: 0000 [#1] PREEMPT SMP NOPTI [ 256.337006] CPU: 3 PID: 32 Comm: ksoftirqd/3 Tainted: G OE 6.2.0-rc5+ #51 [ 256.345218] Hardware name: Intel Corporation S2600WFT/S2600WFT, BIOS SE5C620.86B.02.01.0008.031920191559 03/19/2019 [ 256.355807] RIP: 0010:ice_clean_rx_irq_zc+0x9c/0x7d0 [ice] [ 256.361423] Code: b7 8f 8a 00 00 00 66 39 ca 0f 84 f1 04 00 00 49 8b 47 40 4c 8b 24 d0 41 0f b7 45 04 66 25 ff 3f 66 89 04 24 0f 84 85 02 00 00 <49> 8b 44 24 18 0f b7 14 24 48 05 00 01 00 00 49 89 04 24 49 89 44 [ 256.380463] RSP: 0018:ffffc900088bfd20 EFLAGS: 00010206 [ 256.385765] RAX: 000000000000003c RBX: 0000000000000035 RCX: 000000000000067f [ 256.393012] RDX: 0000000000000775 RSI: 0000000000000000 RDI: ffff8881deb3ac80 [ 256.400256] RBP: 000000000000003c R08: ffff889847982710 R09: 0000000000010000 [ 256.407500] R10: ffffffff82c060c0 R11: 0000000000000004 R12: 0000000000000000 [ 256.414746] R13: ffff88811165eea0 R14: ffffc9000d255000 R15: ffff888119b37600 [ 256.421990] FS: 0000000000000000(0000) GS:ffff8897e0cc0000(0000) knlGS:0000000000000000 [ 256.430207] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 256.436036] CR2: 0000000000000018 CR3: 0000000005c0a006 CR4: 00000000007706e0 [ 256.443283] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 256.450527] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 256.457770] PKRU: 55555554 [ 256.460529] Call Trace: [ 256.463015] [ 256.465157] ? ice_xmit_zc+0x6e/0x150 [ice] [ 256.469437] ice_napi_poll+0x46d/0x680 [ice] [ 256.473815] ? _raw_spin_unlock_irqrestore+0x1b/0x40 [ 256.478863] __napi_poll+0x29/0x160 [ 256.482409] net_rx_action+0x136/0x260 [ 256.486222] __do_softirq+0xe8/0x2e5 [ 256.489853] ? smpboot_thread_fn+0x2c/0x270 [ 256.494108] run_ksoftirqd+0x2a/0x50 [ 256.497747] smpboot_thread_fn+0x1c1/0x270 [ 256.501907] ? __pfx_smpboot_thread_fn+0x10/0x10 [ 256.506594] kthread+0xea/0x120 [ 256.509785] ? __pfx_kthread+0x10/0x10 [ 256.513597] ret_from_fork+0x29/0x50 [ 256.517238] In fact, irqs were not disabled and napi managed to be scheduled and run while xsk_pool pointer was still valid, but SW ring of xdp_buff pointers was already freed. To fix this, call ice_qvec_dis_irq() after ice_vsi_stop_tx_ring(). Also while at it, remove redundant ice_clean_rx_ring() call - this is handled in ice_qp_clean_rings(). Fixes: 2d4238f55697 ("ice: Add support for AF_XDP") Signed-off-by: Maciej Fijalkowski Reviewed-by: Larysa Zaremba Tested-by: Chandan Kumar Rout (A Contingent Worker at Intel) Acked-by: John Fastabend Signed-off-by: Tony Nguyen Reviewed-by: Leon Romanovsky Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/ice/ice_xsk.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 31565bbafa22..d1e489da7363 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -184,8 +184,6 @@ static int ice_qp_dis(struct ice_vsi *vsi, u16 q_idx) } netif_tx_stop_queue(netdev_get_tx_queue(vsi->netdev, q_idx)); - ice_qvec_dis_irq(vsi, rx_ring, q_vector); - ice_fill_txq_meta(vsi, tx_ring, &txq_meta); err = ice_vsi_stop_tx_ring(vsi, ICE_NO_RESET, 0, tx_ring, &txq_meta); if (err) @@ -200,10 +198,11 @@ static int ice_qp_dis(struct ice_vsi *vsi, u16 q_idx) if (err) return err; } + ice_qvec_dis_irq(vsi, rx_ring, q_vector); + err = ice_vsi_ctrl_one_rx_ring(vsi, false, q_idx, true); if (err) return err; - ice_clean_rx_ring(rx_ring); ice_qvec_toggle_napi(vsi, q_vector, false); ice_qp_clean_rings(vsi, q_idx); From 636e8adf7878eab3614250234341bde45537f47a Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 14 Mar 2023 20:24:04 +0200 Subject: [PATCH 077/108] net: dsa: don't error out when drivers return ETH_DATA_LEN in .port_max_mtu() Currently, when dsa_slave_change_mtu() is called on a user port where dev->max_mtu is 1500 (as returned by ds->ops->port_max_mtu()), the code will stumble upon this check: if (new_master_mtu > mtu_limit) return -ERANGE; because new_master_mtu is adjusted for the tagger overhead but mtu_limit is not. But it would be good if the logic went through, for example if the DSA master really depends on an MTU adjustment to accept DSA-tagged frames. To make the code pass through the check, we need to adjust mtu_limit for the overhead as well, if the minimum restriction was caused by the DSA user port's MTU (dev->max_mtu). A DSA user port MTU and a DSA master MTU are always offset by the protocol overhead. Currently no drivers return 1500 .port_max_mtu(), but this is only temporary and a bug in itself - mv88e6xxx should have done that, but since commit b9c587fed61c ("dsa: mv88e6xxx: Include tagger overhead when setting MTU for DSA and CPU ports") it no longer does. This is a preparation for fixing that. Fixes: bfcb813203e6 ("net: dsa: configure the MTU for switch ports") Signed-off-by: Vladimir Oltean Reviewed-by: Simon Horman Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/slave.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 6957971c2db2..cac17183589f 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1933,6 +1933,7 @@ int dsa_slave_change_mtu(struct net_device *dev, int new_mtu) int new_master_mtu; int old_master_mtu; int mtu_limit; + int overhead; int cpu_mtu; int err; @@ -1961,9 +1962,10 @@ int dsa_slave_change_mtu(struct net_device *dev, int new_mtu) largest_mtu = slave_mtu; } - mtu_limit = min_t(int, master->max_mtu, dev->max_mtu); + overhead = dsa_tag_protocol_overhead(cpu_dp->tag_ops); + mtu_limit = min_t(int, master->max_mtu, dev->max_mtu + overhead); old_master_mtu = master->mtu; - new_master_mtu = largest_mtu + dsa_tag_protocol_overhead(cpu_dp->tag_ops); + new_master_mtu = largest_mtu + overhead; if (new_master_mtu > mtu_limit) return -ERANGE; @@ -1998,8 +2000,7 @@ int dsa_slave_change_mtu(struct net_device *dev, int new_mtu) out_port_failed: if (new_master_mtu != old_master_mtu) - dsa_port_mtu_change(cpu_dp, old_master_mtu - - dsa_tag_protocol_overhead(cpu_dp->tag_ops)); + dsa_port_mtu_change(cpu_dp, old_master_mtu - overhead); out_cpu_failed: if (new_master_mtu != old_master_mtu) dev_set_mtu(master, old_master_mtu); From 7e9517375a14f44ee830ca1c3278076dd65fcc8f Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 14 Mar 2023 20:24:05 +0200 Subject: [PATCH 078/108] net: dsa: mv88e6xxx: fix max_mtu of 1492 on 6165, 6191, 6220, 6250, 6290 There are 3 classes of switch families that the driver is aware of, as far as mv88e6xxx_change_mtu() is concerned: - MTU configuration is available per port. Here, the chip->info->ops->port_set_jumbo_size() method will be present. - MTU configuration is global to the switch. Here, the chip->info->ops->set_max_frame_size() method will be present. - We don't know how to change the MTU. Here, none of the above methods will be present. Switch families MV88E6165, MV88E6191, MV88E6220, MV88E6250 and MV88E6290 fall in category 3. The blamed commit has adjusted the MTU for all 3 categories by EDSA_HLEN (8 bytes), resulting in a new maximum MTU of 1492 being reported by the driver for these switches. I don't have the hardware to test, but I do have a MV88E6390 switch on which I can simulate this by commenting out its .port_set_jumbo_size definition from mv88e6390_ops. The result is this set of messages at probe time: mv88e6085 d0032004.mdio-mii:10: nonfatal error -34 setting MTU to 1500 on port 1 mv88e6085 d0032004.mdio-mii:10: nonfatal error -34 setting MTU to 1500 on port 2 mv88e6085 d0032004.mdio-mii:10: nonfatal error -34 setting MTU to 1500 on port 3 mv88e6085 d0032004.mdio-mii:10: nonfatal error -34 setting MTU to 1500 on port 4 mv88e6085 d0032004.mdio-mii:10: nonfatal error -34 setting MTU to 1500 on port 5 mv88e6085 d0032004.mdio-mii:10: nonfatal error -34 setting MTU to 1500 on port 6 mv88e6085 d0032004.mdio-mii:10: nonfatal error -34 setting MTU to 1500 on port 7 mv88e6085 d0032004.mdio-mii:10: nonfatal error -34 setting MTU to 1500 on port 8 It is highly implausible that there exist Ethernet switches which don't support the standard MTU of 1500 octets, and this is what the DSA framework says as well - the error comes from dsa_slave_create() -> dsa_slave_change_mtu(slave_dev, ETH_DATA_LEN). But the error messages are alarming, and it would be good to suppress them. As a consequence of this unlikeliness, we reimplement mv88e6xxx_get_max_mtu() and mv88e6xxx_change_mtu() on switches from the 3rd category as follows: the maximum supported MTU is 1500, and any request to set the MTU to a value larger than that fails in dev_validate_mtu(). Fixes: b9c587fed61c ("dsa: mv88e6xxx: Include tagger overhead when setting MTU for DSA and CPU ports") Signed-off-by: Vladimir Oltean Reviewed-by: Simon Horman Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/mv88e6xxx/chip.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 0a5d6c7bb128..30383c4f8fd0 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -3549,7 +3549,7 @@ static int mv88e6xxx_get_max_mtu(struct dsa_switch *ds, int port) return 10240 - VLAN_ETH_HLEN - EDSA_HLEN - ETH_FCS_LEN; else if (chip->info->ops->set_max_frame_size) return 1632 - VLAN_ETH_HLEN - EDSA_HLEN - ETH_FCS_LEN; - return 1522 - VLAN_ETH_HLEN - EDSA_HLEN - ETH_FCS_LEN; + return ETH_DATA_LEN; } static int mv88e6xxx_change_mtu(struct dsa_switch *ds, int port, int new_mtu) @@ -3557,6 +3557,17 @@ static int mv88e6xxx_change_mtu(struct dsa_switch *ds, int port, int new_mtu) struct mv88e6xxx_chip *chip = ds->priv; int ret = 0; + /* For families where we don't know how to alter the MTU, + * just accept any value up to ETH_DATA_LEN + */ + if (!chip->info->ops->port_set_jumbo_size && + !chip->info->ops->set_max_frame_size) { + if (new_mtu > ETH_DATA_LEN) + return -EINVAL; + + return 0; + } + if (dsa_is_dsa_port(ds, port) || dsa_is_cpu_port(ds, port)) new_mtu += EDSA_HLEN; @@ -3565,9 +3576,6 @@ static int mv88e6xxx_change_mtu(struct dsa_switch *ds, int port, int new_mtu) ret = chip->info->ops->port_set_jumbo_size(chip, port, new_mtu); else if (chip->info->ops->set_max_frame_size) ret = chip->info->ops->set_max_frame_size(chip, new_mtu); - else - if (new_mtu > 1522) - ret = -EINVAL; mv88e6xxx_reg_unlock(chip); return ret; From 1a87e641d8a50c30b63b1d90819bc607b4327596 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 14 Mar 2023 14:18:27 -0500 Subject: [PATCH 079/108] net: Use of_property_read_bool() for boolean properties It is preferred to use typed property access functions (i.e. of_property_read_ functions) rather than low-level of_get_property/of_find_property functions for reading properties. Convert reading boolean properties to of_property_read_bool(). Reviewed-by: Simon Horman Acked-by: Marc Kleine-Budde # for net/can Acked-by: Kalle Valo Acked-by: Nicolas Ferre Acked-by: Francois Romieu Reviewed-by: Wei Fang Signed-off-by: Rob Herring Signed-off-by: David S. Miller --- drivers/net/can/cc770/cc770_platform.c | 12 ++++++------ drivers/net/ethernet/cadence/macb_main.c | 2 +- drivers/net/ethernet/davicom/dm9000.c | 4 ++-- drivers/net/ethernet/freescale/fec_main.c | 2 +- drivers/net/ethernet/freescale/fec_mpc52xx.c | 2 +- drivers/net/ethernet/freescale/gianfar.c | 4 ++-- drivers/net/ethernet/ibm/emac/core.c | 8 ++++---- drivers/net/ethernet/ibm/emac/rgmii.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c | 3 +-- drivers/net/ethernet/sun/niu.c | 2 +- drivers/net/ethernet/ti/cpsw-phy-sel.c | 3 +-- drivers/net/ethernet/ti/netcp_ethss.c | 8 +++----- drivers/net/ethernet/via/via-velocity.c | 3 +-- drivers/net/ethernet/via/via-velocity.h | 2 +- drivers/net/ethernet/xilinx/ll_temac_main.c | 9 ++++----- drivers/net/wan/fsl_ucc_hdlc.c | 11 +++-------- drivers/net/wireless/ti/wlcore/spi.c | 3 +-- net/ncsi/ncsi-manage.c | 4 ++-- 18 files changed, 36 insertions(+), 48 deletions(-) diff --git a/drivers/net/can/cc770/cc770_platform.c b/drivers/net/can/cc770/cc770_platform.c index 8d916e2ee6c2..8dcc32e4e30e 100644 --- a/drivers/net/can/cc770/cc770_platform.c +++ b/drivers/net/can/cc770/cc770_platform.c @@ -93,20 +93,20 @@ static int cc770_get_of_node_data(struct platform_device *pdev, if (priv->can.clock.freq > 8000000) priv->cpu_interface |= CPUIF_DMC; - if (of_get_property(np, "bosch,divide-memory-clock", NULL)) + if (of_property_read_bool(np, "bosch,divide-memory-clock")) priv->cpu_interface |= CPUIF_DMC; - if (of_get_property(np, "bosch,iso-low-speed-mux", NULL)) + if (of_property_read_bool(np, "bosch,iso-low-speed-mux")) priv->cpu_interface |= CPUIF_MUX; if (!of_get_property(np, "bosch,no-comperator-bypass", NULL)) priv->bus_config |= BUSCFG_CBY; - if (of_get_property(np, "bosch,disconnect-rx0-input", NULL)) + if (of_property_read_bool(np, "bosch,disconnect-rx0-input")) priv->bus_config |= BUSCFG_DR0; - if (of_get_property(np, "bosch,disconnect-rx1-input", NULL)) + if (of_property_read_bool(np, "bosch,disconnect-rx1-input")) priv->bus_config |= BUSCFG_DR1; - if (of_get_property(np, "bosch,disconnect-tx1-output", NULL)) + if (of_property_read_bool(np, "bosch,disconnect-tx1-output")) priv->bus_config |= BUSCFG_DT1; - if (of_get_property(np, "bosch,polarity-dominant", NULL)) + if (of_property_read_bool(np, "bosch,polarity-dominant")) priv->bus_config |= BUSCFG_POL; prop = of_get_property(np, "bosch,clock-out-frequency", &prop_size); diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 6e141a8bbf43..66e30561569e 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -4990,7 +4990,7 @@ static int macb_probe(struct platform_device *pdev) bp->jumbo_max_len = macb_config->jumbo_max_len; bp->wol = 0; - if (of_get_property(np, "magic-packet", NULL)) + if (of_property_read_bool(np, "magic-packet")) bp->wol |= MACB_WOL_HAS_MAGIC_PACKET; device_set_wakeup_capable(&pdev->dev, bp->wol & MACB_WOL_HAS_MAGIC_PACKET); diff --git a/drivers/net/ethernet/davicom/dm9000.c b/drivers/net/ethernet/davicom/dm9000.c index b21e56de6167..05a89ab6766c 100644 --- a/drivers/net/ethernet/davicom/dm9000.c +++ b/drivers/net/ethernet/davicom/dm9000.c @@ -1393,9 +1393,9 @@ static struct dm9000_plat_data *dm9000_parse_dt(struct device *dev) if (!pdata) return ERR_PTR(-ENOMEM); - if (of_find_property(np, "davicom,ext-phy", NULL)) + if (of_property_read_bool(np, "davicom,ext-phy")) pdata->flags |= DM9000_PLATF_EXT_PHY; - if (of_find_property(np, "davicom,no-eeprom", NULL)) + if (of_property_read_bool(np, "davicom,no-eeprom")) pdata->flags |= DM9000_PLATF_NO_EEPROM; ret = of_get_mac_address(np, pdata->dev_addr); diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index c73e25f8995e..f3b16a6673e2 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -4251,7 +4251,7 @@ fec_probe(struct platform_device *pdev) if (ret) goto failed_ipc_init; - if (of_get_property(np, "fsl,magic-packet", NULL)) + if (of_property_read_bool(np, "fsl,magic-packet")) fep->wol_flag |= FEC_WOL_HAS_MAGIC_PACKET; ret = fec_enet_init_stop_mode(fep, np); diff --git a/drivers/net/ethernet/freescale/fec_mpc52xx.c b/drivers/net/ethernet/freescale/fec_mpc52xx.c index a7f4c3c29f3e..b88816b71ddf 100644 --- a/drivers/net/ethernet/freescale/fec_mpc52xx.c +++ b/drivers/net/ethernet/freescale/fec_mpc52xx.c @@ -937,7 +937,7 @@ static int mpc52xx_fec_probe(struct platform_device *op) priv->phy_node = of_parse_phandle(np, "phy-handle", 0); /* the 7-wire property means don't use MII mode */ - if (of_find_property(np, "fsl,7-wire-mode", NULL)) { + if (of_property_read_bool(np, "fsl,7-wire-mode")) { priv->seven_wire_mode = 1; dev_info(&ndev->dev, "using 7-wire PHY mode\n"); } diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c index b2def295523a..38d5013c6fed 100644 --- a/drivers/net/ethernet/freescale/gianfar.c +++ b/drivers/net/ethernet/freescale/gianfar.c @@ -787,10 +787,10 @@ static int gfar_of_init(struct platform_device *ofdev, struct net_device **pdev) else priv->interface = gfar_get_interface(dev); - if (of_find_property(np, "fsl,magic-packet", NULL)) + if (of_property_read_bool(np, "fsl,magic-packet")) priv->device_flags |= FSL_GIANFAR_DEV_HAS_MAGIC_PACKET; - if (of_get_property(np, "fsl,wake-on-filer", NULL)) + if (of_property_read_bool(np, "fsl,wake-on-filer")) priv->device_flags |= FSL_GIANFAR_DEV_HAS_WAKE_ON_FILER; priv->phy_node = of_parse_phandle(np, "phy-handle", 0); diff --git a/drivers/net/ethernet/ibm/emac/core.c b/drivers/net/ethernet/ibm/emac/core.c index 9b08e41ccc29..c97095abd26a 100644 --- a/drivers/net/ethernet/ibm/emac/core.c +++ b/drivers/net/ethernet/ibm/emac/core.c @@ -2939,9 +2939,9 @@ static int emac_init_config(struct emac_instance *dev) } /* Fixup some feature bits based on the device tree */ - if (of_get_property(np, "has-inverted-stacr-oc", NULL)) + if (of_property_read_bool(np, "has-inverted-stacr-oc")) dev->features |= EMAC_FTR_STACR_OC_INVERT; - if (of_get_property(np, "has-new-stacr-staopc", NULL)) + if (of_property_read_bool(np, "has-new-stacr-staopc")) dev->features |= EMAC_FTR_HAS_NEW_STACR; /* CAB lacks the appropriate properties */ @@ -3042,7 +3042,7 @@ static int emac_probe(struct platform_device *ofdev) * property here for now, but new flat device trees should set a * status property to "disabled" instead. */ - if (of_get_property(np, "unused", NULL) || !of_device_is_available(np)) + if (of_property_read_bool(np, "unused") || !of_device_is_available(np)) return -ENODEV; /* Find ourselves in the bootlist if we are there */ @@ -3333,7 +3333,7 @@ static void __init emac_make_bootlist(void) if (of_match_node(emac_match, np) == NULL) continue; - if (of_get_property(np, "unused", NULL)) + if (of_property_read_bool(np, "unused")) continue; idx = of_get_property(np, "cell-index", NULL); if (idx == NULL) diff --git a/drivers/net/ethernet/ibm/emac/rgmii.c b/drivers/net/ethernet/ibm/emac/rgmii.c index 242ef976fd15..50358cf00130 100644 --- a/drivers/net/ethernet/ibm/emac/rgmii.c +++ b/drivers/net/ethernet/ibm/emac/rgmii.c @@ -242,7 +242,7 @@ static int rgmii_probe(struct platform_device *ofdev) } /* Check for RGMII flags */ - if (of_get_property(ofdev->dev.of_node, "has-mdio", NULL)) + if (of_property_read_bool(ofdev->dev.of_node, "has-mdio")) dev->flags |= EMAC_RGMII_FLAG_HAS_MDIO; /* CAB lacks the right properties, fix this up */ diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c index ac8580f501e2..ac550d1ac015 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c @@ -213,8 +213,7 @@ imx_dwmac_parse_dt(struct imx_priv_data *dwmac, struct device *dev) struct device_node *np = dev->of_node; int err = 0; - if (of_get_property(np, "snps,rmii_refclk_ext", NULL)) - dwmac->rmii_refclk_ext = true; + dwmac->rmii_refclk_ext = of_property_read_bool(np, "snps,rmii_refclk_ext"); dwmac->clk_tx = devm_clk_get(dev, "tx"); if (IS_ERR(dwmac->clk_tx)) { diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c index e6144d963eaa..ab8b09a9ef61 100644 --- a/drivers/net/ethernet/sun/niu.c +++ b/drivers/net/ethernet/sun/niu.c @@ -9271,7 +9271,7 @@ static int niu_get_of_props(struct niu *np) if (model) strcpy(np->vpd.model, model); - if (of_find_property(dp, "hot-swappable-phy", NULL)) { + if (of_property_read_bool(dp, "hot-swappable-phy")) { np->flags |= (NIU_FLAGS_10G | NIU_FLAGS_FIBER | NIU_FLAGS_HOTPLUG_PHY); } diff --git a/drivers/net/ethernet/ti/cpsw-phy-sel.c b/drivers/net/ethernet/ti/cpsw-phy-sel.c index e8f38e3f7706..25e707d7b87c 100644 --- a/drivers/net/ethernet/ti/cpsw-phy-sel.c +++ b/drivers/net/ethernet/ti/cpsw-phy-sel.c @@ -226,8 +226,7 @@ static int cpsw_phy_sel_probe(struct platform_device *pdev) if (IS_ERR(priv->gmii_sel)) return PTR_ERR(priv->gmii_sel); - if (of_find_property(pdev->dev.of_node, "rmii-clock-ext", NULL)) - priv->rmii_clock_external = true; + priv->rmii_clock_external = of_property_read_bool(pdev->dev.of_node, "rmii-clock-ext"); dev_set_drvdata(&pdev->dev, priv); diff --git a/drivers/net/ethernet/ti/netcp_ethss.c b/drivers/net/ethernet/ti/netcp_ethss.c index 751fb0bc65c5..2adf82a32bf6 100644 --- a/drivers/net/ethernet/ti/netcp_ethss.c +++ b/drivers/net/ethernet/ti/netcp_ethss.c @@ -3583,13 +3583,11 @@ static int gbe_probe(struct netcp_device *netcp_device, struct device *dev, /* init the hw stats lock */ spin_lock_init(&gbe_dev->hw_stats_lock); - if (of_find_property(node, "enable-ale", NULL)) { - gbe_dev->enable_ale = true; + gbe_dev->enable_ale = of_property_read_bool(node, "enable-ale"); + if (gbe_dev->enable_ale) dev_info(dev, "ALE enabled\n"); - } else { - gbe_dev->enable_ale = false; + else dev_dbg(dev, "ALE bypass enabled*\n"); - } ret = of_property_read_u32(node, "tx-queue", &gbe_dev->tx_queue_id); diff --git a/drivers/net/ethernet/via/via-velocity.c b/drivers/net/ethernet/via/via-velocity.c index a502812ac418..86f7843b4591 100644 --- a/drivers/net/ethernet/via/via-velocity.c +++ b/drivers/net/ethernet/via/via-velocity.c @@ -2709,8 +2709,7 @@ static int velocity_get_platform_info(struct velocity_info *vptr) struct resource res; int ret; - if (of_get_property(vptr->dev->of_node, "no-eeprom", NULL)) - vptr->no_eeprom = 1; + vptr->no_eeprom = of_property_read_bool(vptr->dev->of_node, "no-eeprom"); ret = of_address_to_resource(vptr->dev->of_node, 0, &res); if (ret) { diff --git a/drivers/net/ethernet/via/via-velocity.h b/drivers/net/ethernet/via/via-velocity.h index ffdac6fac054..f64ed39b93d8 100644 --- a/drivers/net/ethernet/via/via-velocity.h +++ b/drivers/net/ethernet/via/via-velocity.h @@ -1383,7 +1383,7 @@ struct velocity_info { struct device *dev; struct pci_dev *pdev; struct net_device *netdev; - int no_eeprom; + bool no_eeprom; unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)]; u8 ip_addr[4]; diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c index 1066420d6a83..e0ac1bcd9925 100644 --- a/drivers/net/ethernet/xilinx/ll_temac_main.c +++ b/drivers/net/ethernet/xilinx/ll_temac_main.c @@ -1455,12 +1455,11 @@ static int temac_probe(struct platform_device *pdev) * endianness mode. Default for OF devices is big-endian. */ little_endian = false; - if (temac_np) { - if (of_get_property(temac_np, "little-endian", NULL)) - little_endian = true; - } else if (pdata) { + if (temac_np) + little_endian = of_property_read_bool(temac_np, "little-endian"); + else if (pdata) little_endian = pdata->reg_little_endian; - } + if (little_endian) { lp->temac_ior = _temac_ior_le; lp->temac_iow = _temac_iow_le; diff --git a/drivers/net/wan/fsl_ucc_hdlc.c b/drivers/net/wan/fsl_ucc_hdlc.c index 1c53b5546927..47c2ad7a3e42 100644 --- a/drivers/net/wan/fsl_ucc_hdlc.c +++ b/drivers/net/wan/fsl_ucc_hdlc.c @@ -1177,14 +1177,9 @@ static int ucc_hdlc_probe(struct platform_device *pdev) uhdlc_priv->dev = &pdev->dev; uhdlc_priv->ut_info = ut_info; - if (of_get_property(np, "fsl,tdm-interface", NULL)) - uhdlc_priv->tsa = 1; - - if (of_get_property(np, "fsl,ucc-internal-loopback", NULL)) - uhdlc_priv->loopback = 1; - - if (of_get_property(np, "fsl,hdlc-bus", NULL)) - uhdlc_priv->hdlc_bus = 1; + uhdlc_priv->tsa = of_property_read_bool(np, "fsl,tdm-interface"); + uhdlc_priv->loopback = of_property_read_bool(np, "fsl,ucc-internal-loopback"); + uhdlc_priv->hdlc_bus = of_property_read_bool(np, "fsl,hdlc-bus"); if (uhdlc_priv->tsa == 1) { utdm = kzalloc(sizeof(*utdm), GFP_KERNEL); diff --git a/drivers/net/wireless/ti/wlcore/spi.c b/drivers/net/wireless/ti/wlcore/spi.c index 2d2edddc77bd..3f88e6a0a510 100644 --- a/drivers/net/wireless/ti/wlcore/spi.c +++ b/drivers/net/wireless/ti/wlcore/spi.c @@ -447,8 +447,7 @@ static int wlcore_probe_of(struct spi_device *spi, struct wl12xx_spi_glue *glue, dev_info(&spi->dev, "selected chip family is %s\n", pdev_data->family->name); - if (of_find_property(dt_node, "clock-xtal", NULL)) - pdev_data->ref_clock_xtal = true; + pdev_data->ref_clock_xtal = of_property_read_bool(dt_node, "clock-xtal"); /* optional clock frequency params */ of_property_read_u32(dt_node, "ref-clock-frequency", diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 80713febfac6..d9da942ad53d 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -1803,8 +1803,8 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev, pdev = to_platform_device(dev->dev.parent); if (pdev) { np = pdev->dev.of_node; - if (np && (of_get_property(np, "mellanox,multi-host", NULL) || - of_get_property(np, "mlx,multi-host", NULL))) + if (np && (of_property_read_bool(np, "mellanox,multi-host") || + of_property_read_bool(np, "mlx,multi-host"))) ndp->mlx_multi_host = true; } From fa0f1ba7c8233118b6fdaa65e2f5ded563d3e1fa Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Wed, 15 Mar 2023 09:52:22 +0800 Subject: [PATCH 080/108] virtio_net: fix page_to_skb() miss headroom Because headroom is not passed to page_to_skb(), this causes the shinfo exceeds the range. Then the frags of shinfo are changed by other process. [ 157.724634] stack segment: 0000 [#1] PREEMPT SMP NOPTI [ 157.725358] CPU: 3 PID: 679 Comm: xdp_pass_user_f Tainted: G E 6.2.0+ #150 [ 157.726401] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/4 [ 157.727820] RIP: 0010:skb_release_data+0x11b/0x180 [ 157.728449] Code: 44 24 02 48 83 c3 01 39 d8 7e be 48 89 d8 48 c1 e0 04 41 80 7d 7e 00 49 8b 6c 04 30 79 0c 48 89 ef e8 89 b [ 157.730751] RSP: 0018:ffffc90000178b48 EFLAGS: 00010202 [ 157.731383] RAX: 0000000000000010 RBX: 0000000000000001 RCX: 0000000000000000 [ 157.732270] RDX: 0000000000000000 RSI: 0000000000000002 RDI: ffff888100dd0b00 [ 157.733117] RBP: 5d5d76010f6e2408 R08: ffff888100dd0b2c R09: 0000000000000000 [ 157.734013] R10: ffffffff82effd30 R11: 000000000000a14e R12: ffff88810981ffc0 [ 157.734904] R13: ffff888100dd0b00 R14: 0000000000000002 R15: 0000000000002310 [ 157.735793] FS: 00007f06121d9740(0000) GS:ffff88842fcc0000(0000) knlGS:0000000000000000 [ 157.736794] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 157.737522] CR2: 00007ffd9a56c084 CR3: 0000000104bda001 CR4: 0000000000770ee0 [ 157.738420] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 157.739283] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 157.740146] PKRU: 55555554 [ 157.740502] Call Trace: [ 157.740843] [ 157.741117] kfree_skb_reason+0x50/0x120 [ 157.741613] __udp4_lib_rcv+0x52b/0x5e0 [ 157.742132] ip_protocol_deliver_rcu+0xaf/0x190 [ 157.742715] ip_local_deliver_finish+0x77/0xa0 [ 157.743280] ip_sublist_rcv_finish+0x80/0x90 [ 157.743834] ip_list_rcv_finish.constprop.0+0x16f/0x190 [ 157.744493] ip_list_rcv+0x126/0x140 [ 157.744952] __netif_receive_skb_list_core+0x29b/0x2c0 [ 157.745602] __netif_receive_skb_list+0xed/0x160 [ 157.746190] ? udp4_gro_receive+0x275/0x350 [ 157.746732] netif_receive_skb_list_internal+0xf2/0x1b0 [ 157.747398] napi_gro_receive+0xd1/0x210 [ 157.747911] virtnet_receive+0x75/0x1c0 [ 157.748422] virtnet_poll+0x48/0x1b0 [ 157.748878] __napi_poll+0x29/0x1b0 [ 157.749330] net_rx_action+0x27a/0x340 [ 157.749812] __do_softirq+0xf3/0x2fb [ 157.750298] do_softirq+0xa2/0xd0 [ 157.750745] [ 157.751563] [ 157.752329] __local_bh_enable_ip+0x6d/0x80 [ 157.753178] virtnet_xdp_set+0x482/0x860 [ 157.754159] ? __pfx_virtnet_xdp+0x10/0x10 [ 157.755129] dev_xdp_install+0xa4/0xe0 [ 157.756033] dev_xdp_attach+0x20b/0x5e0 [ 157.756933] do_setlink+0x82e/0xc90 [ 157.757777] ? __nla_validate_parse+0x12b/0x1e0 [ 157.758744] rtnl_setlink+0xd8/0x170 [ 157.759549] ? mod_objcg_state+0xcb/0x320 [ 157.760328] ? security_capable+0x37/0x60 [ 157.761209] ? security_capable+0x37/0x60 [ 157.762072] rtnetlink_rcv_msg+0x145/0x3d0 [ 157.762929] ? ___slab_alloc+0x327/0x610 [ 157.763754] ? __alloc_skb+0x141/0x170 [ 157.764533] ? __pfx_rtnetlink_rcv_msg+0x10/0x10 [ 157.765422] netlink_rcv_skb+0x58/0x110 [ 157.766229] netlink_unicast+0x21f/0x330 [ 157.766951] netlink_sendmsg+0x240/0x4a0 [ 157.767654] sock_sendmsg+0x93/0xa0 [ 157.768434] ? sockfd_lookup_light+0x12/0x70 [ 157.769245] __sys_sendto+0xfe/0x170 [ 157.770079] ? handle_mm_fault+0xe9/0x2d0 [ 157.770859] ? preempt_count_add+0x51/0xa0 [ 157.771645] ? up_read+0x3c/0x80 [ 157.772340] ? do_user_addr_fault+0x1e9/0x710 [ 157.773166] ? kvm_read_and_reset_apf_flags+0x49/0x60 [ 157.774087] __x64_sys_sendto+0x29/0x30 [ 157.774856] do_syscall_64+0x3c/0x90 [ 157.775518] entry_SYSCALL_64_after_hwframe+0x72/0xdc [ 157.776382] RIP: 0033:0x7f06122def70 Fixes: 18117a842ab0 ("virtio-net: remove xdp related info from page_to_skb()") Signed-off-by: Xuan Zhuo Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 1a309cfb4976..8ecf7a341d54 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -446,7 +446,8 @@ static unsigned int mergeable_ctx_to_truesize(void *mrg_ctx) static struct sk_buff *page_to_skb(struct virtnet_info *vi, struct receive_queue *rq, struct page *page, unsigned int offset, - unsigned int len, unsigned int truesize) + unsigned int len, unsigned int truesize, + unsigned int headroom) { struct sk_buff *skb; struct virtio_net_hdr_mrg_rxbuf *hdr; @@ -464,11 +465,11 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, else hdr_padded_len = sizeof(struct padded_vnet_hdr); - buf = p; + buf = p - headroom; len -= hdr_len; offset += hdr_padded_len; p += hdr_padded_len; - tailroom = truesize - hdr_padded_len - len; + tailroom = truesize - headroom - hdr_padded_len - len; shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); @@ -1009,7 +1010,7 @@ static struct sk_buff *receive_big(struct net_device *dev, { struct page *page = buf; struct sk_buff *skb = - page_to_skb(vi, rq, page, 0, len, PAGE_SIZE); + page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, 0); stats->bytes += len - vi->hdr_len; if (unlikely(!skb)) @@ -1332,7 +1333,7 @@ err_xdp_frags: rcu_read_unlock(); skip_xdp: - head_skb = page_to_skb(vi, rq, page, offset, len, truesize); + head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom); curr_skb = head_skb; if (unlikely(!curr_skb)) From 1a3bd6eabae35afc5c6dbe2651f21467cf8ad3fd Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Wed, 15 Mar 2023 09:52:23 +0800 Subject: [PATCH 081/108] virtio_net: free xdp shinfo frags when build_skb_from_xdp_buff() fails build_skb_from_xdp_buff() may return NULL, in this case we need to free the frags of xdp shinfo. Fixes: fab89bafa95b ("virtio-net: support multi-buffer xdp") Signed-off-by: Xuan Zhuo Acked-by: Michael S. Tsirkin Reviewed-by: Yunsheng Lin Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 8ecf7a341d54..2396c28c0122 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1273,9 +1273,12 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, switch (act) { case XDP_PASS: + head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz); + if (unlikely(!head_skb)) + goto err_xdp_frags; + if (unlikely(xdp_page != page)) put_page(page); - head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz); rcu_read_unlock(); return head_skb; case XDP_TX: From 7f5ebf5dae42e710162f1c481ebcf28ab7b741c7 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 15 Mar 2023 08:41:14 +0100 Subject: [PATCH 082/108] ravb: avoid PHY being resumed when interface is not up RAVB doesn't need mdiobus suspend/resume, that's why it sets 'mac_managed_pm'. However, setting it needs to be moved from init to probe, so mdiobus PM functions will really never be called (e.g. when the interface is not up yet during suspend/resume). Fixes: 4924c0cdce75 ("net: ravb: Fix PHY state warning splat during system resume") Suggested-by: Heiner Kallweit Signed-off-by: Wolfram Sang Reviewed-by: Michal Kubiak Reviewed-by: Sergey Shtylyov Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/renesas/ravb_main.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 0f54849a3823..894e2690c643 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -1455,8 +1455,6 @@ static int ravb_phy_init(struct net_device *ndev) phy_remove_link_mode(phydev, ETHTOOL_LINK_MODE_100baseT_Half_BIT); } - /* Indicate that the MAC is responsible for managing PHY PM */ - phydev->mac_managed_pm = true; phy_attached_info(phydev); return 0; @@ -2379,6 +2377,8 @@ static int ravb_mdio_init(struct ravb_private *priv) { struct platform_device *pdev = priv->pdev; struct device *dev = &pdev->dev; + struct phy_device *phydev; + struct device_node *pn; int error; /* Bitbang init */ @@ -2400,6 +2400,14 @@ static int ravb_mdio_init(struct ravb_private *priv) if (error) goto out_free_bus; + pn = of_parse_phandle(dev->of_node, "phy-handle", 0); + phydev = of_phy_find_device(pn); + if (phydev) { + phydev->mac_managed_pm = true; + put_device(&phydev->mdio.dev); + } + of_node_put(pn); + return 0; out_free_bus: From c6be7136afb224a01d4cde2983ddebac8da98693 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 15 Mar 2023 08:41:15 +0100 Subject: [PATCH 083/108] sh_eth: avoid PHY being resumed when interface is not up SH_ETH doesn't need mdiobus suspend/resume, that's why it sets 'mac_managed_pm'. However, setting it needs to be moved from init to probe, so mdiobus PM functions will really never be called (e.g. when the interface is not up yet during suspend/resume). Fixes: 6a1dbfefdae4 ("net: sh_eth: Fix PHY state warning splat during system resume") Suggested-by: Heiner Kallweit Signed-off-by: Wolfram Sang Reviewed-by: Michal Kubiak Reviewed-by: Sergey Shtylyov Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/renesas/sh_eth.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index ed17163d7811..d8ec729825be 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -2029,8 +2029,6 @@ static int sh_eth_phy_init(struct net_device *ndev) if (mdp->cd->register_type != SH_ETH_REG_GIGABIT) phy_set_max_speed(phydev, SPEED_100); - /* Indicate that the MAC is responsible for managing PHY PM */ - phydev->mac_managed_pm = true; phy_attached_info(phydev); return 0; @@ -3097,6 +3095,8 @@ static int sh_mdio_init(struct sh_eth_private *mdp, struct bb_info *bitbang; struct platform_device *pdev = mdp->pdev; struct device *dev = &mdp->pdev->dev; + struct phy_device *phydev; + struct device_node *pn; /* create bit control struct for PHY */ bitbang = devm_kzalloc(dev, sizeof(struct bb_info), GFP_KERNEL); @@ -3133,6 +3133,14 @@ static int sh_mdio_init(struct sh_eth_private *mdp, if (ret) goto out_free_bus; + pn = of_parse_phandle(dev->of_node, "phy-handle", 0); + phydev = of_phy_find_device(pn); + if (phydev) { + phydev->mac_managed_pm = true; + put_device(&phydev->mdio.dev); + } + of_node_put(pn); + return 0; out_free_bus: From 8a2618e14f81604a9b6ad305d57e0c8da939cd65 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 15 Mar 2023 14:40:09 +0200 Subject: [PATCH 084/108] ipv4: Fix incorrect table ID in IOCTL path Commit f96a3d74554d ("ipv4: Fix incorrect route flushing when source address is deleted") started to take the table ID field in the FIB info structure into account when determining if two structures are identical or not. This field is initialized using the 'fc_table' field in the route configuration structure, which is not set when adding a route via IOCTL. The above can result in user space being able to install two identical routes that only differ in the table ID field of their associated FIB info. Fix by initializing the table ID field in the route configuration structure in the IOCTL path. Before the fix: # ip route add default via 192.0.2.2 # route add default gw 192.0.2.2 # ip -4 r show default # default via 192.0.2.2 dev dummy10 # default via 192.0.2.2 dev dummy10 After the fix: # ip route add default via 192.0.2.2 # route add default gw 192.0.2.2 SIOCADDRT: File exists # ip -4 r show default default via 192.0.2.2 dev dummy10 Audited the code paths to ensure there are no other paths that do not properly initialize the route configuration structure when installing a route. Fixes: 5a56a0b3a45d ("net: Don't delete routes in different VRFs") Fixes: f96a3d74554d ("ipv4: Fix incorrect route flushing when source address is deleted") Reported-by: gaoxingwang Link: https://lore.kernel.org/netdev/20230314144159.2354729-1-gaoxingwang1@huawei.com/ Tested-by: gaoxingwang Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20230315124009.4015212-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- net/ipv4/fib_frontend.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index b5736ef16ed2..390f4be7f7be 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -576,6 +576,9 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, cfg->fc_scope = RT_SCOPE_UNIVERSE; } + if (!cfg->fc_table) + cfg->fc_table = RT_TABLE_MAIN; + if (cmd == SIOCDELRT) return 0; From 43ffe6caccc7a1bb9d7442fbab521efbf6c1378c Mon Sep 17 00:00:00 2001 From: Szymon Heidrich Date: Thu, 16 Mar 2023 12:05:40 +0100 Subject: [PATCH 085/108] net: usb: smsc75xx: Move packet length check to prevent kernel panic in skb_pull Packet length check needs to be located after size and align_count calculation to prevent kernel panic in skb_pull() in case rx_cmd_a & RX_CMD_A_RED evaluates to true. Fixes: d8b228318935 ("net: usb: smsc75xx: Limit packet length to skb->len") Signed-off-by: Szymon Heidrich Link: https://lore.kernel.org/r/20230316110540.77531-1-szymon.heidrich@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/smsc75xx.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/smsc75xx.c b/drivers/net/usb/smsc75xx.c index db34f8d1d605..5d6454fedb3f 100644 --- a/drivers/net/usb/smsc75xx.c +++ b/drivers/net/usb/smsc75xx.c @@ -2200,6 +2200,13 @@ static int smsc75xx_rx_fixup(struct usbnet *dev, struct sk_buff *skb) size = (rx_cmd_a & RX_CMD_A_LEN) - RXW_PADDING; align_count = (4 - ((size + RXW_PADDING) % 4)) % 4; + if (unlikely(size > skb->len)) { + netif_dbg(dev, rx_err, dev->net, + "size err rx_cmd_a=0x%08x\n", + rx_cmd_a); + return 0; + } + if (unlikely(rx_cmd_a & RX_CMD_A_RED)) { netif_dbg(dev, rx_err, dev->net, "Error rx_cmd_a=0x%08x\n", rx_cmd_a); @@ -2212,8 +2219,7 @@ static int smsc75xx_rx_fixup(struct usbnet *dev, struct sk_buff *skb) dev->net->stats.rx_frame_errors++; } else { /* MAX_SINGLE_PACKET_SIZE + 4(CRC) + 2(COE) + 4(Vlan) */ - if (unlikely(size > (MAX_SINGLE_PACKET_SIZE + ETH_HLEN + 12) || - size > skb->len)) { + if (unlikely(size > (MAX_SINGLE_PACKET_SIZE + ETH_HLEN + 12))) { netif_dbg(dev, rx_err, dev->net, "size err rx_cmd_a=0x%08x\n", rx_cmd_a); From 37d010399f7552add2b68e2b347901c83562dab8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Wed, 15 Mar 2023 13:55:38 +0100 Subject: [PATCH 086/108] net: atlantic: Fix crash when XDP is enabled but no program is loaded MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The aq_xdp_run_prog() function falls back to the XDP_ABORTED action handler (using a goto) if the operations for any of the other actions fail. The XDP_ABORTED handler in turn calls the bpf_warn_invalid_xdp_action() tracepoint. However, the function also jumps into the XDP_PASS helper if no XDP program is loaded on the device, which means the XDP_ABORTED handler can be run with a NULL program pointer. This results in a NULL pointer deref because the tracepoint dereferences the 'prog' pointer passed to it. This situation can happen in multiple ways: - If a packet arrives between the removal of the program from the interface and the static_branch_dec() in aq_xdp_setup() - If there are multiple devices using the same driver in the system and one of them has an XDP program loaded and the other does not. Fix this by refactoring the aq_xdp_run_prog() function to remove the 'goto pass' handling if there is no XDP program loaded. Instead, factor out the skb building in a separate small helper function. Fixes: 26efaef759a1 ("net: atlantic: Implement xdp data plane") Reported-by: Freysteinn Alfredsson Tested-by: Freysteinn Alfredsson Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20230315125539.103319-1-toke@redhat.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/aquantia/atlantic/aq_ring.c | 28 ++++++++++++++----- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c index 1e8d902e1c8e..7f933175cbda 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c @@ -412,6 +412,25 @@ int aq_xdp_xmit(struct net_device *dev, int num_frames, return num_frames - drop; } +static struct sk_buff *aq_xdp_build_skb(struct xdp_buff *xdp, + struct net_device *dev, + struct aq_ring_buff_s *buff) +{ + struct xdp_frame *xdpf; + struct sk_buff *skb; + + xdpf = xdp_convert_buff_to_frame(xdp); + if (unlikely(!xdpf)) + return NULL; + + skb = xdp_build_skb_from_frame(xdpf, dev); + if (!skb) + return NULL; + + aq_get_rxpages_xdp(buff, xdp); + return skb; +} + static struct sk_buff *aq_xdp_run_prog(struct aq_nic_s *aq_nic, struct xdp_buff *xdp, struct aq_ring_s *rx_ring, @@ -431,7 +450,7 @@ static struct sk_buff *aq_xdp_run_prog(struct aq_nic_s *aq_nic, prog = READ_ONCE(rx_ring->xdp_prog); if (!prog) - goto pass; + return aq_xdp_build_skb(xdp, aq_nic->ndev, buff); prefetchw(xdp->data_hard_start); /* xdp_frame write */ @@ -442,17 +461,12 @@ static struct sk_buff *aq_xdp_run_prog(struct aq_nic_s *aq_nic, act = bpf_prog_run_xdp(prog, xdp); switch (act) { case XDP_PASS: -pass: - xdpf = xdp_convert_buff_to_frame(xdp); - if (unlikely(!xdpf)) - goto out_aborted; - skb = xdp_build_skb_from_frame(xdpf, aq_nic->ndev); + skb = aq_xdp_build_skb(xdp, aq_nic->ndev, buff); if (!skb) goto out_aborted; u64_stats_update_begin(&rx_ring->stats.rx.syncp); ++rx_ring->stats.rx.xdp_pass; u64_stats_update_end(&rx_ring->stats.rx.syncp); - aq_get_rxpages_xdp(buff, xdp); return skb; case XDP_TX: xdpf = xdp_convert_buff_to_frame(xdp); From 3d87debb8ed2649608ff432699e7c961c0c6f03b Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Wed, 15 Mar 2023 14:14:35 +0100 Subject: [PATCH 087/108] net/iucv: Fix size of interrupt data iucv_irq_data needs to be 4 bytes larger. These bytes are not used by the iucv module, but written by the z/VM hypervisor in case a CPU is deconfigured. Reported as: BUG dma-kmalloc-64 (Not tainted): kmalloc Redzone overwritten ----------------------------------------------------------------------------- 0x0000000000400564-0x0000000000400567 @offset=1380. First byte 0x80 instead of 0xcc Allocated in iucv_cpu_prepare+0x44/0xd0 age=167839 cpu=2 pid=1 __kmem_cache_alloc_node+0x166/0x450 kmalloc_node_trace+0x3a/0x70 iucv_cpu_prepare+0x44/0xd0 cpuhp_invoke_callback+0x156/0x2f0 cpuhp_issue_call+0xf0/0x298 __cpuhp_setup_state_cpuslocked+0x136/0x338 __cpuhp_setup_state+0xf4/0x288 iucv_init+0xf4/0x280 do_one_initcall+0x78/0x390 do_initcalls+0x11a/0x140 kernel_init_freeable+0x25e/0x2a0 kernel_init+0x2e/0x170 __ret_from_fork+0x3c/0x58 ret_from_fork+0xa/0x40 Freed in iucv_init+0x92/0x280 age=167839 cpu=2 pid=1 __kmem_cache_free+0x308/0x358 iucv_init+0x92/0x280 do_one_initcall+0x78/0x390 do_initcalls+0x11a/0x140 kernel_init_freeable+0x25e/0x2a0 kernel_init+0x2e/0x170 __ret_from_fork+0x3c/0x58 ret_from_fork+0xa/0x40 Slab 0x0000037200010000 objects=32 used=30 fp=0x0000000000400640 flags=0x1ffff00000010200(slab|head|node=0|zone=0| Object 0x0000000000400540 @offset=1344 fp=0x0000000000000000 Redzone 0000000000400500: cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc ................ Redzone 0000000000400510: cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc ................ Redzone 0000000000400520: cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc ................ Redzone 0000000000400530: cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc ................ Object 0000000000400540: 00 01 00 03 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object 0000000000400550: f3 86 81 f2 f4 82 f8 82 f0 f0 f0 f0 f0 f0 f0 f2 ................ Object 0000000000400560: 00 00 00 00 80 00 00 00 cc cc cc cc cc cc cc cc ................ Object 0000000000400570: cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc ................ Redzone 0000000000400580: cc cc cc cc cc cc cc cc ........ Padding 00000000004005d4: 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZZZZZZZZZ Padding 00000000004005e4: 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZZZZZZZZZ Padding 00000000004005f4: 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZZZZZ CPU: 6 PID: 121030 Comm: 116-pai-crypto. Not tainted 6.3.0-20230221.rc0.git4.99b8246b2d71.300.fc37.s390x+debug #1 Hardware name: IBM 3931 A01 704 (z/VM 7.3.0) Call Trace: [<000000032aa034ec>] dump_stack_lvl+0xac/0x100 [<0000000329f5a6cc>] check_bytes_and_report+0x104/0x140 [<0000000329f5aa78>] check_object+0x370/0x3c0 [<0000000329f5ede6>] free_debug_processing+0x15e/0x348 [<0000000329f5f06a>] free_to_partial_list+0x9a/0x2f0 [<0000000329f5f4a4>] __slab_free+0x1e4/0x3a8 [<0000000329f61768>] __kmem_cache_free+0x308/0x358 [<000000032a91465c>] iucv_cpu_dead+0x6c/0x88 [<0000000329c2fc66>] cpuhp_invoke_callback+0x156/0x2f0 [<000000032aa062da>] _cpu_down.constprop.0+0x22a/0x5e0 [<0000000329c3243e>] cpu_device_down+0x4e/0x78 [<000000032a61dee0>] device_offline+0xc8/0x118 [<000000032a61e048>] online_store+0x60/0xe0 [<000000032a08b6b0>] kernfs_fop_write_iter+0x150/0x1e8 [<0000000329fab65c>] vfs_write+0x174/0x360 [<0000000329fab9fc>] ksys_write+0x74/0x100 [<000000032aa03a5a>] __do_syscall+0x1da/0x208 [<000000032aa177b2>] system_call+0x82/0xb0 INFO: lockdep is turned off. FIX dma-kmalloc-64: Restoring kmalloc Redzone 0x0000000000400564-0x0000000000400567=0xcc FIX dma-kmalloc-64: Object at 0x0000000000400540 not freed Fixes: 2356f4cb1911 ("[S390]: Rewrite of the IUCV base code, part 2") Signed-off-by: Alexandra Winter Link: https://lore.kernel.org/r/20230315131435.4113889-1-wintera@linux.ibm.com Signed-off-by: Jakub Kicinski --- net/iucv/iucv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index eb0295d90039..fc3fddeb6f36 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -83,7 +83,7 @@ struct iucv_irq_data { u16 ippathid; u8 ipflags1; u8 iptype; - u32 res2[8]; + u32 res2[9]; }; struct iucv_irq_list { From f38373345c65529639a01fba3675eb8cb4c579c3 Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Wed, 15 Mar 2023 14:41:17 +0100 Subject: [PATCH 088/108] i825xx: sni_82596: use eth_hw_addr_set() netdev->dev_addr is now const, we can't write to it directly. Copy scrambled mac address octects into an array then eth_hw_addr_set(). Fixes: adeef3e32146 ("net: constify netdev->dev_addr") Signed-off-by: Thomas Bogendoerfer Reviewed-by: Michal Kubiak Link: https://lore.kernel.org/r/20230315134117.79511-1-tsbogend@alpha.franken.de Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/i825xx/sni_82596.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/i825xx/sni_82596.c b/drivers/net/ethernet/i825xx/sni_82596.c index daec9ce04531..54bb4d9a0d1e 100644 --- a/drivers/net/ethernet/i825xx/sni_82596.c +++ b/drivers/net/ethernet/i825xx/sni_82596.c @@ -78,6 +78,7 @@ static int sni_82596_probe(struct platform_device *dev) void __iomem *mpu_addr; void __iomem *ca_addr; u8 __iomem *eth_addr; + u8 mac[ETH_ALEN]; res = platform_get_resource(dev, IORESOURCE_MEM, 0); ca = platform_get_resource(dev, IORESOURCE_MEM, 1); @@ -109,12 +110,13 @@ static int sni_82596_probe(struct platform_device *dev) goto probe_failed; /* someone seems to like messed up stuff */ - netdevice->dev_addr[0] = readb(eth_addr + 0x0b); - netdevice->dev_addr[1] = readb(eth_addr + 0x0a); - netdevice->dev_addr[2] = readb(eth_addr + 0x09); - netdevice->dev_addr[3] = readb(eth_addr + 0x08); - netdevice->dev_addr[4] = readb(eth_addr + 0x07); - netdevice->dev_addr[5] = readb(eth_addr + 0x06); + mac[0] = readb(eth_addr + 0x0b); + mac[1] = readb(eth_addr + 0x0a); + mac[2] = readb(eth_addr + 0x09); + mac[3] = readb(eth_addr + 0x08); + mac[4] = readb(eth_addr + 0x07); + mac[5] = readb(eth_addr + 0x06); + eth_hw_addr_set(netdevice, mac); iounmap(eth_addr); if (netdevice->irq < 0) { From 24994513ad13ff2c47ba91d2b5df82c3d496c370 Mon Sep 17 00:00:00 2001 From: Po-Hsu Lin Date: Thu, 16 Mar 2023 00:53:53 +0800 Subject: [PATCH 089/108] selftests: net: devlink_port_split.py: skip test if no suitable device available The `devlink -j port show` command output may not contain the "flavour" key, an example from Ubuntu 22.10 s390x LPAR(5.19.0-37-generic), with mlx4 driver and iproute2-5.15.0: {"port":{"pci/0001:00:00.0/1":{"type":"eth","netdev":"ens301"}, "pci/0001:00:00.0/2":{"type":"eth","netdev":"ens301d1"}, "pci/0002:00:00.0/1":{"type":"eth","netdev":"ens317"}, "pci/0002:00:00.0/2":{"type":"eth","netdev":"ens317d1"}}} This will cause a KeyError exception. Create a validate_devlink_output() to check for this "flavour" from devlink command output to avoid this KeyError exception. Also let it handle the check for `devlink -j dev show` output in main(). Apart from this, if the test was not started because the max lanes of the designated device is 0. The script will still return 0 and thus causing a false-negative test result. Use a found_max_lanes flag to determine if these tests were skipped due to this reason and return KSFT_SKIP to make it more clear. Link: https://bugs.launchpad.net/bugs/1937133 Fixes: f3348a82e727 ("selftests: net: Add port split test") Signed-off-by: Po-Hsu Lin Link: https://lore.kernel.org/r/20230315165353.229590-1-po-hsu.lin@canonical.com Signed-off-by: Jakub Kicinski --- .../selftests/net/devlink_port_split.py | 36 ++++++++++++++++--- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/net/devlink_port_split.py b/tools/testing/selftests/net/devlink_port_split.py index 2b5d6ff87373..2d84c7a0be6b 100755 --- a/tools/testing/selftests/net/devlink_port_split.py +++ b/tools/testing/selftests/net/devlink_port_split.py @@ -59,6 +59,8 @@ class devlink_ports(object): assert stderr == "" ports = json.loads(stdout)['port'] + validate_devlink_output(ports, 'flavour') + for port in ports: if dev in port: if ports[port]['flavour'] == 'physical': @@ -220,6 +222,27 @@ def split_splittable_port(port, k, lanes, dev): unsplit(port.bus_info) +def validate_devlink_output(devlink_data, target_property=None): + """ + Determine if test should be skipped by checking: + 1. devlink_data contains values + 2. The target_property exist in devlink_data + """ + skip_reason = None + if any(devlink_data.values()): + if target_property: + skip_reason = "{} not found in devlink output, test skipped".format(target_property) + for key in devlink_data: + if target_property in devlink_data[key]: + skip_reason = None + else: + skip_reason = 'devlink output is empty, test skipped' + + if skip_reason: + print(skip_reason) + sys.exit(KSFT_SKIP) + + def make_parser(): parser = argparse.ArgumentParser(description='A test for port splitting.') parser.add_argument('--dev', @@ -240,12 +263,9 @@ def main(cmdline=None): stdout, stderr = run_command(cmd) assert stderr == "" + validate_devlink_output(json.loads(stdout)) devs = json.loads(stdout)['dev'] - if devs: - dev = list(devs.keys())[0] - else: - print("no devlink device was found, test skipped") - sys.exit(KSFT_SKIP) + dev = list(devs.keys())[0] cmd = "devlink dev show %s" % dev stdout, stderr = run_command(cmd) @@ -255,6 +275,7 @@ def main(cmdline=None): ports = devlink_ports(dev) + found_max_lanes = False for port in ports.if_names: max_lanes = get_max_lanes(port.name) @@ -277,6 +298,11 @@ def main(cmdline=None): split_splittable_port(port, lane, max_lanes, dev) lane //= 2 + found_max_lanes = True + + if not found_max_lanes: + print(f"Test not started, no port of device {dev} reports max_lanes") + sys.exit(KSFT_SKIP) if __name__ == "__main__": From 470efd68a4653d9819d391489886432cd31bcd0b Mon Sep 17 00:00:00 2001 From: Daniil Tatianin Date: Wed, 15 Mar 2023 22:46:18 +0300 Subject: [PATCH 090/108] qed/qed_mng_tlv: correctly zero out ->min instead of ->hour This fixes an issue where ->hour would erroneously get zeroed out instead of ->min because of a bad copy paste. Found by Linux Verification Center (linuxtesting.org) with the SVACE static analysis tool. Fixes: f240b6882211 ("qed: Add support for processing fcoe tlv request.") Signed-off-by: Daniil Tatianin Link: https://lore.kernel.org/r/20230315194618.579286-1-d-tatianin@yandex-team.ru Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c b/drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c index 6190adf965bc..f55eed092f25 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c +++ b/drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c @@ -422,7 +422,7 @@ qed_mfw_get_tlv_time_value(struct qed_mfw_tlv_time *p_time, if (p_time->hour > 23) p_time->hour = 0; if (p_time->min > 59) - p_time->hour = 0; + p_time->min = 0; if (p_time->msec > 999) p_time->msec = 0; if (p_time->usec > 999) From 1b0120e4db0bf2838d1ce741195ce4b7cc100b91 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Wed, 15 Mar 2023 21:25:17 +0100 Subject: [PATCH 091/108] hsr: ratelimit only when errors are printed Recently, when automatically merging -net and net-next in MPTCP devel tree, our CI reported [1] a conflict in hsr, the same as the one reported by Stephen in netdev [2]. When looking at the conflict, I noticed it is in fact the v1 [3] that has been applied in -net and the v2 [4] in net-next. Maybe the v1 was applied by accident. As mentioned by Jakub Kicinski [5], the new condition makes more sense before the net_ratelimit(), not to update net_ratelimit's state which is unnecessary if we're not going to print either way. Here, this modification applies the v2 but in -net. Link: https://github.com/multipath-tcp/mptcp_net-next/actions/runs/4423171069 [1] Link: https://lore.kernel.org/netdev/20230315100914.53fc1760@canb.auug.org.au/ [2] Link: https://lore.kernel.org/netdev/20230307133229.127442-1-koverskeid@gmail.com/ [3] Link: https://lore.kernel.org/netdev/20230309092302.179586-1-koverskeid@gmail.com/ [4] Link: https://lore.kernel.org/netdev/20230308232001.2fb62013@kernel.org/ [5] Fixes: 28e8cabe80f3 ("net: hsr: Don't log netdev_err message on unknown prp dst node") Signed-off-by: Matthieu Baerts Reviewed-by: Steen Hegelund Link: https://lore.kernel.org/r/20230315-net-20230315-hsr_framereg-ratelimit-v1-1-61d2ef176d11@tessares.net Signed-off-by: Jakub Kicinski --- net/hsr/hsr_framereg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 865eda39d601..b77f1189d19d 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -415,7 +415,7 @@ void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb, node_dst = find_node_by_addr_A(&port->hsr->node_db, eth_hdr(skb)->h_dest); if (!node_dst) { - if (net_ratelimit() && port->hsr->prot_version != PRP_V1) + if (port->hsr->prot_version != PRP_V1 && net_ratelimit()) netdev_err(skb->dev, "%s: Unknown node\n", __func__); return; } From 054abb515f346b8f30a0a11953d9f786d3e76813 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 15 Mar 2023 16:03:49 -0700 Subject: [PATCH 092/108] tools: ynl: make definitions optional again definitions are optional, commit in question breaks cli for ethtool. Fixes: 6517a60b0307 ("tools: ynl: move the enum classes to shared code") Reviewed-by: Chuck Lever Signed-off-by: Jakub Kicinski --- tools/net/ynl/lib/nlspec.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/net/ynl/lib/nlspec.py b/tools/net/ynl/lib/nlspec.py index 960a356e8225..e01a72d06638 100644 --- a/tools/net/ynl/lib/nlspec.py +++ b/tools/net/ynl/lib/nlspec.py @@ -387,7 +387,8 @@ class SpecFamily(SpecElement): def resolve(self): self.resolve_up(super()) - for elem in self.yaml['definitions']: + definitions = self.yaml.get('definitions', []) + for elem in definitions: if elem['type'] == 'enum' or elem['type'] == 'flags': self.consts[elem['name']] = self.new_enum(elem) else: From 4e16b6a748df52800c90f3ab181aef8129bd332f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 15 Mar 2023 16:03:50 -0700 Subject: [PATCH 093/108] ynl: broaden the license even more I relicensed Netlink spec code to GPL-2.0 OR BSD-3-Clause but we still put a slightly different license on the uAPI header than the rest of the code. Use the Linux-syscall-note on all the specs and all generated code. It's moot for kernel code, but should not hurt. This way the licenses match everywhere. Cc: Chuck Lever Fixes: 37d9df224d1e ("ynl: re-license uniformly under GPL-2.0 OR BSD-3-Clause") Reviewed-by: Chuck Lever Signed-off-by: Jakub Kicinski --- Documentation/netlink/genetlink-c.yaml | 2 +- Documentation/netlink/genetlink-legacy.yaml | 2 +- Documentation/netlink/genetlink.yaml | 2 +- Documentation/netlink/specs/ethtool.yaml | 2 +- Documentation/netlink/specs/fou.yaml | 2 +- Documentation/netlink/specs/netdev.yaml | 2 +- Documentation/userspace-api/netlink/specs.rst | 3 ++- include/uapi/linux/fou.h | 2 +- include/uapi/linux/netdev.h | 2 +- net/core/netdev-genl-gen.c | 2 +- net/core/netdev-genl-gen.h | 2 +- net/ipv4/fou_nl.c | 2 +- net/ipv4/fou_nl.h | 2 +- tools/include/uapi/linux/netdev.h | 2 +- tools/net/ynl/ynl-gen-c.py | 8 ++++---- 15 files changed, 19 insertions(+), 18 deletions(-) diff --git a/Documentation/netlink/genetlink-c.yaml b/Documentation/netlink/genetlink-c.yaml index f082a5ad7cf1..5c3642b3f802 100644 --- a/Documentation/netlink/genetlink-c.yaml +++ b/Documentation/netlink/genetlink-c.yaml @@ -1,4 +1,4 @@ -# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) %YAML 1.2 --- $id: http://kernel.org/schemas/netlink/genetlink-c.yaml# diff --git a/Documentation/netlink/genetlink-legacy.yaml b/Documentation/netlink/genetlink-legacy.yaml index c6b8c77f7d12..5e98c6d2b9aa 100644 --- a/Documentation/netlink/genetlink-legacy.yaml +++ b/Documentation/netlink/genetlink-legacy.yaml @@ -1,4 +1,4 @@ -# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) %YAML 1.2 --- $id: http://kernel.org/schemas/netlink/genetlink-legacy.yaml# diff --git a/Documentation/netlink/genetlink.yaml b/Documentation/netlink/genetlink.yaml index b2d56ab9e615..d35dcd6f8d82 100644 --- a/Documentation/netlink/genetlink.yaml +++ b/Documentation/netlink/genetlink.yaml @@ -1,4 +1,4 @@ -# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) %YAML 1.2 --- $id: http://kernel.org/schemas/netlink/genetlink-legacy.yaml# diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 18ecb7d90cbe..4727c067e2ba 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -1,4 +1,4 @@ -# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) name: ethtool diff --git a/Documentation/netlink/specs/fou.yaml b/Documentation/netlink/specs/fou.yaml index cff104288723..3e13826a3fdf 100644 --- a/Documentation/netlink/specs/fou.yaml +++ b/Documentation/netlink/specs/fou.yaml @@ -1,4 +1,4 @@ -# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) name: fou diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index 753e5914a8b7..b99e7ffef7a1 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -1,4 +1,4 @@ -# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) name: netdev diff --git a/Documentation/userspace-api/netlink/specs.rst b/Documentation/userspace-api/netlink/specs.rst index 2122e0c4a399..a22442ba1d30 100644 --- a/Documentation/userspace-api/netlink/specs.rst +++ b/Documentation/userspace-api/netlink/specs.rst @@ -24,7 +24,8 @@ YAML specifications can be found under ``Documentation/netlink/specs/`` This document describes details of the schema. See :doc:`intro-specs` for a practical starting guide. -All specs must be licensed under ``GPL-2.0-only OR BSD-3-Clause`` +All specs must be licensed under +``((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)`` to allow for easy adoption in user space code. Compatibility levels diff --git a/include/uapi/linux/fou.h b/include/uapi/linux/fou.h index 5041c3598493..b5cd3e7b3775 100644 --- a/include/uapi/linux/fou.h +++ b/include/uapi/linux/fou.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause */ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/fou.yaml */ /* YNL-GEN uapi header */ diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index ed134fbdfd32..639524b59930 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause */ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/netdev.yaml */ /* YNL-GEN uapi header */ diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index 9e10802587fc..3abab70d66dd 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/netdev.yaml */ /* YNL-GEN kernel source */ diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index 2c5fc7d1e8a7..74d74fc23167 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/netdev.yaml */ /* YNL-GEN kernel header */ diff --git a/net/ipv4/fou_nl.c b/net/ipv4/fou_nl.c index 5c14fe030eda..6c37c4f98cca 100644 --- a/net/ipv4/fou_nl.c +++ b/net/ipv4/fou_nl.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/fou.yaml */ /* YNL-GEN kernel source */ diff --git a/net/ipv4/fou_nl.h b/net/ipv4/fou_nl.h index 58b1e1ed4b3b..dbd0780a5d34 100644 --- a/net/ipv4/fou_nl.h +++ b/net/ipv4/fou_nl.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/fou.yaml */ /* YNL-GEN kernel header */ diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index ed134fbdfd32..639524b59930 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause */ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/netdev.yaml */ /* YNL-GEN uapi header */ diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py index d47376f19de7..3b4d03a50fc1 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/ynl-gen-c.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) import argparse import collections @@ -2068,12 +2068,12 @@ def main(): _, spec_kernel = find_kernel_root(args.spec) if args.mode == 'uapi': - cw.p('/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause */') + cw.p('/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */') else: if args.header: - cw.p('/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */') + cw.p('/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */') else: - cw.p('// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause') + cw.p('// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)') cw.p("/* Do not edit directly, auto-generated from: */") cw.p(f"/*\t{spec_kernel} */") cw.p(f"/* YNL-GEN {args.mode} {'header' if args.header else 'source'} */") From cfab77c0b54583816faac5f9abdf588c13895a9d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 15 Mar 2023 16:03:51 -0700 Subject: [PATCH 094/108] ynl: make the tooling check the license The (only recently documented) expectation is that all specs are under a certain license, but we don't actually enforce it. What's worse we then go ahead and assume the license was right, outputting the expected license into generated files. Fixes: 37d9df224d1e ("ynl: re-license uniformly under GPL-2.0 OR BSD-3-Clause") Reviewed-by: Chuck Lever Signed-off-by: Jakub Kicinski --- tools/net/ynl/lib/nlspec.py | 8 ++++++++ tools/net/ynl/ynl-gen-c.py | 13 +++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/tools/net/ynl/lib/nlspec.py b/tools/net/ynl/lib/nlspec.py index e01a72d06638..d04450c2a44a 100644 --- a/tools/net/ynl/lib/nlspec.py +++ b/tools/net/ynl/lib/nlspec.py @@ -274,6 +274,7 @@ class SpecFamily(SpecElement): Attributes: proto protocol type (e.g. genetlink) + license spec license (loaded from an SPDX tag on the spec) attr_sets dict of attribute sets msgs dict of all messages (index by name) @@ -283,6 +284,13 @@ class SpecFamily(SpecElement): """ def __init__(self, spec_path, schema_path=None): with open(spec_path, "r") as stream: + prefix = '# SPDX-License-Identifier: ' + first = stream.readline().strip() + if not first.startswith(prefix): + raise Exception('SPDX license tag required in the spec') + self.license = first[len(prefix):] + + stream.seek(0) spec = yaml.safe_load(stream) self._resolution_list = [] diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py index 3b4d03a50fc1..c16671a02621 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/ynl-gen-c.py @@ -2059,6 +2059,10 @@ def main(): try: parsed = Family(args.spec) + if parsed.license != '((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)': + print('Spec license:', parsed.license) + print('License must be: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)') + os.sys.exit(1) except yaml.YAMLError as exc: print(exc) os.sys.exit(1) @@ -2067,13 +2071,10 @@ def main(): cw = CodeWriter(BaseNlLib(), out_file) _, spec_kernel = find_kernel_root(args.spec) - if args.mode == 'uapi': - cw.p('/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */') + if args.mode == 'uapi' or args.header: + cw.p(f'/* SPDX-License-Identifier: {parsed.license} */') else: - if args.header: - cw.p('/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */') - else: - cw.p('// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)') + cw.p(f'// SPDX-License-Identifier: {parsed.license}') cw.p("/* Do not edit directly, auto-generated from: */") cw.p(f"/*\t{spec_kernel} */") cw.p(f"/* YNL-GEN {args.mode} {'header' if args.header else 'source'} */") From 5ae06327a3a5bad4ee246d81df203b1b00a7b390 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Thu, 16 Mar 2023 01:19:16 +0200 Subject: [PATCH 095/108] net: dsa: microchip: fix RGMII delay configuration on KSZ8765/KSZ8794/KSZ8795 The blamed commit has replaced a ksz_write8() call to address REG_PORT_5_CTRL_6 (0x56) with a ksz_set_xmii() -> ksz_pwrite8() call to regs[P_XMII_CTRL_1], which is also defined as 0x56 for ksz8795_regs[]. The trouble is that, when compared to ksz_write8(), ksz_pwrite8() also adjusts the register offset with the port base address. So in reality, ksz_pwrite8(offset=0x56) accesses register 0x56 + 0x50 = 0xa6, which in this switch appears to be unmapped, and the RGMII delay configuration on the CPU port does nothing. So if the switch wasn't fine with the RGMII delay configuration done through pin strapping and relied on Linux to apply a different one in order to pass traffic, this is now broken. Using the offset translation logic imposed by ksz_pwrite8(), the correct value for regs[P_XMII_CTRL_1] should have been 0x6 on ksz8795_regs[], in order to really end up accessing register 0x56. Static code analysis shows that, despite there being multiple other accesses to regs[P_XMII_CTRL_1] in this driver, the only code path that is applicable to ksz8795_regs[] and ksz8_dev_ops is ksz_set_xmii(). Therefore, the problem is isolated to RGMII delays. In its current form, ksz8795_regs[] contains the same value for P_XMII_CTRL_0 and for P_XMII_CTRL_1, and this raises valid suspicions that writes made by the driver to regs[P_XMII_CTRL_0] might overwrite writes made to regs[P_XMII_CTRL_1] or vice versa. Again, static analysis shows that the only accesses to P_XMII_CTRL_0 from the driver are made from code paths which are not reachable with ksz8_dev_ops. So the accesses made by ksz_set_xmii() are safe for this switch family. [ vladimiroltean: rewrote commit message ] Fixes: c476bede4b0f ("net: dsa: microchip: ksz8795: use common xmii function") Signed-off-by: Marek Vasut Signed-off-by: Vladimir Oltean Acked-by: Arun Ramadoss Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/20230315231916.2998480-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/microchip/ksz_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index 729b36eeb2c4..7fc2155d93d6 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -319,7 +319,7 @@ static const u16 ksz8795_regs[] = { [S_BROADCAST_CTRL] = 0x06, [S_MULTICAST_CTRL] = 0x04, [P_XMII_CTRL_0] = 0x06, - [P_XMII_CTRL_1] = 0x56, + [P_XMII_CTRL_1] = 0x06, }; static const u32 ksz8795_masks[] = { From 8de2bd02439eb839a452a853c1004c2c45ff6fef Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 16 Mar 2023 11:37:52 +0800 Subject: [PATCH 096/108] Revert "net/sched: act_api: move TCA_EXT_WARN_MSG to the correct hierarchy" This reverts commit 923b2e30dc9cd05931da0f64e2e23d040865c035. This is not a correct fix as TCA_EXT_WARN_MSG is not a hierarchy to TCA_ACT_TAB. I didn't notice the TC actions use different enum when adding TCA_EXT_WARN_MSG. To fix the difference I will add a new WARN enum in TCA_ROOT_MAX as Jamal suggested. Signed-off-by: Hangbin Liu Acked-by: Jamal Hadi Salim Signed-off-by: Jakub Kicinski --- net/sched/act_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 34c508675041..fce522886099 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -1596,12 +1596,12 @@ static int tca_get_fill(struct sk_buff *skb, struct tc_action *actions[], if (tcf_action_dump(skb, actions, bind, ref, false) < 0) goto out_nlmsg_trim; + nla_nest_end(skb, nest); + if (extack && extack->_msg && nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) goto out_nlmsg_trim; - nla_nest_end(skb, nest); - nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; From 2f59823fe696caa844249a90bb3f9aeda69cfe5c Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 16 Mar 2023 11:37:53 +0800 Subject: [PATCH 097/108] net/sched: act_api: add specific EXT_WARN_MSG for tc action In my previous commit 0349b8779cc9 ("sched: add new attr TCA_EXT_WARN_MSG to report tc extact message") I didn't notice the tc action use different enum with filter. So we can't use TCA_EXT_WARN_MSG directly for tc action. Let's add a TCA_ROOT_EXT_WARN_MSG for tc action specifically and put this param before going to the TCA_ACT_TAB nest. Fixes: 0349b8779cc9 ("sched: add new attr TCA_EXT_WARN_MSG to report tc extact message") Signed-off-by: Hangbin Liu Acked-by: Jamal Hadi Salim Signed-off-by: Jakub Kicinski --- include/uapi/linux/rtnetlink.h | 1 + net/sched/act_api.c | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 25a0af57dd5e..51c13cf9c5ae 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -789,6 +789,7 @@ enum { TCA_ROOT_FLAGS, TCA_ROOT_COUNT, TCA_ROOT_TIME_DELTA, /* in msecs */ + TCA_ROOT_EXT_WARN_MSG, __TCA_ROOT_MAX, #define TCA_ROOT_MAX (__TCA_ROOT_MAX - 1) }; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index fce522886099..296fc1afedd8 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -1589,6 +1589,10 @@ static int tca_get_fill(struct sk_buff *skb, struct tc_action *actions[], t->tca__pad1 = 0; t->tca__pad2 = 0; + if (extack && extack->_msg && + nla_put_string(skb, TCA_ROOT_EXT_WARN_MSG, extack->_msg)) + goto out_nlmsg_trim; + nest = nla_nest_start_noflag(skb, TCA_ACT_TAB); if (!nest) goto out_nlmsg_trim; @@ -1598,10 +1602,6 @@ static int tca_get_fill(struct sk_buff *skb, struct tc_action *actions[], nla_nest_end(skb, nest); - if (extack && extack->_msg && - nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) - goto out_nlmsg_trim; - nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; From 769639c1fe8a98129aa97c8ee981639db1e8955c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 16 Mar 2023 15:02:34 -0700 Subject: [PATCH 098/108] net: xdp: don't call notifiers during driver init Drivers will commonly perform feature setting during init, if they use the xdp_set_features_flag() helper they'll likely run into an ASSERT_RTNL() inside call_netdevice_notifiers_info(). Don't call the notifier until the device is actually registered. Nothing should be tracking the device until its registered and after its unregistration has started. Fixes: 4d5ab0ad964d ("net/mlx5e: take into account device reconfiguration for xdp_features flag") Link: https://lore.kernel.org/r/20230316220234.598091-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/xdp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/core/xdp.c b/net/core/xdp.c index 87e654b7d06c..b5737e47ec41 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -781,7 +781,9 @@ void xdp_set_features_flag(struct net_device *dev, xdp_features_t val) return; dev->xdp_features = val; - call_netdevice_notifiers(NETDEV_XDP_FEAT_CHANGE, dev); + + if (dev->reg_state == NETREG_REGISTERED) + call_netdevice_notifiers(NETDEV_XDP_FEAT_CHANGE, dev); } EXPORT_SYMBOL_GPL(xdp_set_features_flag); From dd172d0c2cea3c14ca9d007eeab51bec7676ece7 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 16 Mar 2023 09:51:33 -0500 Subject: [PATCH 099/108] net: ipa: reg: include When "reg.h" got created, it included calls to WARN() and WARN_ON(). Those macros are defined via . In addition, it uses is_power_of_2(), which is defined in . Include those files so IPA "reg.h" has access to all definitions it requires. Meanwhile, is included but nothing defined therein is required directly in "reg.h", so get rid of that. Fixes: 81772e444dbe ("net: ipa: start generalizing "ipa_reg"") Signed-off-by: Alex Elder Signed-off-by: Jakub Kicinski --- drivers/net/ipa/reg.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ipa/reg.h b/drivers/net/ipa/reg.h index 57b457f39b6e..2ee07eebca67 100644 --- a/drivers/net/ipa/reg.h +++ b/drivers/net/ipa/reg.h @@ -6,7 +6,8 @@ #define _REG_H_ #include -#include +#include +#include /** * struct reg - A register descriptor From 55c49e5c94411a82a0ba06dc615ed12b6387dec5 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 16 Mar 2023 09:51:34 -0500 Subject: [PATCH 100/108] net: ipa: add two missing declarations When gsi_reg_init() got added, its declaration was added to "gsi_reg.h" without declaring the two struct pointer types it uses. Add these struct declarations to "gsi_reg.h". Fixes: 3c506add35c7 ("net: ipa: introduce gsi_reg_init()") Signed-off-by: Alex Elder Signed-off-by: Jakub Kicinski --- drivers/net/ipa/gsi_reg.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ipa/gsi_reg.h b/drivers/net/ipa/gsi_reg.h index f62f0a5c653d..48fde65fa2e8 100644 --- a/drivers/net/ipa/gsi_reg.h +++ b/drivers/net/ipa/gsi_reg.h @@ -10,6 +10,10 @@ #include +struct platform_device; + +struct gsi; + /** * DOC: GSI Registers * From 786bbe50e1d5777f0b6ec7b4c2de6189d6f7feb4 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 16 Mar 2023 09:51:35 -0500 Subject: [PATCH 101/108] net: ipa: kill FILT_ROUT_CACHE_CFG IPA register A recent commit defined a few IPA registers used for IPA v5.0+. One of those was a mistake. Although the filter and router caches get *flushed* using a single register, they use distinct registers (ENDP_FILTER_CACHE_CFG and ENDP_ROUTER_CACHE_CFG) for configuration. And although there *exists* a FILT_ROUT_CACHE_CFG register, it is not needed in upstream code. So get rid of definitions related to FILT_ROUT_CACHE_CFG, because they are not needed. Fixes: 8ba59716d16a ("net: ipa: define IPA v5.0+ registers") Signed-off-by: Alex Elder Signed-off-by: Jakub Kicinski --- drivers/net/ipa/ipa_reg.c | 4 ++-- drivers/net/ipa/ipa_reg.h | 9 --------- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/drivers/net/ipa/ipa_reg.c b/drivers/net/ipa/ipa_reg.c index 735fa6591609..463a31dfa9f4 100644 --- a/drivers/net/ipa/ipa_reg.c +++ b/drivers/net/ipa/ipa_reg.c @@ -39,7 +39,8 @@ static bool ipa_reg_id_valid(struct ipa *ipa, enum ipa_reg_id reg_id) return version <= IPA_VERSION_3_1; case ENDP_FILTER_ROUTER_HSH_CFG: - return version != IPA_VERSION_4_2; + return version < IPA_VERSION_5_0 && + version != IPA_VERSION_4_2; case IRQ_SUSPEND_EN: case IRQ_SUSPEND_CLR: @@ -52,7 +53,6 @@ static bool ipa_reg_id_valid(struct ipa *ipa, enum ipa_reg_id reg_id) case QSB_MAX_WRITES: case QSB_MAX_READS: case FILT_ROUT_HASH_EN: - case FILT_ROUT_CACHE_CFG: case FILT_ROUT_HASH_FLUSH: case FILT_ROUT_CACHE_FLUSH: case STATE_AGGR_ACTIVE: diff --git a/drivers/net/ipa/ipa_reg.h b/drivers/net/ipa/ipa_reg.h index 28aa1351dd48..ff2be8be0f68 100644 --- a/drivers/net/ipa/ipa_reg.h +++ b/drivers/net/ipa/ipa_reg.h @@ -61,7 +61,6 @@ enum ipa_reg_id { QSB_MAX_WRITES, QSB_MAX_READS, FILT_ROUT_HASH_EN, /* Not IPA v5.0+ */ - FILT_ROUT_CACHE_CFG, /* IPA v5.0+ */ FILT_ROUT_HASH_FLUSH, /* Not IPA v5.0+ */ FILT_ROUT_CACHE_FLUSH, /* IPA v5.0+ */ STATE_AGGR_ACTIVE, @@ -206,14 +205,6 @@ enum ipa_reg_qsb_max_reads_field_id { GEN_QMB_1_MAX_READS_BEATS, /* IPA v4.0+ */ }; -/* FILT_ROUT_CACHE_CFG register */ -enum ipa_reg_filt_rout_cache_cfg_field_id { - ROUTER_CACHE_EN, - FILTER_CACHE_EN, - LOW_PRI_HASH_HIT_DISABLE, - LRU_EVICTION_THRESHOLD, -}; - /* FILT_ROUT_HASH_EN and FILT_ROUT_HASH_FLUSH registers */ enum ipa_reg_filt_rout_hash_field_id { IPV6_ROUTER_HASH, From 21e8aaca401ce2b45ece1d8fabd29d422de7b48e Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 16 Mar 2023 09:51:36 -0500 Subject: [PATCH 102/108] net: ipa: fix some register validity checks A recent commit defined HW_PARAM_4 as a GSI register ID but did not add it to gsi_reg_id_valid() to indicate it's valid (for IPA v5.0+). Add version checks for the HW_PARAM_2 and INTER_EE IRQ GSI registers there as well. IPA v5.0 supports up to 8 source and destination resource groups. Update the validity check (and the comments where the register IDs are defined) to reflect that. Similarly update comments and validity checks for the hash/cache-related registers. Note that this patch fixes an omission and constrains things further, but these don't technically represent bugs. Fixes: f651334e1ef5 ("net: ipa: add HW_PARAM_4 GSI register") Signed-off-by: Alex Elder Signed-off-by: Jakub Kicinski --- drivers/net/ipa/gsi_reg.c | 9 ++++++++- drivers/net/ipa/ipa_reg.c | 24 ++++++++++++++++-------- drivers/net/ipa/ipa_reg.h | 12 ++++++------ 3 files changed, 30 insertions(+), 15 deletions(-) diff --git a/drivers/net/ipa/gsi_reg.c b/drivers/net/ipa/gsi_reg.c index 1412b67304c8..1651fbad4bd5 100644 --- a/drivers/net/ipa/gsi_reg.c +++ b/drivers/net/ipa/gsi_reg.c @@ -15,6 +15,14 @@ static bool gsi_reg_id_valid(struct gsi *gsi, enum gsi_reg_id reg_id) switch (reg_id) { case INTER_EE_SRC_CH_IRQ_MSK: case INTER_EE_SRC_EV_CH_IRQ_MSK: + return gsi->version >= IPA_VERSION_3_5; + + case HW_PARAM_2: + return gsi->version >= IPA_VERSION_3_5_1; + + case HW_PARAM_4: + return gsi->version >= IPA_VERSION_5_0; + case CH_C_CNTXT_0: case CH_C_CNTXT_1: case CH_C_CNTXT_2: @@ -43,7 +51,6 @@ static bool gsi_reg_id_valid(struct gsi *gsi, enum gsi_reg_id reg_id) case CH_CMD: case EV_CH_CMD: case GENERIC_CMD: - case HW_PARAM_2: case CNTXT_TYPE_IRQ: case CNTXT_TYPE_IRQ_MSK: case CNTXT_SRC_CH_IRQ: diff --git a/drivers/net/ipa/ipa_reg.c b/drivers/net/ipa/ipa_reg.c index 463a31dfa9f4..3f475428dddd 100644 --- a/drivers/net/ipa/ipa_reg.c +++ b/drivers/net/ipa/ipa_reg.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2012-2018, The Linux Foundation. All rights reserved. - * Copyright (C) 2019-2022 Linaro Ltd. + * Copyright (C) 2019-2023 Linaro Ltd. */ #include @@ -15,6 +15,17 @@ static bool ipa_reg_id_valid(struct ipa *ipa, enum ipa_reg_id reg_id) enum ipa_version version = ipa->version; switch (reg_id) { + case FILT_ROUT_HASH_EN: + return version == IPA_VERSION_4_2; + + case FILT_ROUT_HASH_FLUSH: + return version < IPA_VERSION_5_0 && version != IPA_VERSION_4_2; + + case FILT_ROUT_CACHE_FLUSH: + case ENDP_FILTER_CACHE_CFG: + case ENDP_ROUTER_CACHE_CFG: + return version >= IPA_VERSION_5_0; + case IPA_BCR: case COUNTER_CFG: return version < IPA_VERSION_4_5; @@ -32,11 +43,13 @@ static bool ipa_reg_id_valid(struct ipa *ipa, enum ipa_reg_id reg_id) case SRC_RSRC_GRP_45_RSRC_TYPE: case DST_RSRC_GRP_45_RSRC_TYPE: return version <= IPA_VERSION_3_1 || - version == IPA_VERSION_4_5; + version == IPA_VERSION_4_5 || + version == IPA_VERSION_5_0; case SRC_RSRC_GRP_67_RSRC_TYPE: case DST_RSRC_GRP_67_RSRC_TYPE: - return version <= IPA_VERSION_3_1; + return version <= IPA_VERSION_3_1 || + version == IPA_VERSION_5_0; case ENDP_FILTER_ROUTER_HSH_CFG: return version < IPA_VERSION_5_0 && @@ -52,9 +65,6 @@ static bool ipa_reg_id_valid(struct ipa *ipa, enum ipa_reg_id reg_id) case SHARED_MEM_SIZE: case QSB_MAX_WRITES: case QSB_MAX_READS: - case FILT_ROUT_HASH_EN: - case FILT_ROUT_HASH_FLUSH: - case FILT_ROUT_CACHE_FLUSH: case STATE_AGGR_ACTIVE: case LOCAL_PKT_PROC_CNTXT: case AGGR_FORCE_CLOSE: @@ -76,8 +86,6 @@ static bool ipa_reg_id_valid(struct ipa *ipa, enum ipa_reg_id reg_id) case ENDP_INIT_RSRC_GRP: case ENDP_INIT_SEQ: case ENDP_STATUS: - case ENDP_FILTER_CACHE_CFG: - case ENDP_ROUTER_CACHE_CFG: case IPA_IRQ_STTS: case IPA_IRQ_EN: case IPA_IRQ_CLR: diff --git a/drivers/net/ipa/ipa_reg.h b/drivers/net/ipa/ipa_reg.h index ff2be8be0f68..7dd65d39333d 100644 --- a/drivers/net/ipa/ipa_reg.h +++ b/drivers/net/ipa/ipa_reg.h @@ -60,8 +60,8 @@ enum ipa_reg_id { SHARED_MEM_SIZE, QSB_MAX_WRITES, QSB_MAX_READS, - FILT_ROUT_HASH_EN, /* Not IPA v5.0+ */ - FILT_ROUT_HASH_FLUSH, /* Not IPA v5.0+ */ + FILT_ROUT_HASH_EN, /* IPA v4.2 */ + FILT_ROUT_HASH_FLUSH, /* Not IPA v4.2 nor IPA v5.0+ */ FILT_ROUT_CACHE_FLUSH, /* IPA v5.0+ */ STATE_AGGR_ACTIVE, IPA_BCR, /* Not IPA v4.5+ */ @@ -76,12 +76,12 @@ enum ipa_reg_id { TIMERS_PULSE_GRAN_CFG, /* IPA v4.5+ */ SRC_RSRC_GRP_01_RSRC_TYPE, SRC_RSRC_GRP_23_RSRC_TYPE, - SRC_RSRC_GRP_45_RSRC_TYPE, /* Not IPA v3.5+, IPA v4.5 */ - SRC_RSRC_GRP_67_RSRC_TYPE, /* Not IPA v3.5+ */ + SRC_RSRC_GRP_45_RSRC_TYPE, /* Not IPA v3.5+; IPA v4.5, IPA v5.0 */ + SRC_RSRC_GRP_67_RSRC_TYPE, /* Not IPA v3.5+; IPA v5.0 */ DST_RSRC_GRP_01_RSRC_TYPE, DST_RSRC_GRP_23_RSRC_TYPE, - DST_RSRC_GRP_45_RSRC_TYPE, /* Not IPA v3.5+, IPA v4.5 */ - DST_RSRC_GRP_67_RSRC_TYPE, /* Not IPA v3.5+ */ + DST_RSRC_GRP_45_RSRC_TYPE, /* Not IPA v3.5+; IPA v4.5, IPA v5.0 */ + DST_RSRC_GRP_67_RSRC_TYPE, /* Not IPA v3.5+; IPA v5.0 */ ENDP_INIT_CTRL, /* Not IPA v4.2+ for TX, not IPA v4.0+ for RX */ ENDP_INIT_CFG, ENDP_INIT_NAT, /* TX only */ From 90de546d9a0b3c771667af18bb3f80567eabb89b Mon Sep 17 00:00:00 2001 From: Liang He Date: Wed, 15 Mar 2023 14:00:21 +0800 Subject: [PATCH 103/108] ethernet: sun: add check for the mdesc_grab() In vnet_port_probe() and vsw_port_probe(), we should check the return value of mdesc_grab() as it may return NULL which can caused NPD bugs. Fixes: 5d01fa0c6bd8 ("ldmvsw: Add ldmvsw.c driver code") Fixes: 43fdf27470b2 ("[SPARC64]: Abstract out mdesc accesses for better MD update handling.") Signed-off-by: Liang He Reviewed-by: Piotr Raczynski Signed-off-by: David S. Miller --- drivers/net/ethernet/sun/ldmvsw.c | 3 +++ drivers/net/ethernet/sun/sunvnet.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/drivers/net/ethernet/sun/ldmvsw.c b/drivers/net/ethernet/sun/ldmvsw.c index 8addee6d04bd..734a817d3c94 100644 --- a/drivers/net/ethernet/sun/ldmvsw.c +++ b/drivers/net/ethernet/sun/ldmvsw.c @@ -287,6 +287,9 @@ static int vsw_port_probe(struct vio_dev *vdev, const struct vio_device_id *id) hp = mdesc_grab(); + if (!hp) + return -ENODEV; + rmac = mdesc_get_property(hp, vdev->mp, remote_macaddr_prop, &len); err = -ENODEV; if (!rmac) { diff --git a/drivers/net/ethernet/sun/sunvnet.c b/drivers/net/ethernet/sun/sunvnet.c index fe86fbd58586..e220620d0ffc 100644 --- a/drivers/net/ethernet/sun/sunvnet.c +++ b/drivers/net/ethernet/sun/sunvnet.c @@ -433,6 +433,9 @@ static int vnet_port_probe(struct vio_dev *vdev, const struct vio_device_id *id) hp = mdesc_grab(); + if (!hp) + return -ENODEV; + vp = vnet_find_parent(hp, vdev->mp, vdev); if (IS_ERR(vp)) { pr_err("Cannot find port parent vnet\n"); From e05bb97d9c9dd4ba5739a27921044c935a7fb3be Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Wed, 15 Mar 2023 16:04:23 +0900 Subject: [PATCH 104/108] net: renesas: rswitch: Fix the output value of quote from rswitch_rx() If the RX descriptor doesn't have any data, the output value of quote from rswitch_rx() will be increased unexpectedily. So, fix it. Reported-by: Volodymyr Babchuk Fixes: 3590918b5d07 ("net: ethernet: renesas: Add support for "Ethernet Switch"") Signed-off-by: Yoshihiro Shimoda Signed-off-by: David S. Miller --- drivers/net/ethernet/renesas/rswitch.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/renesas/rswitch.c b/drivers/net/ethernet/renesas/rswitch.c index 853394e5bb8b..46d8d9c8fc19 100644 --- a/drivers/net/ethernet/renesas/rswitch.c +++ b/drivers/net/ethernet/renesas/rswitch.c @@ -702,13 +702,14 @@ static bool rswitch_rx(struct net_device *ndev, int *quota) u16 pkt_len; u32 get_ts; + if (*quota <= 0) + return true; + boguscnt = min_t(int, gq->ring_size, *quota); limit = boguscnt; desc = &gq->rx_ring[gq->cur]; while ((desc->desc.die_dt & DT_MASK) != DT_FEMPTY) { - if (--boguscnt < 0) - break; dma_rmb(); pkt_len = le16_to_cpu(desc->desc.info_ds) & RX_DS; skb = gq->skbs[gq->cur]; @@ -734,6 +735,9 @@ static bool rswitch_rx(struct net_device *ndev, int *quota) gq->cur = rswitch_next_queue_index(gq, true, 1); desc = &gq->rx_ring[gq->cur]; + + if (--boguscnt <= 0) + break; } num = rswitch_get_num_cur_queues(gq); @@ -745,7 +749,7 @@ static bool rswitch_rx(struct net_device *ndev, int *quota) goto err; gq->dirty = rswitch_next_queue_index(gq, false, num); - *quota -= limit - (++boguscnt); + *quota -= limit - boguscnt; return boguscnt <= 0; From 2c59e993c86ad57afde26eaf1beb35694da5fbfe Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Wed, 15 Mar 2023 16:04:24 +0900 Subject: [PATCH 105/108] net: renesas: rswitch: Fix GWTSDIE register handling Since the GWCA has the TX timestamp feature, this driver should not disable it if one of ports is opened. So, fix it. Reported-by: Phong Hoang Fixes: 33f5d733b589 ("net: renesas: rswitch: Improve TX timestamp accuracy") Signed-off-by: Yoshihiro Shimoda Signed-off-by: David S. Miller --- drivers/net/ethernet/renesas/rswitch.c | 9 +++++++-- drivers/net/ethernet/renesas/rswitch.h | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/renesas/rswitch.c b/drivers/net/ethernet/renesas/rswitch.c index 46d8d9c8fc19..c4f93d24c6a4 100644 --- a/drivers/net/ethernet/renesas/rswitch.c +++ b/drivers/net/ethernet/renesas/rswitch.c @@ -1441,7 +1441,10 @@ static int rswitch_open(struct net_device *ndev) rswitch_enadis_data_irq(rdev->priv, rdev->tx_queue->index, true); rswitch_enadis_data_irq(rdev->priv, rdev->rx_queue->index, true); - iowrite32(GWCA_TS_IRQ_BIT, rdev->priv->addr + GWTSDIE); + if (bitmap_empty(rdev->priv->opened_ports, RSWITCH_NUM_PORTS)) + iowrite32(GWCA_TS_IRQ_BIT, rdev->priv->addr + GWTSDIE); + + bitmap_set(rdev->priv->opened_ports, rdev->port, 1); return 0; }; @@ -1452,8 +1455,10 @@ static int rswitch_stop(struct net_device *ndev) struct rswitch_gwca_ts_info *ts_info, *ts_info2; netif_tx_stop_all_queues(ndev); + bitmap_clear(rdev->priv->opened_ports, rdev->port, 1); - iowrite32(GWCA_TS_IRQ_BIT, rdev->priv->addr + GWTSDID); + if (bitmap_empty(rdev->priv->opened_ports, RSWITCH_NUM_PORTS)) + iowrite32(GWCA_TS_IRQ_BIT, rdev->priv->addr + GWTSDID); list_for_each_entry_safe(ts_info, ts_info2, &rdev->priv->gwca.ts_info_list, list) { if (ts_info->port != rdev->port) diff --git a/drivers/net/ethernet/renesas/rswitch.h b/drivers/net/ethernet/renesas/rswitch.h index 27d3d38c055f..b3e0411b408e 100644 --- a/drivers/net/ethernet/renesas/rswitch.h +++ b/drivers/net/ethernet/renesas/rswitch.h @@ -998,6 +998,7 @@ struct rswitch_private { struct rcar_gen4_ptp_private *ptp_priv; struct rswitch_device *rdev[RSWITCH_NUM_PORTS]; + DECLARE_BITMAP(opened_ports, RSWITCH_NUM_PORTS); struct rswitch_gwca gwca; struct rswitch_etha etha[RSWITCH_NUM_PORTS]; From 9ec7eb60dcbcb6c41076defbc5df7bbd95ceaba5 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 15 Mar 2023 13:18:40 +0200 Subject: [PATCH 106/108] bonding: restore IFF_MASTER/SLAVE flags on bond enslave ether type change Add bond_ether_setup helper which is used to fix ether_setup() calls in the bonding driver. It takes care of both IFF_MASTER and IFF_SLAVE flags, the former is always restored and the latter only if it was set. If the bond enslaves non-ARPHRD_ETHER device (changes its type), then releases it and enslaves ARPHRD_ETHER device (changes back) then we use ether_setup() to restore the bond device type but it also resets its flags and removes IFF_MASTER and IFF_SLAVE[1]. Use the bond_ether_setup helper to restore both after such transition. [1] reproduce (nlmon is non-ARPHRD_ETHER): $ ip l add nlmon0 type nlmon $ ip l add bond2 type bond mode active-backup $ ip l set nlmon0 master bond2 $ ip l set nlmon0 nomaster $ ip l add bond1 type bond (we use bond1 as ARPHRD_ETHER device to restore bond2's mode) $ ip l set bond1 master bond2 $ ip l sh dev bond2 37: bond2: mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ether be:d7:c5:40:5b:cc brd ff:ff:ff:ff:ff:ff promiscuity 0 minmtu 68 maxmtu 1500 (notice bond2's IFF_MASTER is missing) Fixes: e36b9d16c6a6 ("bonding: clean muticast addresses when device changes type") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 00646aa315c3..4bd911f9d3f9 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1775,6 +1775,19 @@ void bond_lower_state_changed(struct slave *slave) slave_err(bond_dev, slave_dev, "Error: %s\n", errmsg); \ } while (0) +/* The bonding driver uses ether_setup() to convert a master bond device + * to ARPHRD_ETHER, that resets the target netdevice's flags so we always + * have to restore the IFF_MASTER flag, and only restore IFF_SLAVE if it was set + */ +static void bond_ether_setup(struct net_device *bond_dev) +{ + unsigned int slave_flag = bond_dev->flags & IFF_SLAVE; + + ether_setup(bond_dev); + bond_dev->flags |= IFF_MASTER | slave_flag; + bond_dev->priv_flags &= ~IFF_TX_SKB_SHARING; +} + /* enslave device to bond device */ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, struct netlink_ext_ack *extack) @@ -1866,10 +1879,8 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, if (slave_dev->type != ARPHRD_ETHER) bond_setup_by_slave(bond_dev, slave_dev); - else { - ether_setup(bond_dev); - bond_dev->priv_flags &= ~IFF_TX_SKB_SHARING; - } + else + bond_ether_setup(bond_dev); call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE, bond_dev); From e667d469098671261d558be0cd93dca4d285ce1e Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 15 Mar 2023 13:18:41 +0200 Subject: [PATCH 107/108] bonding: restore bond's IFF_SLAVE flag if a non-eth dev enslave fails syzbot reported a warning[1] where the bond device itself is a slave and we try to enslave a non-ethernet device as the first slave which fails but then in the error path when ether_setup() restores the bond device it also clears all flags. In my previous fix[2] I restored the IFF_MASTER flag, but I didn't consider the case that the bond device itself might also be a slave with IFF_SLAVE set, so we need to restore that flag as well. Use the bond_ether_setup helper which does the right thing and restores the bond's flags properly. Steps to reproduce using a nlmon dev: $ ip l add nlmon0 type nlmon $ ip l add bond1 type bond $ ip l add bond2 type bond $ ip l set bond1 master bond2 $ ip l set dev nlmon0 master bond1 $ ip -d l sh dev bond1 22: bond1: mtu 1500 qdisc noqueue master bond2 state DOWN mode DEFAULT group default qlen 1000 (now bond1's IFF_SLAVE flag is gone and we'll hit a warning[3] if we try to delete it) [1] https://syzkaller.appspot.com/bug?id=391c7b1f6522182899efba27d891f1743e8eb3ef [2] commit 7d5cd2ce5292 ("bonding: correctly handle bonding type change on enslave failure") [3] example warning: [ 27.008664] bond1: (slave nlmon0): The slave device specified does not support setting the MAC address [ 27.008692] bond1: (slave nlmon0): Error -95 calling set_mac_address [ 32.464639] bond1 (unregistering): Released all slaves [ 32.464685] ------------[ cut here ]------------ [ 32.464686] WARNING: CPU: 1 PID: 2004 at net/core/dev.c:10829 unregister_netdevice_many+0x72a/0x780 [ 32.464694] Modules linked in: br_netfilter bridge bonding virtio_net [ 32.464699] CPU: 1 PID: 2004 Comm: ip Kdump: loaded Not tainted 5.18.0-rc3+ #47 [ 32.464703] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.1-2.fc37 04/01/2014 [ 32.464704] RIP: 0010:unregister_netdevice_many+0x72a/0x780 [ 32.464707] Code: 99 fd ff ff ba 90 1a 00 00 48 c7 c6 f4 02 66 96 48 c7 c7 20 4d 35 96 c6 05 fa c7 2b 02 01 e8 be 6f 4a 00 0f 0b e9 73 fd ff ff <0f> 0b e9 5f fd ff ff 80 3d e3 c7 2b 02 00 0f 85 3b fd ff ff ba 59 [ 32.464710] RSP: 0018:ffffa006422d7820 EFLAGS: 00010206 [ 32.464712] RAX: ffff8f6e077140a0 RBX: ffffa006422d7888 RCX: 0000000000000000 [ 32.464714] RDX: ffff8f6e12edbe58 RSI: 0000000000000296 RDI: ffffffff96d4a520 [ 32.464716] RBP: ffff8f6e07714000 R08: ffffffff96d63600 R09: ffffa006422d7728 [ 32.464717] R10: 0000000000000ec0 R11: ffffffff9698c988 R12: ffff8f6e12edb140 [ 32.464719] R13: dead000000000122 R14: dead000000000100 R15: ffff8f6e12edb140 [ 32.464723] FS: 00007f297c2f1740(0000) GS:ffff8f6e5d900000(0000) knlGS:0000000000000000 [ 32.464725] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 32.464726] CR2: 00007f297bf1c800 CR3: 00000000115e8000 CR4: 0000000000350ee0 [ 32.464730] Call Trace: [ 32.464763] [ 32.464767] rtnl_dellink+0x13e/0x380 [ 32.464776] ? cred_has_capability.isra.0+0x68/0x100 [ 32.464780] ? __rtnl_unlock+0x33/0x60 [ 32.464783] ? bpf_lsm_capset+0x10/0x10 [ 32.464786] ? security_capable+0x36/0x50 [ 32.464790] rtnetlink_rcv_msg+0x14e/0x3b0 [ 32.464792] ? _copy_to_iter+0xb1/0x790 [ 32.464796] ? post_alloc_hook+0xa0/0x160 [ 32.464799] ? rtnl_calcit.isra.0+0x110/0x110 [ 32.464802] netlink_rcv_skb+0x50/0xf0 [ 32.464806] netlink_unicast+0x216/0x340 [ 32.464809] netlink_sendmsg+0x23f/0x480 [ 32.464812] sock_sendmsg+0x5e/0x60 [ 32.464815] ____sys_sendmsg+0x22c/0x270 [ 32.464818] ? import_iovec+0x17/0x20 [ 32.464821] ? sendmsg_copy_msghdr+0x59/0x90 [ 32.464823] ? do_set_pte+0xa0/0xe0 [ 32.464828] ___sys_sendmsg+0x81/0xc0 [ 32.464832] ? mod_objcg_state+0xc6/0x300 [ 32.464835] ? refill_obj_stock+0xa9/0x160 [ 32.464838] ? memcg_slab_free_hook+0x1a5/0x1f0 [ 32.464842] __sys_sendmsg+0x49/0x80 [ 32.464847] do_syscall_64+0x3b/0x90 [ 32.464851] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 32.464865] RIP: 0033:0x7f297bf2e5e7 [ 32.464868] Code: 64 89 02 48 c7 c0 ff ff ff ff eb bb 0f 1f 80 00 00 00 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 89 54 24 1c 48 89 74 24 10 [ 32.464869] RSP: 002b:00007ffd96c824c8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 32.464872] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f297bf2e5e7 [ 32.464874] RDX: 0000000000000000 RSI: 00007ffd96c82540 RDI: 0000000000000003 [ 32.464875] RBP: 00000000640f19de R08: 0000000000000001 R09: 000000000000007c [ 32.464876] R10: 00007f297bffabe0 R11: 0000000000000246 R12: 0000000000000001 [ 32.464877] R13: 00007ffd96c82d20 R14: 00007ffd96c82610 R15: 000055bfe38a7020 [ 32.464881] [ 32.464882] ---[ end trace 0000000000000000 ]--- Fixes: 7d5cd2ce5292 ("bonding: correctly handle bonding type change on enslave failure") Reported-by: syzbot+9dfc3f3348729cc82277@syzkaller.appspotmail.com Link: https://syzkaller.appspot.com/bug?id=391c7b1f6522182899efba27d891f1743e8eb3ef Signed-off-by: Nikolay Aleksandrov Reviewed-by: Michal Kubiak Acked-by: Jonathan Toppins Acked-by: Jay Vosburgh Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 4bd911f9d3f9..236e5219c811 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2300,9 +2300,7 @@ err_undo_flags: eth_hw_addr_random(bond_dev); if (bond_dev->type != ARPHRD_ETHER) { dev_close(bond_dev); - ether_setup(bond_dev); - bond_dev->flags |= IFF_MASTER; - bond_dev->priv_flags &= ~IFF_TX_SKB_SHARING; + bond_ether_setup(bond_dev); } } From 222c94ec0ad48b951f0f692a7cf5bcf7a6bcb6b1 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 15 Mar 2023 13:18:42 +0200 Subject: [PATCH 108/108] selftests: bonding: add tests for ether type changes Add new network selftests for the bonding device which exercise the ether type changing call paths. They also test for the recent syzbot bug[1] which causes a warning and results in wrong device flags (IFF_SLAVE missing). The test adds three bond devices and a nlmon device, enslaves one of the bond devices to the other and then uses the nlmon device for successful and unsuccesful enslaves both of which change the bond ether type. Thus we can test for both MASTER and SLAVE flags at the same time. If the flags are properly restored we get: TEST: Change ether type of an enslaved bond device with unsuccessful enslave [ OK ] TEST: Change ether type of an enslaved bond device with successful enslave [ OK ] [1] https://syzkaller.appspot.com/bug?id=391c7b1f6522182899efba27d891f1743e8eb3ef Signed-off-by: Nikolay Aleksandrov Reviewed-by: Michal Kubiak Acked-by: Jonathan Toppins Acked-by: Jay Vosburgh Signed-off-by: David S. Miller --- .../selftests/drivers/net/bonding/Makefile | 3 +- .../net/bonding/bond-eth-type-change.sh | 85 +++++++++++++++++++ 2 files changed, 87 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/drivers/net/bonding/bond-eth-type-change.sh diff --git a/tools/testing/selftests/drivers/net/bonding/Makefile b/tools/testing/selftests/drivers/net/bonding/Makefile index 8e3b786a748f..a39bb2560d9b 100644 --- a/tools/testing/selftests/drivers/net/bonding/Makefile +++ b/tools/testing/selftests/drivers/net/bonding/Makefile @@ -8,7 +8,8 @@ TEST_PROGS := \ dev_addr_lists.sh \ mode-1-recovery-updelay.sh \ mode-2-recovery-updelay.sh \ - option_prio.sh + option_prio.sh \ + bond-eth-type-change.sh TEST_FILES := \ lag_lib.sh \ diff --git a/tools/testing/selftests/drivers/net/bonding/bond-eth-type-change.sh b/tools/testing/selftests/drivers/net/bonding/bond-eth-type-change.sh new file mode 100755 index 000000000000..5cdd22048ba7 --- /dev/null +++ b/tools/testing/selftests/drivers/net/bonding/bond-eth-type-change.sh @@ -0,0 +1,85 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test bond device ether type changing +# + +ALL_TESTS=" + bond_test_unsuccessful_enslave_type_change + bond_test_successful_enslave_type_change +" +REQUIRE_MZ=no +NUM_NETIFS=0 +lib_dir=$(dirname "$0") +source "$lib_dir"/net_forwarding_lib.sh + +bond_check_flags() +{ + local bonddev=$1 + + ip -d l sh dev "$bonddev" | grep -q "MASTER" + check_err $? "MASTER flag is missing from the bond device" + + ip -d l sh dev "$bonddev" | grep -q "SLAVE" + check_err $? "SLAVE flag is missing from the bond device" +} + +# test enslaved bond dev type change from ARPHRD_ETHER and back +# this allows us to test both MASTER and SLAVE flags at once +bond_test_enslave_type_change() +{ + local test_success=$1 + local devbond0="test-bond0" + local devbond1="test-bond1" + local devbond2="test-bond2" + local nonethdev="test-noneth0" + + # create a non-ARPHRD_ETHER device for testing (e.g. nlmon type) + ip link add name "$nonethdev" type nlmon + check_err $? "could not create a non-ARPHRD_ETHER device (nlmon)" + ip link add name "$devbond0" type bond + if [ $test_success -eq 1 ]; then + # we need devbond0 in active-backup mode to successfully enslave nonethdev + ip link set dev "$devbond0" type bond mode active-backup + check_err $? "could not change bond mode to active-backup" + fi + ip link add name "$devbond1" type bond + ip link add name "$devbond2" type bond + ip link set dev "$devbond0" master "$devbond1" + check_err $? "could not enslave $devbond0 to $devbond1" + # change bond type to non-ARPHRD_ETHER + ip link set dev "$nonethdev" master "$devbond0" 1>/dev/null 2>/dev/null + ip link set dev "$nonethdev" nomaster 1>/dev/null 2>/dev/null + # restore ARPHRD_ETHER type by enslaving such device + ip link set dev "$devbond2" master "$devbond0" + check_err $? "could not enslave $devbond2 to $devbond0" + ip link set dev "$devbond1" nomaster + + bond_check_flags "$devbond0" + + # clean up + ip link del dev "$devbond0" + ip link del dev "$devbond1" + ip link del dev "$devbond2" + ip link del dev "$nonethdev" +} + +bond_test_unsuccessful_enslave_type_change() +{ + RET=0 + + bond_test_enslave_type_change 0 + log_test "Change ether type of an enslaved bond device with unsuccessful enslave" +} + +bond_test_successful_enslave_type_change() +{ + RET=0 + + bond_test_enslave_type_change 1 + log_test "Change ether type of an enslaved bond device with successful enslave" +} + +tests_run + +exit "$EXIT_STATUS"