From 987d0242d189661f78b77cc4d77f843b15600fed Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Thu, 20 Apr 2023 00:14:10 -0700
Subject: [PATCH 01/57] bpf: Add bpf_dynptr_adjust

Add a new kfunc

int bpf_dynptr_adjust(struct bpf_dynptr_kern *ptr, u32 start, u32 end);

which adjusts the dynptr to reflect the new [start, end) interval.
In particular, it advances the offset of the dynptr by "start" bytes,
and if end is less than the size of the dynptr, then this will trim the
dynptr accordingly.

Adjusting the dynptr interval may be useful in certain situations.
For example, when hashing which takes in generic dynptrs, if the dynptr
points to a struct but only a certain memory region inside the struct
should be hashed, adjust can be used to narrow in on the
specific region to hash.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230420071414.570108-2-joannelkoong@gmail.com
---
 kernel/bpf/helpers.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 8d368fa353f9..42e06d1b492d 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1448,6 +1448,13 @@ u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr)
 	return ptr->size & DYNPTR_SIZE_MASK;
 }
 
+static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u32 new_size)
+{
+	u32 metadata = ptr->size & ~DYNPTR_SIZE_MASK;
+
+	ptr->size = new_size | metadata;
+}
+
 int bpf_dynptr_check_size(u32 size)
 {
 	return size > DYNPTR_MAX_SIZE ? -E2BIG : 0;
@@ -2297,6 +2304,24 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 o
 	return bpf_dynptr_slice(ptr, offset, buffer, buffer__szk);
 }
 
+__bpf_kfunc int bpf_dynptr_adjust(struct bpf_dynptr_kern *ptr, u32 start, u32 end)
+{
+	u32 size;
+
+	if (!ptr->data || start > end)
+		return -EINVAL;
+
+	size = bpf_dynptr_get_size(ptr);
+
+	if (start > size || end > size)
+		return -ERANGE;
+
+	ptr->offset += start;
+	bpf_dynptr_set_size(ptr, end - start);
+
+	return 0;
+}
+
 __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj)
 {
 	return obj;
@@ -2369,6 +2394,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
 BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, bpf_dynptr_adjust)
 BTF_SET8_END(common_btf_ids)
 
 static const struct btf_kfunc_id_set common_kfunc_set = {

From 540ccf96ddbc173474c32e595787d5622253be3d Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Thu, 20 Apr 2023 00:14:11 -0700
Subject: [PATCH 02/57] bpf: Add bpf_dynptr_is_null and bpf_dynptr_is_rdonly

bpf_dynptr_is_null returns true if the dynptr is null / invalid
(determined by whether ptr->data is NULL), else false if
the dynptr is a valid dynptr.

bpf_dynptr_is_rdonly returns true if the dynptr is read-only,
else false if the dynptr is read-writable. If the dynptr is
null / invalid, false is returned by default.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20230420071414.570108-3-joannelkoong@gmail.com
---
 kernel/bpf/helpers.c | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 42e06d1b492d..a683b3e71a28 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1423,7 +1423,7 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = {
 #define DYNPTR_SIZE_MASK	0xFFFFFF
 #define DYNPTR_RDONLY_BIT	BIT(31)
 
-static bool bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr)
+static bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr)
 {
 	return ptr->size & DYNPTR_RDONLY_BIT;
 }
@@ -1570,7 +1570,7 @@ BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, v
 	enum bpf_dynptr_type type;
 	int err;
 
-	if (!dst->data || bpf_dynptr_is_rdonly(dst))
+	if (!dst->data || __bpf_dynptr_is_rdonly(dst))
 		return -EINVAL;
 
 	err = bpf_dynptr_check_off_len(dst, offset, len);
@@ -1626,7 +1626,7 @@ BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u3
 	if (err)
 		return 0;
 
-	if (bpf_dynptr_is_rdonly(ptr))
+	if (__bpf_dynptr_is_rdonly(ptr))
 		return 0;
 
 	type = bpf_dynptr_get_type(ptr);
@@ -2276,7 +2276,7 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset
 __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 offset,
 					void *buffer, u32 buffer__szk)
 {
-	if (!ptr->data || bpf_dynptr_is_rdonly(ptr))
+	if (!ptr->data || __bpf_dynptr_is_rdonly(ptr))
 		return NULL;
 
 	/* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice.
@@ -2322,6 +2322,19 @@ __bpf_kfunc int bpf_dynptr_adjust(struct bpf_dynptr_kern *ptr, u32 start, u32 en
 	return 0;
 }
 
+__bpf_kfunc bool bpf_dynptr_is_null(struct bpf_dynptr_kern *ptr)
+{
+	return !ptr->data;
+}
+
+__bpf_kfunc bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr)
+{
+	if (!ptr->data)
+		return false;
+
+	return __bpf_dynptr_is_rdonly(ptr);
+}
+
 __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj)
 {
 	return obj;
@@ -2395,6 +2408,8 @@ BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
 BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, bpf_dynptr_adjust)
+BTF_ID_FLAGS(func, bpf_dynptr_is_null)
+BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
 BTF_SET8_END(common_btf_ids)
 
 static const struct btf_kfunc_id_set common_kfunc_set = {

From 26662d7347a058ca497792c4b22ac91cc415cbf6 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Thu, 20 Apr 2023 00:14:12 -0700
Subject: [PATCH 03/57] bpf: Add bpf_dynptr_size

bpf_dynptr_size returns the number of usable bytes in a dynptr.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20230420071414.570108-4-joannelkoong@gmail.com
---
 include/linux/bpf.h      |  2 +-
 kernel/bpf/helpers.c     | 15 ++++++++++++---
 kernel/trace/bpf_trace.c |  4 ++--
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e53ceee1df37..456f33b9d205 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1197,7 +1197,7 @@ enum bpf_dynptr_type {
 };
 
 int bpf_dynptr_check_size(u32 size);
-u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr);
+u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr);
 
 #ifdef CONFIG_BPF_JIT
 int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index a683b3e71a28..c465e97733b9 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1443,7 +1443,7 @@ static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *pt
 	return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT;
 }
 
-u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr)
+u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr)
 {
 	return ptr->size & DYNPTR_SIZE_MASK;
 }
@@ -1476,7 +1476,7 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
 
 static int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len)
 {
-	u32 size = bpf_dynptr_get_size(ptr);
+	u32 size = __bpf_dynptr_size(ptr);
 
 	if (len > size || offset > size - len)
 		return -E2BIG;
@@ -2311,7 +2311,7 @@ __bpf_kfunc int bpf_dynptr_adjust(struct bpf_dynptr_kern *ptr, u32 start, u32 en
 	if (!ptr->data || start > end)
 		return -EINVAL;
 
-	size = bpf_dynptr_get_size(ptr);
+	size = __bpf_dynptr_size(ptr);
 
 	if (start > size || end > size)
 		return -ERANGE;
@@ -2335,6 +2335,14 @@ __bpf_kfunc bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr)
 	return __bpf_dynptr_is_rdonly(ptr);
 }
 
+__bpf_kfunc __u32 bpf_dynptr_size(const struct bpf_dynptr_kern *ptr)
+{
+	if (!ptr->data)
+		return -EINVAL;
+
+	return __bpf_dynptr_size(ptr);
+}
+
 __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj)
 {
 	return obj;
@@ -2410,6 +2418,7 @@ BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, bpf_dynptr_adjust)
 BTF_ID_FLAGS(func, bpf_dynptr_is_null)
 BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
+BTF_ID_FLAGS(func, bpf_dynptr_size)
 BTF_SET8_END(common_btf_ids)
 
 static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index bcf91bc7bf71..8deb22a99abe 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1349,9 +1349,9 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr,
 	}
 
 	return verify_pkcs7_signature(data_ptr->data,
-				      bpf_dynptr_get_size(data_ptr),
+				      __bpf_dynptr_size(data_ptr),
 				      sig_ptr->data,
-				      bpf_dynptr_get_size(sig_ptr),
+				      __bpf_dynptr_size(sig_ptr),
 				      trusted_keyring->key,
 				      VERIFYING_UNSPECIFIED_SIGNATURE, NULL,
 				      NULL);

From 361f129f3cc185af6667aca0bec0be9a020a8abc Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Thu, 20 Apr 2023 00:14:13 -0700
Subject: [PATCH 04/57] bpf: Add bpf_dynptr_clone

The cloned dynptr will point to the same data as its parent dynptr,
with the same type, offset, size and read-only properties.

Any writes to a dynptr will be reflected across all instances
(by 'instance', this means any dynptrs that point to the same
underlying data).

Please note that data slice and dynptr invalidations will affect all
instances as well. For example, if bpf_dynptr_write() is called on an
skb-type dynptr, all data slices of dynptr instances to that skb
will be invalidated as well (eg data slices of any clones, parents,
grandparents, ...). Another example is if a ringbuf dynptr is submitted,
any instance of that dynptr will be invalidated.

Changing the view of the dynptr (eg advancing the offset or
trimming the size) will only affect that dynptr and not affect any
other instances.

One example use case where cloning may be helpful is for hashing or
iterating through dynptr data. Cloning will allow the user to maintain
the original view of the dynptr for future use, while also allowing
views to smaller subsets of the data after the offset is advanced or the
size is trimmed.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230420071414.570108-5-joannelkoong@gmail.com
---
 kernel/bpf/helpers.c  |  14 ++++++
 kernel/bpf/verifier.c | 103 ++++++++++++++++++++++++++++++++++--------
 2 files changed, 97 insertions(+), 20 deletions(-)

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index c465e97733b9..bb6b4637ebf2 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2343,6 +2343,19 @@ __bpf_kfunc __u32 bpf_dynptr_size(const struct bpf_dynptr_kern *ptr)
 	return __bpf_dynptr_size(ptr);
 }
 
+__bpf_kfunc int bpf_dynptr_clone(struct bpf_dynptr_kern *ptr,
+				 struct bpf_dynptr_kern *clone__uninit)
+{
+	if (!ptr->data) {
+		bpf_dynptr_set_null(clone__uninit);
+		return -EINVAL;
+	}
+
+	*clone__uninit = *ptr;
+
+	return 0;
+}
+
 __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj)
 {
 	return obj;
@@ -2419,6 +2432,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_adjust)
 BTF_ID_FLAGS(func, bpf_dynptr_is_null)
 BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
 BTF_ID_FLAGS(func, bpf_dynptr_size)
+BTF_ID_FLAGS(func, bpf_dynptr_clone)
 BTF_SET8_END(common_btf_ids)
 
 static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index fbcf5a4e2fcd..ff4a8ab99f08 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -309,6 +309,7 @@ struct bpf_kfunc_call_arg_meta {
 	struct {
 		enum bpf_dynptr_type type;
 		u32 id;
+		u32 ref_obj_id;
 	} initialized_dynptr;
 	struct {
 		u8 spi;
@@ -847,11 +848,11 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
 				        struct bpf_func_state *state, int spi);
 
 static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
-				   enum bpf_arg_type arg_type, int insn_idx)
+				   enum bpf_arg_type arg_type, int insn_idx, int clone_ref_obj_id)
 {
 	struct bpf_func_state *state = func(env, reg);
 	enum bpf_dynptr_type type;
-	int spi, i, id, err;
+	int spi, i, err;
 
 	spi = dynptr_get_spi(env, reg);
 	if (spi < 0)
@@ -887,7 +888,13 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_
 
 	if (dynptr_type_refcounted(type)) {
 		/* The id is used to track proper releasing */
-		id = acquire_reference_state(env, insn_idx);
+		int id;
+
+		if (clone_ref_obj_id)
+			id = clone_ref_obj_id;
+		else
+			id = acquire_reference_state(env, insn_idx);
+
 		if (id < 0)
 			return id;
 
@@ -901,24 +908,15 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_
 	return 0;
 }
 
-static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi)
 {
-	struct bpf_func_state *state = func(env, reg);
-	int spi, i;
-
-	spi = dynptr_get_spi(env, reg);
-	if (spi < 0)
-		return spi;
+	int i;
 
 	for (i = 0; i < BPF_REG_SIZE; i++) {
 		state->stack[spi].slot_type[i] = STACK_INVALID;
 		state->stack[spi - 1].slot_type[i] = STACK_INVALID;
 	}
 
-	/* Invalidate any slices associated with this dynptr */
-	if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type))
-		WARN_ON_ONCE(release_reference(env, state->stack[spi].spilled_ptr.ref_obj_id));
-
 	__mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
 	__mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
 
@@ -945,6 +943,50 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re
 	 */
 	state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
 	state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN;
+}
+
+static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+	struct bpf_func_state *state = func(env, reg);
+	int spi, ref_obj_id, i;
+
+	spi = dynptr_get_spi(env, reg);
+	if (spi < 0)
+		return spi;
+
+	if (!dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) {
+		invalidate_dynptr(env, state, spi);
+		return 0;
+	}
+
+	ref_obj_id = state->stack[spi].spilled_ptr.ref_obj_id;
+
+	/* If the dynptr has a ref_obj_id, then we need to invalidate
+	 * two things:
+	 *
+	 * 1) Any dynptrs with a matching ref_obj_id (clones)
+	 * 2) Any slices derived from this dynptr.
+	 */
+
+	/* Invalidate any slices associated with this dynptr */
+	WARN_ON_ONCE(release_reference(env, ref_obj_id));
+
+	/* Invalidate any dynptr clones */
+	for (i = 1; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+		if (state->stack[i].spilled_ptr.ref_obj_id != ref_obj_id)
+			continue;
+
+		/* it should always be the case that if the ref obj id
+		 * matches then the stack slot also belongs to a
+		 * dynptr
+		 */
+		if (state->stack[i].slot_type[0] != STACK_DYNPTR) {
+			verbose(env, "verifier internal error: misconfigured ref_obj_id\n");
+			return -EFAULT;
+		}
+		if (state->stack[i].spilled_ptr.dynptr.first_slot)
+			invalidate_dynptr(env, state, i);
+	}
 
 	return 0;
 }
@@ -6677,7 +6719,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno,
  * type, and declare it as 'const struct bpf_dynptr *' in their prototype.
  */
 static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn_idx,
-			       enum bpf_arg_type arg_type)
+			       enum bpf_arg_type arg_type, int clone_ref_obj_id)
 {
 	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
 	int err;
@@ -6721,7 +6763,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
 				return err;
 		}
 
-		err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx);
+		err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx, clone_ref_obj_id);
 	} else /* MEM_RDONLY and None case from above */ {
 		/* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */
 		if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) {
@@ -7631,7 +7673,7 @@ skip_type_check:
 		err = check_mem_size_reg(env, reg, regno, true, meta);
 		break;
 	case ARG_PTR_TO_DYNPTR:
-		err = process_dynptr_func(env, regno, insn_idx, arg_type);
+		err = process_dynptr_func(env, regno, insn_idx, arg_type, 0);
 		if (err)
 			return err;
 		break;
@@ -9595,6 +9637,7 @@ enum special_kfunc_type {
 	KF_bpf_dynptr_from_xdp,
 	KF_bpf_dynptr_slice,
 	KF_bpf_dynptr_slice_rdwr,
+	KF_bpf_dynptr_clone,
 };
 
 BTF_SET_START(special_kfunc_set)
@@ -9614,6 +9657,7 @@ BTF_ID(func, bpf_dynptr_from_skb)
 BTF_ID(func, bpf_dynptr_from_xdp)
 BTF_ID(func, bpf_dynptr_slice)
 BTF_ID(func, bpf_dynptr_slice_rdwr)
+BTF_ID(func, bpf_dynptr_clone)
 BTF_SET_END(special_kfunc_set)
 
 BTF_ID_LIST(special_kfunc_list)
@@ -9635,6 +9679,7 @@ BTF_ID(func, bpf_dynptr_from_skb)
 BTF_ID(func, bpf_dynptr_from_xdp)
 BTF_ID(func, bpf_dynptr_slice)
 BTF_ID(func, bpf_dynptr_slice_rdwr)
+BTF_ID(func, bpf_dynptr_clone)
 
 static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta)
 {
@@ -10330,6 +10375,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 		case KF_ARG_PTR_TO_DYNPTR:
 		{
 			enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR;
+			int clone_ref_obj_id = 0;
 
 			if (reg->type != PTR_TO_STACK &&
 			    reg->type != CONST_PTR_TO_DYNPTR) {
@@ -10343,12 +10389,28 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			if (is_kfunc_arg_uninit(btf, &args[i]))
 				dynptr_arg_type |= MEM_UNINIT;
 
-			if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb])
+			if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) {
 				dynptr_arg_type |= DYNPTR_TYPE_SKB;
-			else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp])
+			} else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp]) {
 				dynptr_arg_type |= DYNPTR_TYPE_XDP;
+			} else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] &&
+				   (dynptr_arg_type & MEM_UNINIT)) {
+				enum bpf_dynptr_type parent_type = meta->initialized_dynptr.type;
 
-			ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type);
+				if (parent_type == BPF_DYNPTR_TYPE_INVALID) {
+					verbose(env, "verifier internal error: no dynptr type for parent of clone\n");
+					return -EFAULT;
+				}
+
+				dynptr_arg_type |= (unsigned int)get_dynptr_type_flag(parent_type);
+				clone_ref_obj_id = meta->initialized_dynptr.ref_obj_id;
+				if (dynptr_type_refcounted(parent_type) && !clone_ref_obj_id) {
+					verbose(env, "verifier internal error: missing ref obj id for parent of clone\n");
+					return -EFAULT;
+				}
+			}
+
+			ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type, clone_ref_obj_id);
 			if (ret < 0)
 				return ret;
 
@@ -10361,6 +10423,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 				}
 				meta->initialized_dynptr.id = id;
 				meta->initialized_dynptr.type = dynptr_get_type(env, reg);
+				meta->initialized_dynptr.ref_obj_id = dynptr_ref_obj_id(env, reg);
 			}
 
 			break;

From d911ba7ceafd29606df4018bbd87a1642f9e6d88 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Thu, 20 Apr 2023 00:14:14 -0700
Subject: [PATCH 05/57] selftests/bpf: Add tests for dynptr convenience helpers

Add various tests for the added dynptr convenience helpers.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230420071414.570108-6-joannelkoong@gmail.com
---
 tools/testing/selftests/bpf/bpf_kfuncs.h      |   6 +
 .../testing/selftests/bpf/prog_tests/dynptr.c |   6 +
 .../testing/selftests/bpf/progs/dynptr_fail.c | 287 +++++++++++++++++
 .../selftests/bpf/progs/dynptr_success.c      | 298 ++++++++++++++++++
 4 files changed, 597 insertions(+)

diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h
index 8c993ec8ceea..f3c41f8902a0 100644
--- a/tools/testing/selftests/bpf/bpf_kfuncs.h
+++ b/tools/testing/selftests/bpf/bpf_kfuncs.h
@@ -35,4 +35,10 @@ extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, __u32 offset,
 extern void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *ptr, __u32 offset,
 			      void *buffer, __u32 buffer__szk) __ksym;
 
+extern int bpf_dynptr_adjust(const struct bpf_dynptr *ptr, __u32 start, __u32 end) __ksym;
+extern int bpf_dynptr_is_null(const struct bpf_dynptr *ptr) __ksym;
+extern int bpf_dynptr_is_rdonly(const struct bpf_dynptr *ptr) __ksym;
+extern __u32 bpf_dynptr_size(const struct bpf_dynptr *ptr) __ksym;
+extern int bpf_dynptr_clone(const struct bpf_dynptr *ptr, struct bpf_dynptr *clone__init) __ksym;
+
 #endif
diff --git a/tools/testing/selftests/bpf/prog_tests/dynptr.c b/tools/testing/selftests/bpf/prog_tests/dynptr.c
index d176c34a7d2e..0478916aff37 100644
--- a/tools/testing/selftests/bpf/prog_tests/dynptr.c
+++ b/tools/testing/selftests/bpf/prog_tests/dynptr.c
@@ -20,6 +20,12 @@ static struct {
 	{"test_ringbuf", SETUP_SYSCALL_SLEEP},
 	{"test_skb_readonly", SETUP_SKB_PROG},
 	{"test_dynptr_skb_data", SETUP_SKB_PROG},
+	{"test_adjust", SETUP_SYSCALL_SLEEP},
+	{"test_adjust_err", SETUP_SYSCALL_SLEEP},
+	{"test_zero_size_dynptr", SETUP_SYSCALL_SLEEP},
+	{"test_dynptr_is_null", SETUP_SYSCALL_SLEEP},
+	{"test_dynptr_is_rdonly", SETUP_SKB_PROG},
+	{"test_dynptr_clone", SETUP_SKB_PROG},
 };
 
 static void verify_success(const char *prog_name, enum test_setup_type setup_type)
diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c
index 759eb5c245cd..efe4ce72d00e 100644
--- a/tools/testing/selftests/bpf/progs/dynptr_fail.c
+++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c
@@ -1378,3 +1378,290 @@ int invalid_slice_rdwr_rdonly(struct __sk_buff *skb)
 
 	return 0;
 }
+
+/* bpf_dynptr_adjust can only be called on initialized dynptrs */
+SEC("?raw_tp")
+__failure __msg("Expected an initialized dynptr as arg #1")
+int dynptr_adjust_invalid(void *ctx)
+{
+	struct bpf_dynptr ptr;
+
+	/* this should fail */
+	bpf_dynptr_adjust(&ptr, 1, 2);
+
+	return 0;
+}
+
+/* bpf_dynptr_is_null can only be called on initialized dynptrs */
+SEC("?raw_tp")
+__failure __msg("Expected an initialized dynptr as arg #1")
+int dynptr_is_null_invalid(void *ctx)
+{
+	struct bpf_dynptr ptr;
+
+	/* this should fail */
+	bpf_dynptr_is_null(&ptr);
+
+	return 0;
+}
+
+/* bpf_dynptr_is_rdonly can only be called on initialized dynptrs */
+SEC("?raw_tp")
+__failure __msg("Expected an initialized dynptr as arg #1")
+int dynptr_is_rdonly_invalid(void *ctx)
+{
+	struct bpf_dynptr ptr;
+
+	/* this should fail */
+	bpf_dynptr_is_rdonly(&ptr);
+
+	return 0;
+}
+
+/* bpf_dynptr_size can only be called on initialized dynptrs */
+SEC("?raw_tp")
+__failure __msg("Expected an initialized dynptr as arg #1")
+int dynptr_size_invalid(void *ctx)
+{
+	struct bpf_dynptr ptr;
+
+	/* this should fail */
+	bpf_dynptr_size(&ptr);
+
+	return 0;
+}
+
+/* Only initialized dynptrs can be cloned */
+SEC("?raw_tp")
+__failure __msg("Expected an initialized dynptr as arg #1")
+int clone_invalid1(void *ctx)
+{
+	struct bpf_dynptr ptr1;
+	struct bpf_dynptr ptr2;
+
+	/* this should fail */
+	bpf_dynptr_clone(&ptr1, &ptr2);
+
+	return 0;
+}
+
+/* Can't overwrite an existing dynptr when cloning */
+SEC("?xdp")
+__failure __msg("cannot overwrite referenced dynptr")
+int clone_invalid2(struct xdp_md *xdp)
+{
+	struct bpf_dynptr ptr1;
+	struct bpf_dynptr clone;
+
+	bpf_dynptr_from_xdp(xdp, 0, &ptr1);
+
+	bpf_ringbuf_reserve_dynptr(&ringbuf, 64, 0, &clone);
+
+	/* this should fail */
+	bpf_dynptr_clone(&ptr1, &clone);
+
+	bpf_ringbuf_submit_dynptr(&clone, 0);
+
+	return 0;
+}
+
+/* Invalidating a dynptr should invalidate its clones */
+SEC("?raw_tp")
+__failure __msg("Expected an initialized dynptr as arg #3")
+int clone_invalidate1(void *ctx)
+{
+	struct bpf_dynptr clone;
+	struct bpf_dynptr ptr;
+	char read_data[64];
+
+	bpf_ringbuf_reserve_dynptr(&ringbuf, val, 0, &ptr);
+
+	bpf_dynptr_clone(&ptr, &clone);
+
+	bpf_ringbuf_submit_dynptr(&ptr, 0);
+
+	/* this should fail */
+	bpf_dynptr_read(read_data, sizeof(read_data), &clone, 0, 0);
+
+	return 0;
+}
+
+/* Invalidating a dynptr should invalidate its parent */
+SEC("?raw_tp")
+__failure __msg("Expected an initialized dynptr as arg #3")
+int clone_invalidate2(void *ctx)
+{
+	struct bpf_dynptr ptr;
+	struct bpf_dynptr clone;
+	char read_data[64];
+
+	bpf_ringbuf_reserve_dynptr(&ringbuf, val, 0, &ptr);
+
+	bpf_dynptr_clone(&ptr, &clone);
+
+	bpf_ringbuf_submit_dynptr(&clone, 0);
+
+	/* this should fail */
+	bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0, 0);
+
+	return 0;
+}
+
+/* Invalidating a dynptr should invalidate its siblings */
+SEC("?raw_tp")
+__failure __msg("Expected an initialized dynptr as arg #3")
+int clone_invalidate3(void *ctx)
+{
+	struct bpf_dynptr ptr;
+	struct bpf_dynptr clone1;
+	struct bpf_dynptr clone2;
+	char read_data[64];
+
+	bpf_ringbuf_reserve_dynptr(&ringbuf, val, 0, &ptr);
+
+	bpf_dynptr_clone(&ptr, &clone1);
+
+	bpf_dynptr_clone(&ptr, &clone2);
+
+	bpf_ringbuf_submit_dynptr(&clone2, 0);
+
+	/* this should fail */
+	bpf_dynptr_read(read_data, sizeof(read_data), &clone1, 0, 0);
+
+	return 0;
+}
+
+/* Invalidating a dynptr should invalidate any data slices
+ * of its clones
+ */
+SEC("?raw_tp")
+__failure __msg("invalid mem access 'scalar'")
+int clone_invalidate4(void *ctx)
+{
+	struct bpf_dynptr ptr;
+	struct bpf_dynptr clone;
+	int *data;
+
+	bpf_ringbuf_reserve_dynptr(&ringbuf, val, 0, &ptr);
+
+	bpf_dynptr_clone(&ptr, &clone);
+	data = bpf_dynptr_data(&clone, 0, sizeof(val));
+	if (!data)
+		return 0;
+
+	bpf_ringbuf_submit_dynptr(&ptr, 0);
+
+	/* this should fail */
+	*data = 123;
+
+	return 0;
+}
+
+/* Invalidating a dynptr should invalidate any data slices
+ * of its parent
+ */
+SEC("?raw_tp")
+__failure __msg("invalid mem access 'scalar'")
+int clone_invalidate5(void *ctx)
+{
+	struct bpf_dynptr ptr;
+	struct bpf_dynptr clone;
+	int *data;
+
+	bpf_ringbuf_reserve_dynptr(&ringbuf, val, 0, &ptr);
+	data = bpf_dynptr_data(&ptr, 0, sizeof(val));
+	if (!data)
+		return 0;
+
+	bpf_dynptr_clone(&ptr, &clone);
+
+	bpf_ringbuf_submit_dynptr(&clone, 0);
+
+	/* this should fail */
+	*data = 123;
+
+	return 0;
+}
+
+/* Invalidating a dynptr should invalidate any data slices
+ * of its sibling
+ */
+SEC("?raw_tp")
+__failure __msg("invalid mem access 'scalar'")
+int clone_invalidate6(void *ctx)
+{
+	struct bpf_dynptr ptr;
+	struct bpf_dynptr clone1;
+	struct bpf_dynptr clone2;
+	int *data;
+
+	bpf_ringbuf_reserve_dynptr(&ringbuf, val, 0, &ptr);
+
+	bpf_dynptr_clone(&ptr, &clone1);
+
+	bpf_dynptr_clone(&ptr, &clone2);
+
+	data = bpf_dynptr_data(&clone1, 0, sizeof(val));
+	if (!data)
+		return 0;
+
+	bpf_ringbuf_submit_dynptr(&clone2, 0);
+
+	/* this should fail */
+	*data = 123;
+
+	return 0;
+}
+
+/* A skb clone's data slices should be invalid anytime packet data changes */
+SEC("?tc")
+__failure __msg("invalid mem access 'scalar'")
+int clone_skb_packet_data(struct __sk_buff *skb)
+{
+	char buffer[sizeof(__u32)] = {};
+	struct bpf_dynptr clone;
+	struct bpf_dynptr ptr;
+	__u32 *data;
+
+	bpf_dynptr_from_skb(skb, 0, &ptr);
+
+	bpf_dynptr_clone(&ptr, &clone);
+	data = bpf_dynptr_slice_rdwr(&clone, 0, buffer, sizeof(buffer));
+	if (!data)
+		return XDP_DROP;
+
+	if (bpf_skb_pull_data(skb, skb->len))
+		return SK_DROP;
+
+	/* this should fail */
+	*data = 123;
+
+	return 0;
+}
+
+/* A xdp clone's data slices should be invalid anytime packet data changes */
+SEC("?xdp")
+__failure __msg("invalid mem access 'scalar'")
+int clone_xdp_packet_data(struct xdp_md *xdp)
+{
+	char buffer[sizeof(__u32)] = {};
+	struct bpf_dynptr clone;
+	struct bpf_dynptr ptr;
+	struct ethhdr *hdr;
+	__u32 *data;
+
+	bpf_dynptr_from_xdp(xdp, 0, &ptr);
+
+	bpf_dynptr_clone(&ptr, &clone);
+	data = bpf_dynptr_slice_rdwr(&clone, 0, buffer, sizeof(buffer));
+	if (!data)
+		return XDP_DROP;
+
+	if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(*hdr)))
+		return XDP_DROP;
+
+	/* this should fail */
+	*data = 123;
+
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c
index b2fa6c47ecc0..be7de62de045 100644
--- a/tools/testing/selftests/bpf/progs/dynptr_success.c
+++ b/tools/testing/selftests/bpf/progs/dynptr_success.c
@@ -207,3 +207,301 @@ int test_dynptr_skb_data(struct __sk_buff *skb)
 
 	return 1;
 }
+
+SEC("tp/syscalls/sys_enter_nanosleep")
+int test_adjust(void *ctx)
+{
+	struct bpf_dynptr ptr;
+	__u32 bytes = 64;
+	__u32 off = 10;
+	__u32 trim = 15;
+
+	if (bpf_get_current_pid_tgid() >> 32 != pid)
+		return 0;
+
+	err = bpf_ringbuf_reserve_dynptr(&ringbuf, bytes, 0, &ptr);
+	if (err) {
+		err = 1;
+		goto done;
+	}
+
+	if (bpf_dynptr_size(&ptr) != bytes) {
+		err = 2;
+		goto done;
+	}
+
+	/* Advance the dynptr by off */
+	err = bpf_dynptr_adjust(&ptr, off, bpf_dynptr_size(&ptr));
+	if (err) {
+		err = 3;
+		goto done;
+	}
+
+	if (bpf_dynptr_size(&ptr) != bytes - off) {
+		err = 4;
+		goto done;
+	}
+
+	/* Trim the dynptr */
+	err = bpf_dynptr_adjust(&ptr, off, 15);
+	if (err) {
+		err = 5;
+		goto done;
+	}
+
+	/* Check that the size was adjusted correctly */
+	if (bpf_dynptr_size(&ptr) != trim - off) {
+		err = 6;
+		goto done;
+	}
+
+done:
+	bpf_ringbuf_discard_dynptr(&ptr, 0);
+	return 0;
+}
+
+SEC("tp/syscalls/sys_enter_nanosleep")
+int test_adjust_err(void *ctx)
+{
+	char write_data[45] = "hello there, world!!";
+	struct bpf_dynptr ptr;
+	__u32 size = 64;
+	__u32 off = 20;
+
+	if (bpf_get_current_pid_tgid() >> 32 != pid)
+		return 0;
+
+	if (bpf_ringbuf_reserve_dynptr(&ringbuf, size, 0, &ptr)) {
+		err = 1;
+		goto done;
+	}
+
+	/* Check that start can't be greater than end */
+	if (bpf_dynptr_adjust(&ptr, 5, 1) != -EINVAL) {
+		err = 2;
+		goto done;
+	}
+
+	/* Check that start can't be greater than size */
+	if (bpf_dynptr_adjust(&ptr, size + 1, size + 1) != -ERANGE) {
+		err = 3;
+		goto done;
+	}
+
+	/* Check that end can't be greater than size */
+	if (bpf_dynptr_adjust(&ptr, 0, size + 1) != -ERANGE) {
+		err = 4;
+		goto done;
+	}
+
+	if (bpf_dynptr_adjust(&ptr, off, size)) {
+		err = 5;
+		goto done;
+	}
+
+	/* Check that you can't write more bytes than available into the dynptr
+	 * after you've adjusted it
+	 */
+	if (bpf_dynptr_write(&ptr, 0, &write_data, sizeof(write_data), 0) != -E2BIG) {
+		err = 6;
+		goto done;
+	}
+
+	/* Check that even after adjusting, submitting/discarding
+	 * a ringbuf dynptr works
+	 */
+	bpf_ringbuf_submit_dynptr(&ptr, 0);
+	return 0;
+
+done:
+	bpf_ringbuf_discard_dynptr(&ptr, 0);
+	return 0;
+}
+
+SEC("tp/syscalls/sys_enter_nanosleep")
+int test_zero_size_dynptr(void *ctx)
+{
+	char write_data = 'x', read_data;
+	struct bpf_dynptr ptr;
+	__u32 size = 64;
+
+	if (bpf_get_current_pid_tgid() >> 32 != pid)
+		return 0;
+
+	if (bpf_ringbuf_reserve_dynptr(&ringbuf, size, 0, &ptr)) {
+		err = 1;
+		goto done;
+	}
+
+	/* After this, the dynptr has a size of 0 */
+	if (bpf_dynptr_adjust(&ptr, size, size)) {
+		err = 2;
+		goto done;
+	}
+
+	/* Test that reading + writing non-zero bytes is not ok */
+	if (bpf_dynptr_read(&read_data, sizeof(read_data), &ptr, 0, 0) != -E2BIG) {
+		err = 3;
+		goto done;
+	}
+
+	if (bpf_dynptr_write(&ptr, 0, &write_data, sizeof(write_data), 0) != -E2BIG) {
+		err = 4;
+		goto done;
+	}
+
+	/* Test that reading + writing 0 bytes from a 0-size dynptr is ok */
+	if (bpf_dynptr_read(&read_data, 0, &ptr, 0, 0)) {
+		err = 5;
+		goto done;
+	}
+
+	if (bpf_dynptr_write(&ptr, 0, &write_data, 0, 0)) {
+		err = 6;
+		goto done;
+	}
+
+	err = 0;
+
+done:
+	bpf_ringbuf_discard_dynptr(&ptr, 0);
+	return 0;
+}
+
+SEC("tp/syscalls/sys_enter_nanosleep")
+int test_dynptr_is_null(void *ctx)
+{
+	struct bpf_dynptr ptr1;
+	struct bpf_dynptr ptr2;
+	__u64 size = 4;
+
+	if (bpf_get_current_pid_tgid() >> 32 != pid)
+		return 0;
+
+	/* Pass in invalid flags, get back an invalid dynptr */
+	if (bpf_ringbuf_reserve_dynptr(&ringbuf, size, 123, &ptr1) != -EINVAL) {
+		err = 1;
+		goto exit_early;
+	}
+
+	/* Test that the invalid dynptr is null */
+	if (!bpf_dynptr_is_null(&ptr1)) {
+		err = 2;
+		goto exit_early;
+	}
+
+	/* Get a valid dynptr */
+	if (bpf_ringbuf_reserve_dynptr(&ringbuf, size, 0, &ptr2)) {
+		err = 3;
+		goto exit;
+	}
+
+	/* Test that the valid dynptr is not null */
+	if (bpf_dynptr_is_null(&ptr2)) {
+		err = 4;
+		goto exit;
+	}
+
+exit:
+	bpf_ringbuf_discard_dynptr(&ptr2, 0);
+exit_early:
+	bpf_ringbuf_discard_dynptr(&ptr1, 0);
+	return 0;
+}
+
+SEC("cgroup_skb/egress")
+int test_dynptr_is_rdonly(struct __sk_buff *skb)
+{
+	struct bpf_dynptr ptr1;
+	struct bpf_dynptr ptr2;
+	struct bpf_dynptr ptr3;
+
+	/* Pass in invalid flags, get back an invalid dynptr */
+	if (bpf_dynptr_from_skb(skb, 123, &ptr1) != -EINVAL) {
+		err = 1;
+		return 0;
+	}
+
+	/* Test that an invalid dynptr is_rdonly returns false */
+	if (bpf_dynptr_is_rdonly(&ptr1)) {
+		err = 2;
+		return 0;
+	}
+
+	/* Get a read-only dynptr */
+	if (bpf_dynptr_from_skb(skb, 0, &ptr2)) {
+		err = 3;
+		return 0;
+	}
+
+	/* Test that the dynptr is read-only */
+	if (!bpf_dynptr_is_rdonly(&ptr2)) {
+		err = 4;
+		return 0;
+	}
+
+	/* Get a read-writeable dynptr */
+	if (bpf_ringbuf_reserve_dynptr(&ringbuf, 64, 0, &ptr3)) {
+		err = 5;
+		goto done;
+	}
+
+	/* Test that the dynptr is read-only */
+	if (bpf_dynptr_is_rdonly(&ptr3)) {
+		err = 6;
+		goto done;
+	}
+
+done:
+	bpf_ringbuf_discard_dynptr(&ptr3, 0);
+	return 0;
+}
+
+SEC("cgroup_skb/egress")
+int test_dynptr_clone(struct __sk_buff *skb)
+{
+	struct bpf_dynptr ptr1;
+	struct bpf_dynptr ptr2;
+	__u32 off = 2, size;
+
+	/* Get a dynptr */
+	if (bpf_dynptr_from_skb(skb, 0, &ptr1)) {
+		err = 1;
+		return 0;
+	}
+
+	if (bpf_dynptr_adjust(&ptr1, off, bpf_dynptr_size(&ptr1))) {
+		err = 2;
+		return 0;
+	}
+
+	/* Clone the dynptr */
+	if (bpf_dynptr_clone(&ptr1, &ptr2)) {
+		err = 3;
+		return 0;
+	}
+
+	size = bpf_dynptr_size(&ptr1);
+
+	/* Check that the clone has the same size and rd-only */
+	if (bpf_dynptr_size(&ptr2) != size) {
+		err = 4;
+		return 0;
+	}
+
+	if (bpf_dynptr_is_rdonly(&ptr2) != bpf_dynptr_is_rdonly(&ptr1)) {
+		err = 5;
+		return 0;
+	}
+
+	/* Advance and trim the original dynptr */
+	bpf_dynptr_adjust(&ptr1, 5, 5);
+
+	/* Check that only original dynptr was affected, and the clone wasn't */
+	if (bpf_dynptr_size(&ptr2) != size) {
+		err = 6;
+		return 0;
+	}
+
+	return 0;
+}

From b3e8701dd1fa25fc59cffa68240326efccff0336 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 4 Apr 2023 14:05:58 +0000
Subject: [PATCH 06/57] selftests/bpf: Add test case to assert precise scalar
 path pruning

Add a test case to check for precision marking of safe paths. Ensure
that the verifier will not prematurely prune scalars contributing to
registers needing precision.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 .../testing/selftests/bpf/verifier/precise.c  | 36 +++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/tools/testing/selftests/bpf/verifier/precise.c b/tools/testing/selftests/bpf/verifier/precise.c
index 6c03a7d805f9..8f0340eed696 100644
--- a/tools/testing/selftests/bpf/verifier/precise.c
+++ b/tools/testing/selftests/bpf/verifier/precise.c
@@ -217,3 +217,39 @@
 	.errstr = "invalid access to memory, mem_size=1 off=42 size=8",
 	.result = REJECT,
 },
+{
+	"precise: program doesn't prematurely prune branches",
+	.insns = {
+		BPF_ALU64_IMM(BPF_MOV, BPF_REG_6, 0x400),
+		BPF_ALU64_IMM(BPF_MOV, BPF_REG_7, 0),
+		BPF_ALU64_IMM(BPF_MOV, BPF_REG_8, 0),
+		BPF_ALU64_IMM(BPF_MOV, BPF_REG_9, 0x80000000),
+		BPF_ALU64_IMM(BPF_MOD, BPF_REG_6, 0x401),
+		BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+		BPF_JMP_REG(BPF_JLE, BPF_REG_6, BPF_REG_9, 2),
+		BPF_ALU64_IMM(BPF_MOD, BPF_REG_6, 1),
+		BPF_ALU64_IMM(BPF_MOV, BPF_REG_9, 0),
+		BPF_JMP_REG(BPF_JLE, BPF_REG_6, BPF_REG_9, 1),
+		BPF_ALU64_IMM(BPF_MOV, BPF_REG_6, 0),
+		BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0),
+		BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4),
+		BPF_LD_MAP_FD(BPF_REG_4, 0),
+		BPF_ALU64_REG(BPF_MOV, BPF_REG_1, BPF_REG_4),
+		BPF_ALU64_REG(BPF_MOV, BPF_REG_2, BPF_REG_10),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+		BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+		BPF_EXIT_INSN(),
+		BPF_ALU64_IMM(BPF_RSH, BPF_REG_6, 10),
+		BPF_ALU64_IMM(BPF_MUL, BPF_REG_6, 8192),
+		BPF_ALU64_REG(BPF_MOV, BPF_REG_1, BPF_REG_0),
+		BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_6),
+		BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0),
+		BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_3, 0),
+		BPF_EXIT_INSN(),
+	},
+	.fixup_map_array_48b = { 13 },
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.result = REJECT,
+	.errstr = "register with unbounded min value is not allowed",
+},

From af0335d2926e1c597c247956cd608b6e8c9d6463 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@isovalent.com>
Date: Sat, 22 Apr 2023 10:20:53 -0700
Subject: [PATCH 07/57] docs/bpf: Add table to describe LRU properties

Depending on the map type and flags for LRU, different properties are
global or percpu. Add a table to describe these.

Signed-off-by: Joe Stringer <joe@isovalent.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20230422172054.3355436-1-joe@isovalent.com
---
 Documentation/bpf/map_hash.rst | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/Documentation/bpf/map_hash.rst b/Documentation/bpf/map_hash.rst
index 8669426264c6..1314dfc5e7e1 100644
--- a/Documentation/bpf/map_hash.rst
+++ b/Documentation/bpf/map_hash.rst
@@ -29,7 +29,16 @@ will automatically evict the least recently used entries when the hash
 table reaches capacity. An LRU hash maintains an internal LRU list that
 is used to select elements for eviction. This internal LRU list is
 shared across CPUs but it is possible to request a per CPU LRU list with
-the ``BPF_F_NO_COMMON_LRU`` flag when calling ``bpf_map_create``.
+the ``BPF_F_NO_COMMON_LRU`` flag when calling ``bpf_map_create``.  The
+following table outlines the properties of LRU maps depending on the a
+map type and the flags used to create the map.
+
+======================== ========================= ================================
+Flag                     ``BPF_MAP_TYPE_LRU_HASH`` ``BPF_MAP_TYPE_LRU_PERCPU_HASH``
+======================== ========================= ================================
+**BPF_F_NO_COMMON_LRU**  Per-CPU LRU, global map   Per-CPU LRU, per-cpu map
+**!BPF_F_NO_COMMON_LRU** Global LRU, global map    Global LRU, per-cpu map
+======================== ========================= ================================
 
 Usage
 =====

From 1a986518b8a517637f70cd6d7d494bd0cbbf6145 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@isovalent.com>
Date: Sat, 22 Apr 2023 10:20:54 -0700
Subject: [PATCH 08/57] docs/bpf: Add LRU internals description and graph

Extend the bpf hashmap docs to include a brief description of the
internals of the LRU map type (setting appropriate API expectations),
including the original commit message from Martin and a variant on the
graph that I had presented during my Linux Plumbers Conference 2022
talk on "Pressure feedback for LRU map types"[0].

The node names in the dot file correspond roughly to the functions
where the logic for those decisions or steps is defined, to help
curious developers to cross-reference and update this logic if the
details of the LRU implementation ever differ from this description.

  [0] https://lpc.events/event/16/contributions/1368/

Signed-off-by: Joe Stringer <joe@isovalent.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Bagas Sanjaya <bagasdotme@gmail.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20230422172054.3355436-2-joe@isovalent.com
---
 Documentation/bpf/map_hash.rst            |  42 ++++++
 Documentation/bpf/map_lru_hash_update.dot | 172 ++++++++++++++++++++++
 2 files changed, 214 insertions(+)
 create mode 100644 Documentation/bpf/map_lru_hash_update.dot

diff --git a/Documentation/bpf/map_hash.rst b/Documentation/bpf/map_hash.rst
index 1314dfc5e7e1..d2343952f2cb 100644
--- a/Documentation/bpf/map_hash.rst
+++ b/Documentation/bpf/map_hash.rst
@@ -1,5 +1,6 @@
 .. SPDX-License-Identifier: GPL-2.0-only
 .. Copyright (C) 2022 Red Hat, Inc.
+.. Copyright (C) 2022-2023 Isovalent, Inc.
 
 ===============================================
 BPF_MAP_TYPE_HASH, with PERCPU and LRU Variants
@@ -215,3 +216,44 @@ Userspace walking the map elements from the map declared above:
                     cur_key = &next_key;
             }
     }
+
+Internals
+=========
+
+This section of the document is targeted at Linux developers and describes
+aspects of the map implementations that are not considered stable ABI. The
+following details are subject to change in future versions of the kernel.
+
+``BPF_MAP_TYPE_LRU_HASH`` and variants
+--------------------------------------
+
+Updating elements in LRU maps may trigger eviction behaviour when the capacity
+of the map is reached. There are various steps that the update algorithm
+attempts in order to enforce the LRU property which have increasing impacts on
+other CPUs involved in the following operation attempts:
+
+- Attempt to use CPU-local state to batch operations
+- Attempt to fetch free nodes from global lists
+- Attempt to pull any node from a global list and remove it from the hashmap
+- Attempt to pull any node from any CPU's list and remove it from the hashmap
+
+This algorithm is described visually in the following diagram. See the
+description in commit 3a08c2fd7634 ("bpf: LRU List") for a full explanation of
+the corresponding operations:
+
+.. kernel-figure::  map_lru_hash_update.dot
+   :alt:    Diagram outlining the LRU eviction steps taken during map update.
+
+   LRU hash eviction during map update for ``BPF_MAP_TYPE_LRU_HASH`` and
+   variants. See the dot file source for kernel function name code references.
+
+Map updates start from the oval in the top right "begin ``bpf_map_update()``"
+and progress through the graph towards the bottom where the result may be
+either a successful update or a failure with various error codes. The key in
+the top right provides indicators for which locks may be involved in specific
+operations. This is intended as a visual hint for reasoning about how map
+contention may impact update operations, though the map type and flags may
+impact the actual contention on those locks, based on the logic described in
+the table above. For instance, if the map is created with type
+``BPF_MAP_TYPE_LRU_PERCPU_HASH`` and flags ``BPF_F_NO_COMMON_LRU`` then all map
+properties would be per-cpu.
diff --git a/Documentation/bpf/map_lru_hash_update.dot b/Documentation/bpf/map_lru_hash_update.dot
new file mode 100644
index 000000000000..a0fee349d29c
--- /dev/null
+++ b/Documentation/bpf/map_lru_hash_update.dot
@@ -0,0 +1,172 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2022-2023 Isovalent, Inc.
+digraph {
+  node [colorscheme=accent4,style=filled] # Apply colorscheme to all nodes
+  graph [splines=ortho, nodesep=1]
+
+  subgraph cluster_key {
+    label = "Key\n(locks held during operation)";
+    rankdir = TB;
+
+    remote_lock [shape=rectangle,fillcolor=4,label="remote CPU LRU lock"]
+    hash_lock [shape=rectangle,fillcolor=3,label="hashtab lock"]
+    lru_lock [shape=rectangle,fillcolor=2,label="LRU lock"]
+    local_lock [shape=rectangle,fillcolor=1,label="local CPU LRU lock"]
+    no_lock [shape=rectangle,label="no locks held"]
+  }
+
+  begin [shape=oval,label="begin\nbpf_map_update()"]
+
+  // Nodes below with an 'fn_' prefix are roughly labeled by the C function
+  // names that initiate the corresponding logic in kernel/bpf/bpf_lru_list.c.
+  // Number suffixes and errno suffixes handle subsections of the corresponding
+  // logic in the function as of the writing of this dot.
+
+  // cf. __local_list_pop_free() / bpf_percpu_lru_pop_free()
+  local_freelist_check [shape=diamond,fillcolor=1,
+    label="Local freelist\nnode available?"];
+  use_local_node [shape=rectangle,
+    label="Use node owned\nby this CPU"]
+
+  // cf. bpf_lru_pop_free()
+  common_lru_check [shape=diamond,
+    label="Map created with\ncommon LRU?\n(!BPF_F_NO_COMMON_LRU)"];
+
+  fn_bpf_lru_list_pop_free_to_local [shape=rectangle,fillcolor=2,
+    label="Flush local pending,
+    Rotate Global list, move
+    LOCAL_FREE_TARGET
+    from global -> local"]
+  // Also corresponds to:
+  // fn__local_list_flush()
+  // fn_bpf_lru_list_rotate()
+  fn___bpf_lru_node_move_to_free[shape=diamond,fillcolor=2,
+    label="Able to free\nLOCAL_FREE_TARGET\nnodes?"]
+
+  fn___bpf_lru_list_shrink_inactive [shape=rectangle,fillcolor=3,
+    label="Shrink inactive list
+      up to remaining
+      LOCAL_FREE_TARGET
+      (global LRU -> local)"]
+  fn___bpf_lru_list_shrink [shape=diamond,fillcolor=2,
+    label="> 0 entries in\nlocal free list?"]
+  fn___bpf_lru_list_shrink2 [shape=rectangle,fillcolor=2,
+    label="Steal one node from
+      inactive, or if empty,
+      from active global list"]
+  fn___bpf_lru_list_shrink3 [shape=rectangle,fillcolor=3,
+    label="Try to remove\nnode from hashtab"]
+
+  local_freelist_check2 [shape=diamond,label="Htab removal\nsuccessful?"]
+  common_lru_check2 [shape=diamond,
+    label="Map created with\ncommon LRU?\n(!BPF_F_NO_COMMON_LRU)"];
+
+  subgraph cluster_remote_lock {
+    label = "Iterate through CPUs\n(start from current)";
+    style = dashed;
+    rankdir=LR;
+
+    local_freelist_check5 [shape=diamond,fillcolor=4,
+      label="Steal a node from\nper-cpu freelist?"]
+    local_freelist_check6 [shape=rectangle,fillcolor=4,
+      label="Steal a node from
+        (1) Unreferenced pending, or
+        (2) Any pending node"]
+    local_freelist_check7 [shape=rectangle,fillcolor=3,
+      label="Try to remove\nnode from hashtab"]
+    fn_htab_lru_map_update_elem [shape=diamond,
+      label="Stole node\nfrom remote\nCPU?"]
+    fn_htab_lru_map_update_elem2 [shape=diamond,label="Iterated\nall CPUs?"]
+    // Also corresponds to:
+    // use_local_node()
+    // fn__local_list_pop_pending()
+  }
+
+  fn_bpf_lru_list_pop_free_to_local2 [shape=rectangle,
+    label="Use node that was\nnot recently referenced"]
+  local_freelist_check4 [shape=rectangle,
+    label="Use node that was\nactively referenced\nin global list"]
+  fn_htab_lru_map_update_elem_ENOMEM [shape=oval,label="return -ENOMEM"]
+  fn_htab_lru_map_update_elem3 [shape=rectangle,
+    label="Use node that was\nactively referenced\nin (another?) CPU's cache"]
+  fn_htab_lru_map_update_elem4 [shape=rectangle,fillcolor=3,
+    label="Update hashmap\nwith new element"]
+  fn_htab_lru_map_update_elem5 [shape=oval,label="return 0"]
+  fn_htab_lru_map_update_elem_EBUSY [shape=oval,label="return -EBUSY"]
+  fn_htab_lru_map_update_elem_EEXIST [shape=oval,label="return -EEXIST"]
+  fn_htab_lru_map_update_elem_ENOENT [shape=oval,label="return -ENOENT"]
+
+  begin -> local_freelist_check
+  local_freelist_check -> use_local_node [xlabel="Y"]
+  local_freelist_check -> common_lru_check [xlabel="N"]
+  common_lru_check -> fn_bpf_lru_list_pop_free_to_local [xlabel="Y"]
+  common_lru_check -> fn___bpf_lru_list_shrink_inactive [xlabel="N"]
+  fn_bpf_lru_list_pop_free_to_local -> fn___bpf_lru_node_move_to_free
+  fn___bpf_lru_node_move_to_free ->
+    fn_bpf_lru_list_pop_free_to_local2 [xlabel="Y"]
+  fn___bpf_lru_node_move_to_free ->
+    fn___bpf_lru_list_shrink_inactive [xlabel="N"]
+  fn___bpf_lru_list_shrink_inactive -> fn___bpf_lru_list_shrink
+  fn___bpf_lru_list_shrink -> fn_bpf_lru_list_pop_free_to_local2 [xlabel = "Y"]
+  fn___bpf_lru_list_shrink -> fn___bpf_lru_list_shrink2 [xlabel="N"]
+  fn___bpf_lru_list_shrink2 -> fn___bpf_lru_list_shrink3
+  fn___bpf_lru_list_shrink3 -> local_freelist_check2
+  local_freelist_check2 -> local_freelist_check4 [xlabel = "Y"]
+  local_freelist_check2 -> common_lru_check2 [xlabel = "N"]
+  common_lru_check2 -> local_freelist_check5 [xlabel = "Y"]
+  common_lru_check2 -> fn_htab_lru_map_update_elem_ENOMEM [xlabel = "N"]
+  local_freelist_check5 -> fn_htab_lru_map_update_elem [xlabel = "Y"]
+  local_freelist_check5 -> local_freelist_check6 [xlabel = "N"]
+  local_freelist_check6 -> local_freelist_check7
+  local_freelist_check7 -> fn_htab_lru_map_update_elem
+
+  fn_htab_lru_map_update_elem -> fn_htab_lru_map_update_elem3 [xlabel = "Y"]
+  fn_htab_lru_map_update_elem -> fn_htab_lru_map_update_elem2  [xlabel = "N"]
+  fn_htab_lru_map_update_elem2 ->
+    fn_htab_lru_map_update_elem_ENOMEM [xlabel = "Y"]
+  fn_htab_lru_map_update_elem2 -> local_freelist_check5 [xlabel = "N"]
+  fn_htab_lru_map_update_elem3 -> fn_htab_lru_map_update_elem4
+
+  use_local_node -> fn_htab_lru_map_update_elem4
+  fn_bpf_lru_list_pop_free_to_local2 -> fn_htab_lru_map_update_elem4
+  local_freelist_check4 -> fn_htab_lru_map_update_elem4
+
+  fn_htab_lru_map_update_elem4 -> fn_htab_lru_map_update_elem5 [headlabel="Success"]
+  fn_htab_lru_map_update_elem4 ->
+    fn_htab_lru_map_update_elem_EBUSY [xlabel="Hashtab lock failed"]
+  fn_htab_lru_map_update_elem4 ->
+    fn_htab_lru_map_update_elem_EEXIST [xlabel="BPF_EXIST set and\nkey already exists"]
+  fn_htab_lru_map_update_elem4 ->
+    fn_htab_lru_map_update_elem_ENOENT [headlabel="BPF_NOEXIST set\nand no such entry"]
+
+  // Create invisible pad nodes to line up various nodes
+  pad0 [style=invis]
+  pad1 [style=invis]
+  pad2 [style=invis]
+  pad3 [style=invis]
+  pad4 [style=invis]
+
+  // Line up the key with the top of the graph
+  no_lock -> local_lock [style=invis]
+  local_lock -> lru_lock [style=invis]
+  lru_lock -> hash_lock [style=invis]
+  hash_lock -> remote_lock [style=invis]
+  remote_lock -> local_freelist_check5 [style=invis]
+  remote_lock -> fn___bpf_lru_list_shrink [style=invis]
+
+  // Line up return code nodes at the bottom of the graph
+  fn_htab_lru_map_update_elem -> pad0 [style=invis]
+  pad0 -> pad1 [style=invis]
+  pad1 -> pad2 [style=invis]
+  //pad2-> fn_htab_lru_map_update_elem_ENOMEM [style=invis]
+  fn_htab_lru_map_update_elem4 -> pad3 [style=invis]
+  pad3 -> fn_htab_lru_map_update_elem5  [style=invis]
+  pad3 -> fn_htab_lru_map_update_elem_EBUSY  [style=invis]
+  pad3 -> fn_htab_lru_map_update_elem_EEXIST  [style=invis]
+  pad3 -> fn_htab_lru_map_update_elem_ENOENT  [style=invis]
+
+  // Reduce diagram width by forcing some nodes to appear above others
+  local_freelist_check4 -> fn_htab_lru_map_update_elem3 [style=invis]
+  common_lru_check2 -> pad4 [style=invis]
+  pad4 -> local_freelist_check5 [style=invis]
+}

From 74fc8801edc279fb5f778eb91df48e4d3cd951a2 Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <thinker.li@gmail.com>
Date: Fri, 21 Apr 2023 14:41:31 -0700
Subject: [PATCH 09/57] bpftool: Show map IDs along with struct_ops links.

A new link type, BPF_LINK_TYPE_STRUCT_OPS, was added to attach
struct_ops to links. (226bc6ae6405) It would be helpful for users to
know which map is associated with the link.

The assumption was that every link is associated with a BPF program, but
this does not hold true for struct_ops. It would be better to display
map_id instead of prog_id for struct_ops links. However, some tools may
rely on the old assumption and need a prog_id.  The discussion on the
mailing list suggests that tools should parse JSON format. We will maintain
the existing JSON format by adding a map_id without removing prog_id. As
for plain text format, we will remove prog_id from the header line and add
a map_id for struct_ops links.

Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Quentin Monnet <quentin@isovalent.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20230421214131.352662-1-kuifeng@meta.com
---
 tools/bpf/bpftool/link.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c
index d98dbc50cf4c..243b74e18e51 100644
--- a/tools/bpf/bpftool/link.c
+++ b/tools/bpf/bpftool/link.c
@@ -212,7 +212,10 @@ static int show_link_close_json(int fd, struct bpf_link_info *info)
 	case BPF_LINK_TYPE_NETFILTER:
 		netfilter_dump_json(info, json_wtr);
 		break;
-
+	case BPF_LINK_TYPE_STRUCT_OPS:
+		jsonw_uint_field(json_wtr, "map_id",
+				 info->struct_ops.map_id);
+		break;
 	default:
 		break;
 	}
@@ -245,7 +248,10 @@ static void show_link_header_plain(struct bpf_link_info *info)
 	else
 		printf("type %u  ", info->type);
 
-	printf("prog %u  ", info->prog_id);
+	if (info->type == BPF_LINK_TYPE_STRUCT_OPS)
+		printf("map %u  ", info->struct_ops.map_id);
+	else
+		printf("prog %u  ", info->prog_id);
 }
 
 static void show_link_attach_type_plain(__u32 attach_type)

From 84214ab4689f962b4bfc47fc9a5838d25ac4274d Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 18 Apr 2023 15:30:42 +0200
Subject: [PATCH 10/57] igc: Enable and fix RX hash usage by netstack

When function igc_rx_hash() was introduced in v4.20 via commit 0507ef8a0372
("igc: Add transmit and receive fastpath and interrupt handlers"), the
hardware wasn't configured to provide RSS hash, thus it made sense to not
enable net_device NETIF_F_RXHASH feature bit.

The NIC hardware was configured to enable RSS hash info in v5.2 via commit
2121c2712f82 ("igc: Add multiple receive queues control supporting"), but
forgot to set the NETIF_F_RXHASH feature bit.

The original implementation of igc_rx_hash() didn't extract the associated
pkt_hash_type, but statically set PKT_HASH_TYPE_L3. The largest portions of
this patch are about extracting the RSS Type from the hardware and mapping
this to enum pkt_hash_types. This was based on Foxville i225 software user
manual rev-1.3.1 and tested on Intel Ethernet Controller I225-LM (rev 03).

For UDP it's worth noting that RSS (type) hashing have been disabled both for
IPv4 and IPv6 (see IGC_MRQC_RSS_FIELD_IPV4_UDP + IGC_MRQC_RSS_FIELD_IPV6_UDP)
because hardware RSS doesn't handle fragmented pkts well when enabled (can
cause out-of-order). This results in PKT_HASH_TYPE_L3 for UDP packets, and
hash value doesn't include UDP port numbers. Not being PKT_HASH_TYPE_L4, have
the effect that netstack will do a software based hash calc calling into
flow_dissect, but only when code calls skb_get_hash(), which doesn't
necessary happen for local delivery.

For QA verification testing I wrote a small bpftrace prog:
 [0] https://github.com/xdp-project/xdp-project/blob/master/areas/hints/monitor_skb_hash_on_dev.bt

Fixes: 2121c2712f82 ("igc: Add multiple receive queues control supporting")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Song Yoong Siang <yoong.siang.song@intel.com>
Link: https://lore.kernel.org/bpf/168182464270.616355.11391652654430626584.stgit@firesoul
---
 drivers/net/ethernet/intel/igc/igc.h      | 28 ++++++++++++++++++++
 drivers/net/ethernet/intel/igc/igc_main.c | 31 ++++++++++++++++++++---
 2 files changed, 55 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h
index 34aebf00a512..f7f9e217e7b4 100644
--- a/drivers/net/ethernet/intel/igc/igc.h
+++ b/drivers/net/ethernet/intel/igc/igc.h
@@ -13,6 +13,7 @@
 #include <linux/ptp_clock_kernel.h>
 #include <linux/timecounter.h>
 #include <linux/net_tstamp.h>
+#include <linux/bitfield.h>
 
 #include "igc_hw.h"
 
@@ -311,6 +312,33 @@ extern char igc_driver_name[];
 #define IGC_MRQC_RSS_FIELD_IPV4_UDP	0x00400000
 #define IGC_MRQC_RSS_FIELD_IPV6_UDP	0x00800000
 
+/* RX-desc Write-Back format RSS Type's */
+enum igc_rss_type_num {
+	IGC_RSS_TYPE_NO_HASH		= 0,
+	IGC_RSS_TYPE_HASH_TCP_IPV4	= 1,
+	IGC_RSS_TYPE_HASH_IPV4		= 2,
+	IGC_RSS_TYPE_HASH_TCP_IPV6	= 3,
+	IGC_RSS_TYPE_HASH_IPV6_EX	= 4,
+	IGC_RSS_TYPE_HASH_IPV6		= 5,
+	IGC_RSS_TYPE_HASH_TCP_IPV6_EX	= 6,
+	IGC_RSS_TYPE_HASH_UDP_IPV4	= 7,
+	IGC_RSS_TYPE_HASH_UDP_IPV6	= 8,
+	IGC_RSS_TYPE_HASH_UDP_IPV6_EX	= 9,
+	IGC_RSS_TYPE_MAX		= 10,
+};
+#define IGC_RSS_TYPE_MAX_TABLE		16
+#define IGC_RSS_TYPE_MASK		GENMASK(3,0) /* 4-bits (3:0) = mask 0x0F */
+
+/* igc_rss_type - Rx descriptor RSS type field */
+static inline u32 igc_rss_type(const union igc_adv_rx_desc *rx_desc)
+{
+	/* RSS Type 4-bits (3:0) number: 0-9 (above 9 is reserved)
+	 * Accessing the same bits via u16 (wb.lower.lo_dword.hs_rss.pkt_info)
+	 * is slightly slower than via u32 (wb.lower.lo_dword.data)
+	 */
+	return le32_get_bits(rx_desc->wb.lower.lo_dword.data, IGC_RSS_TYPE_MASK);
+}
+
 /* Interrupt defines */
 #define IGC_START_ITR			648 /* ~6000 ints/sec */
 #define IGC_4K_ITR			980
diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
index ba49728be919..f6b03e0372e0 100644
--- a/drivers/net/ethernet/intel/igc/igc_main.c
+++ b/drivers/net/ethernet/intel/igc/igc_main.c
@@ -1687,14 +1687,36 @@ static void igc_rx_checksum(struct igc_ring *ring,
 		   le32_to_cpu(rx_desc->wb.upper.status_error));
 }
 
+/* Mapping HW RSS Type to enum pkt_hash_types */
+static const enum pkt_hash_types igc_rss_type_table[IGC_RSS_TYPE_MAX_TABLE] = {
+	[IGC_RSS_TYPE_NO_HASH]		= PKT_HASH_TYPE_L2,
+	[IGC_RSS_TYPE_HASH_TCP_IPV4]	= PKT_HASH_TYPE_L4,
+	[IGC_RSS_TYPE_HASH_IPV4]	= PKT_HASH_TYPE_L3,
+	[IGC_RSS_TYPE_HASH_TCP_IPV6]	= PKT_HASH_TYPE_L4,
+	[IGC_RSS_TYPE_HASH_IPV6_EX]	= PKT_HASH_TYPE_L3,
+	[IGC_RSS_TYPE_HASH_IPV6]	= PKT_HASH_TYPE_L3,
+	[IGC_RSS_TYPE_HASH_TCP_IPV6_EX] = PKT_HASH_TYPE_L4,
+	[IGC_RSS_TYPE_HASH_UDP_IPV4]	= PKT_HASH_TYPE_L4,
+	[IGC_RSS_TYPE_HASH_UDP_IPV6]	= PKT_HASH_TYPE_L4,
+	[IGC_RSS_TYPE_HASH_UDP_IPV6_EX] = PKT_HASH_TYPE_L4,
+	[10] = PKT_HASH_TYPE_NONE, /* RSS Type above 9 "Reserved" by HW  */
+	[11] = PKT_HASH_TYPE_NONE, /* keep array sized for SW bit-mask   */
+	[12] = PKT_HASH_TYPE_NONE, /* to handle future HW revisons       */
+	[13] = PKT_HASH_TYPE_NONE,
+	[14] = PKT_HASH_TYPE_NONE,
+	[15] = PKT_HASH_TYPE_NONE,
+};
+
 static inline void igc_rx_hash(struct igc_ring *ring,
 			       union igc_adv_rx_desc *rx_desc,
 			       struct sk_buff *skb)
 {
-	if (ring->netdev->features & NETIF_F_RXHASH)
-		skb_set_hash(skb,
-			     le32_to_cpu(rx_desc->wb.lower.hi_dword.rss),
-			     PKT_HASH_TYPE_L3);
+	if (ring->netdev->features & NETIF_F_RXHASH) {
+		u32 rss_hash = le32_to_cpu(rx_desc->wb.lower.hi_dword.rss);
+		u32 rss_type = igc_rss_type(rx_desc);
+
+		skb_set_hash(skb, rss_hash, igc_rss_type_table[rss_type]);
+	}
 }
 
 static void igc_rx_vlan(struct igc_ring *rx_ring,
@@ -6551,6 +6573,7 @@ static int igc_probe(struct pci_dev *pdev,
 	netdev->features |= NETIF_F_TSO;
 	netdev->features |= NETIF_F_TSO6;
 	netdev->features |= NETIF_F_TSO_ECN;
+	netdev->features |= NETIF_F_RXHASH;
 	netdev->features |= NETIF_F_RXCSUM;
 	netdev->features |= NETIF_F_HW_CSUM;
 	netdev->features |= NETIF_F_SCTP_CRC;

From 73b7123de0cfa4f6609677e927ab02cb05b593c2 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 18 Apr 2023 15:30:47 +0200
Subject: [PATCH 11/57] igc: Add igc_xdp_buff wrapper for xdp_buff in driver

Driver specific metadata data for XDP-hints kfuncs are propagated via tail
extending the struct xdp_buff with a locally scoped driver struct.

Zero-Copy AF_XDP/XSK does similar tricks via struct xdp_buff_xsk. This
xdp_buff_xsk struct contains a CB area (24 bytes) that can be used for
extending the locally scoped driver into. The XSK_CHECK_PRIV_TYPE define
catch size violations build time.

The changes needed for AF_XDP zero-copy in igc_clean_rx_irq_zc()
is done in next patch, because the member rx_desc isn't available
at this point.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Song Yoong Siang <yoong.siang.song@intel.com>
Link: https://lore.kernel.org/bpf/168182464779.616355.3761989884165609387.stgit@firesoul
---
 drivers/net/ethernet/intel/igc/igc.h      |  5 +++++
 drivers/net/ethernet/intel/igc/igc_main.c | 16 +++++++++-------
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h
index f7f9e217e7b4..76a5115aefc8 100644
--- a/drivers/net/ethernet/intel/igc/igc.h
+++ b/drivers/net/ethernet/intel/igc/igc.h
@@ -499,6 +499,11 @@ struct igc_rx_buffer {
 	};
 };
 
+/* context wrapper around xdp_buff to provide access to descriptor metadata */
+struct igc_xdp_buff {
+	struct xdp_buff xdp;
+};
+
 struct igc_q_vector {
 	struct igc_adapter *adapter;    /* backlink */
 	void __iomem *itr_register;
diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
index f6b03e0372e0..06bf8a7eec93 100644
--- a/drivers/net/ethernet/intel/igc/igc_main.c
+++ b/drivers/net/ethernet/intel/igc/igc_main.c
@@ -2233,6 +2233,8 @@ static bool igc_alloc_rx_buffers_zc(struct igc_ring *ring, u16 count)
 	if (!count)
 		return ok;
 
+	XSK_CHECK_PRIV_TYPE(struct igc_xdp_buff);
+
 	desc = IGC_RX_DESC(ring, i);
 	bi = &ring->rx_buffer_info[i];
 	i -= ring->count;
@@ -2517,8 +2519,8 @@ static int igc_clean_rx_irq(struct igc_q_vector *q_vector, const int budget)
 		union igc_adv_rx_desc *rx_desc;
 		struct igc_rx_buffer *rx_buffer;
 		unsigned int size, truesize;
+		struct igc_xdp_buff ctx;
 		ktime_t timestamp = 0;
-		struct xdp_buff xdp;
 		int pkt_offset = 0;
 		void *pktbuf;
 
@@ -2552,13 +2554,13 @@ static int igc_clean_rx_irq(struct igc_q_vector *q_vector, const int budget)
 		}
 
 		if (!skb) {
-			xdp_init_buff(&xdp, truesize, &rx_ring->xdp_rxq);
-			xdp_prepare_buff(&xdp, pktbuf - igc_rx_offset(rx_ring),
+			xdp_init_buff(&ctx.xdp, truesize, &rx_ring->xdp_rxq);
+			xdp_prepare_buff(&ctx.xdp, pktbuf - igc_rx_offset(rx_ring),
 					 igc_rx_offset(rx_ring) + pkt_offset,
 					 size, true);
-			xdp_buff_clear_frags_flag(&xdp);
+			xdp_buff_clear_frags_flag(&ctx.xdp);
 
-			skb = igc_xdp_run_prog(adapter, &xdp);
+			skb = igc_xdp_run_prog(adapter, &ctx.xdp);
 		}
 
 		if (IS_ERR(skb)) {
@@ -2580,9 +2582,9 @@ static int igc_clean_rx_irq(struct igc_q_vector *q_vector, const int budget)
 		} else if (skb)
 			igc_add_rx_frag(rx_ring, rx_buffer, skb, size);
 		else if (ring_uses_build_skb(rx_ring))
-			skb = igc_build_skb(rx_ring, rx_buffer, &xdp);
+			skb = igc_build_skb(rx_ring, rx_buffer, &ctx.xdp);
 		else
-			skb = igc_construct_skb(rx_ring, rx_buffer, &xdp,
+			skb = igc_construct_skb(rx_ring, rx_buffer, &ctx.xdp,
 						timestamp);
 
 		/* exit if we failed to retrieve a buffer */

From 8416814fffa9cfa74c18da149f522dd9e1850987 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 18 Apr 2023 15:30:52 +0200
Subject: [PATCH 12/57] igc: Add XDP hints kfuncs for RX hash

This implements XDP hints kfunc for RX-hash (xmo_rx_hash).
The HW rss hash type is handled via mapping table.

This igc driver (default config) does L3 hashing for UDP packets
(excludes UDP src/dest ports in hash calc).  Meaning RSS hash type is
L3 based.  Tested that the igc_rss_type_num for UDP is either
IGC_RSS_TYPE_HASH_IPV4 or IGC_RSS_TYPE_HASH_IPV6.

This patch also updates AF_XDP zero-copy function igc_clean_rx_irq_zc()
to use the xdp_buff wrapper struct igc_xdp_buff.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Song Yoong Siang <yoong.siang.song@intel.com>
Link: https://lore.kernel.org/bpf/168182465285.616355.2701740913376314790.stgit@firesoul
---
 drivers/net/ethernet/intel/igc/igc.h      |  1 +
 drivers/net/ethernet/intel/igc/igc_main.c | 53 +++++++++++++++++++++++
 2 files changed, 54 insertions(+)

diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h
index 76a5115aefc8..c609a2e648f8 100644
--- a/drivers/net/ethernet/intel/igc/igc.h
+++ b/drivers/net/ethernet/intel/igc/igc.h
@@ -502,6 +502,7 @@ struct igc_rx_buffer {
 /* context wrapper around xdp_buff to provide access to descriptor metadata */
 struct igc_xdp_buff {
 	struct xdp_buff xdp;
+	union igc_adv_rx_desc *rx_desc;
 };
 
 struct igc_q_vector {
diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
index 06bf8a7eec93..c18486f46085 100644
--- a/drivers/net/ethernet/intel/igc/igc_main.c
+++ b/drivers/net/ethernet/intel/igc/igc_main.c
@@ -2559,6 +2559,7 @@ static int igc_clean_rx_irq(struct igc_q_vector *q_vector, const int budget)
 					 igc_rx_offset(rx_ring) + pkt_offset,
 					 size, true);
 			xdp_buff_clear_frags_flag(&ctx.xdp);
+			ctx.rx_desc = rx_desc;
 
 			skb = igc_xdp_run_prog(adapter, &ctx.xdp);
 		}
@@ -2685,6 +2686,15 @@ static void igc_dispatch_skb_zc(struct igc_q_vector *q_vector,
 	napi_gro_receive(&q_vector->napi, skb);
 }
 
+static struct igc_xdp_buff *xsk_buff_to_igc_ctx(struct xdp_buff *xdp)
+{
+	/* xdp_buff pointer used by ZC code path is alloc as xdp_buff_xsk. The
+	 * igc_xdp_buff shares its layout with xdp_buff_xsk and private
+	 * igc_xdp_buff fields fall into xdp_buff_xsk->cb
+	 */
+       return (struct igc_xdp_buff *)xdp;
+}
+
 static int igc_clean_rx_irq_zc(struct igc_q_vector *q_vector, const int budget)
 {
 	struct igc_adapter *adapter = q_vector->adapter;
@@ -2703,6 +2713,7 @@ static int igc_clean_rx_irq_zc(struct igc_q_vector *q_vector, const int budget)
 	while (likely(total_packets < budget)) {
 		union igc_adv_rx_desc *desc;
 		struct igc_rx_buffer *bi;
+		struct igc_xdp_buff *ctx;
 		ktime_t timestamp = 0;
 		unsigned int size;
 		int res;
@@ -2720,6 +2731,9 @@ static int igc_clean_rx_irq_zc(struct igc_q_vector *q_vector, const int budget)
 
 		bi = &ring->rx_buffer_info[ntc];
 
+		ctx = xsk_buff_to_igc_ctx(bi->xdp);
+		ctx->rx_desc = desc;
+
 		if (igc_test_staterr(desc, IGC_RXDADV_STAT_TSIP)) {
 			timestamp = igc_ptp_rx_pktstamp(q_vector->adapter,
 							bi->xdp->data);
@@ -6475,6 +6489,44 @@ u32 igc_rd32(struct igc_hw *hw, u32 reg)
 	return value;
 }
 
+/* Mapping HW RSS Type to enum xdp_rss_hash_type */
+static enum xdp_rss_hash_type igc_xdp_rss_type[IGC_RSS_TYPE_MAX_TABLE] = {
+	[IGC_RSS_TYPE_NO_HASH]		= XDP_RSS_TYPE_L2,
+	[IGC_RSS_TYPE_HASH_TCP_IPV4]	= XDP_RSS_TYPE_L4_IPV4_TCP,
+	[IGC_RSS_TYPE_HASH_IPV4]	= XDP_RSS_TYPE_L3_IPV4,
+	[IGC_RSS_TYPE_HASH_TCP_IPV6]	= XDP_RSS_TYPE_L4_IPV6_TCP,
+	[IGC_RSS_TYPE_HASH_IPV6_EX]	= XDP_RSS_TYPE_L3_IPV6_EX,
+	[IGC_RSS_TYPE_HASH_IPV6]	= XDP_RSS_TYPE_L3_IPV6,
+	[IGC_RSS_TYPE_HASH_TCP_IPV6_EX] = XDP_RSS_TYPE_L4_IPV6_TCP_EX,
+	[IGC_RSS_TYPE_HASH_UDP_IPV4]	= XDP_RSS_TYPE_L4_IPV4_UDP,
+	[IGC_RSS_TYPE_HASH_UDP_IPV6]	= XDP_RSS_TYPE_L4_IPV6_UDP,
+	[IGC_RSS_TYPE_HASH_UDP_IPV6_EX] = XDP_RSS_TYPE_L4_IPV6_UDP_EX,
+	[10] = XDP_RSS_TYPE_NONE, /* RSS Type above 9 "Reserved" by HW  */
+	[11] = XDP_RSS_TYPE_NONE, /* keep array sized for SW bit-mask   */
+	[12] = XDP_RSS_TYPE_NONE, /* to handle future HW revisons       */
+	[13] = XDP_RSS_TYPE_NONE,
+	[14] = XDP_RSS_TYPE_NONE,
+	[15] = XDP_RSS_TYPE_NONE,
+};
+
+static int igc_xdp_rx_hash(const struct xdp_md *_ctx, u32 *hash,
+			   enum xdp_rss_hash_type *rss_type)
+{
+	const struct igc_xdp_buff *ctx = (void *)_ctx;
+
+	if (!(ctx->xdp.rxq->dev->features & NETIF_F_RXHASH))
+		return -ENODATA;
+
+	*hash = le32_to_cpu(ctx->rx_desc->wb.lower.hi_dword.rss);
+	*rss_type = igc_xdp_rss_type[igc_rss_type(ctx->rx_desc)];
+
+	return 0;
+}
+
+static const struct xdp_metadata_ops igc_xdp_metadata_ops = {
+	.xmo_rx_hash			= igc_xdp_rx_hash,
+};
+
 /**
  * igc_probe - Device Initialization Routine
  * @pdev: PCI device information struct
@@ -6548,6 +6600,7 @@ static int igc_probe(struct pci_dev *pdev,
 	hw->hw_addr = adapter->io_addr;
 
 	netdev->netdev_ops = &igc_netdev_ops;
+	netdev->xdp_metadata_ops = &igc_xdp_metadata_ops;
 	igc_ethtool_set_ops(netdev);
 	netdev->watchdog_timeo = 5 * HZ;
 

From d677266755c6e55c43b6755673a1eeae3d452e87 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 18 Apr 2023 15:30:57 +0200
Subject: [PATCH 13/57] igc: Add XDP hints kfuncs for RX timestamp

The NIC hardware RX timestamping mechanism adds an optional tailored
header before the MAC header containing packet reception time. Optional
depending on RX descriptor TSIP status bit (IGC_RXDADV_STAT_TSIP). In
case this bit is set driver does offset adjustments to packet data start
and extracts the timestamp.

The timestamp need to be extracted before invoking the XDP bpf_prog,
because this area just before the packet is also accessible by XDP via
data_meta context pointer (and helper bpf_xdp_adjust_meta). Thus, an XDP
bpf_prog can potentially overwrite this and corrupt data that we want to
extract with the new kfunc for reading the timestamp.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Song Yoong Siang <yoong.siang.song@intel.com>
Link: https://lore.kernel.org/bpf/168182465791.616355.2583922957423587914.stgit@firesoul
---
 drivers/net/ethernet/intel/igc/igc.h      |  1 +
 drivers/net/ethernet/intel/igc/igc_main.c | 16 ++++++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h
index c609a2e648f8..18d4af934d8c 100644
--- a/drivers/net/ethernet/intel/igc/igc.h
+++ b/drivers/net/ethernet/intel/igc/igc.h
@@ -503,6 +503,7 @@ struct igc_rx_buffer {
 struct igc_xdp_buff {
 	struct xdp_buff xdp;
 	union igc_adv_rx_desc *rx_desc;
+	ktime_t rx_ts; /* data indication bit IGC_RXDADV_STAT_TSIP */
 };
 
 struct igc_q_vector {
diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
index c18486f46085..b78e0e6562c8 100644
--- a/drivers/net/ethernet/intel/igc/igc_main.c
+++ b/drivers/net/ethernet/intel/igc/igc_main.c
@@ -2549,6 +2549,7 @@ static int igc_clean_rx_irq(struct igc_q_vector *q_vector, const int budget)
 		if (igc_test_staterr(rx_desc, IGC_RXDADV_STAT_TSIP)) {
 			timestamp = igc_ptp_rx_pktstamp(q_vector->adapter,
 							pktbuf);
+			ctx.rx_ts = timestamp;
 			pkt_offset = IGC_TS_HDR_LEN;
 			size -= IGC_TS_HDR_LEN;
 		}
@@ -2737,6 +2738,7 @@ static int igc_clean_rx_irq_zc(struct igc_q_vector *q_vector, const int budget)
 		if (igc_test_staterr(desc, IGC_RXDADV_STAT_TSIP)) {
 			timestamp = igc_ptp_rx_pktstamp(q_vector->adapter,
 							bi->xdp->data);
+			ctx->rx_ts = timestamp;
 
 			bi->xdp->data += IGC_TS_HDR_LEN;
 
@@ -6523,8 +6525,22 @@ static int igc_xdp_rx_hash(const struct xdp_md *_ctx, u32 *hash,
 	return 0;
 }
 
+static int igc_xdp_rx_timestamp(const struct xdp_md *_ctx, u64 *timestamp)
+{
+	const struct igc_xdp_buff *ctx = (void *)_ctx;
+
+	if (igc_test_staterr(ctx->rx_desc, IGC_RXDADV_STAT_TSIP)) {
+		*timestamp = ctx->rx_ts;
+
+		return 0;
+	}
+
+	return -ENODATA;
+}
+
 static const struct xdp_metadata_ops igc_xdp_metadata_ops = {
 	.xmo_rx_hash			= igc_xdp_rx_hash,
+	.xmo_rx_timestamp		= igc_xdp_rx_timestamp,
 };
 
 /**

From bb323478767d4b4d78380bc9e2c0971ca33442af Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 18 Apr 2023 15:31:03 +0200
Subject: [PATCH 14/57] selftests/bpf: xdp_hw_metadata track more timestamps

To correlate the hardware RX timestamp with something, add tracking of
two software timestamps both clock source CLOCK_TAI (see description in
man clock_gettime(2)).

XDP metadata is extended with xdp_timestamp for capturing when XDP
received the packet. Populated with BPF helper bpf_ktime_get_tai_ns(). I
could not find a BPF helper for getting CLOCK_REALTIME, which would have
been preferred. In userspace when AF_XDP sees the packet another
software timestamp is recorded via clock_gettime() also clock source
CLOCK_TAI.

Example output shortly after loading igc driver:

  poll: 1 (0) skip=1 fail=0 redir=2
  xsk_ring_cons__peek: 1
  0x12557a8: rx_desc[1]->addr=100000000009000 addr=9100 comp_addr=9000
  rx_hash: 0x82A96531 with RSS type:0x1
  rx_timestamp:  1681740540304898909 (sec:1681740540.3049)
  XDP RX-time:   1681740577304958316 (sec:1681740577.3050) delta sec:37.0001 (37000059.407 usec)
  AF_XDP time:   1681740577305051315 (sec:1681740577.3051) delta sec:0.0001 (92.999 usec)
  0x12557a8: complete idx=9 addr=9000

The first observation is that the 37 sec difference between RX HW vs XDP
timestamps, which indicate hardware is likely clock source
CLOCK_REALTIME, because (as of this writing) CLOCK_TAI is initialised
with a 37 sec offset.

The 93 usec (microsec) difference between XDP vs AF_XDP userspace is the
userspace wakeup time. On this hardware it was caused by CPU idle sleep
states, which can be reduced by tuning /dev/cpu_dma_latency.

View current requested/allowed latency bound via:
  hexdump --format '"%d\n"' /dev/cpu_dma_latency

More explanation of the output and how this can be used to identify
clock drift for the HW clock can be seen here[1]:

[1] https://github.com/xdp-project/xdp-project/blob/master/areas/hints/xdp_hints_kfuncs02_driver_igc.org

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Stanislav Fomichev <sdf@google.com>
Acked-by: Song Yoong Siang <yoong.siang.song@intel.com>
Link: https://lore.kernel.org/bpf/168182466298.616355.2544377890818617459.stgit@firesoul
---
 .../selftests/bpf/progs/xdp_hw_metadata.c     |  4 +-
 tools/testing/selftests/bpf/xdp_hw_metadata.c | 47 +++++++++++++++++--
 tools/testing/selftests/bpf/xdp_metadata.h    |  1 +
 3 files changed, 46 insertions(+), 6 deletions(-)

diff --git a/tools/testing/selftests/bpf/progs/xdp_hw_metadata.c b/tools/testing/selftests/bpf/progs/xdp_hw_metadata.c
index e1c787815e44..b2dfd7066c6e 100644
--- a/tools/testing/selftests/bpf/progs/xdp_hw_metadata.c
+++ b/tools/testing/selftests/bpf/progs/xdp_hw_metadata.c
@@ -77,7 +77,9 @@ int rx(struct xdp_md *ctx)
 	}
 
 	err = bpf_xdp_metadata_rx_timestamp(ctx, &meta->rx_timestamp);
-	if (err)
+	if (!err)
+		meta->xdp_timestamp = bpf_ktime_get_tai_ns();
+	else
 		meta->rx_timestamp = 0; /* Used by AF_XDP as not avail signal */
 
 	err = bpf_xdp_metadata_rx_hash(ctx, &meta->rx_hash, &meta->rx_hash_type);
diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c
index 987cf0db5ebc..613321eb84c1 100644
--- a/tools/testing/selftests/bpf/xdp_hw_metadata.c
+++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c
@@ -27,6 +27,7 @@
 #include <sys/mman.h>
 #include <net/if.h>
 #include <poll.h>
+#include <time.h>
 
 #include "xdp_metadata.h"
 
@@ -134,18 +135,52 @@ static void refill_rx(struct xsk *xsk, __u64 addr)
 	}
 }
 
-static void verify_xdp_metadata(void *data)
+#define NANOSEC_PER_SEC 1000000000 /* 10^9 */
+static __u64 gettime(clockid_t clock_id)
+{
+	struct timespec t;
+	int res;
+
+	/* See man clock_gettime(2) for type of clock_id's */
+	res = clock_gettime(clock_id, &t);
+
+	if (res < 0)
+		error(res, errno, "Error with clock_gettime()");
+
+	return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec;
+}
+
+static void verify_xdp_metadata(void *data, clockid_t clock_id)
 {
 	struct xdp_meta *meta;
 
 	meta = data - sizeof(*meta);
 
-	printf("rx_timestamp: %llu\n", meta->rx_timestamp);
 	if (meta->rx_hash_err < 0)
 		printf("No rx_hash err=%d\n", meta->rx_hash_err);
 	else
 		printf("rx_hash: 0x%X with RSS type:0x%X\n",
 		       meta->rx_hash, meta->rx_hash_type);
+
+	printf("rx_timestamp:  %llu (sec:%0.4f)\n", meta->rx_timestamp,
+	       (double)meta->rx_timestamp / NANOSEC_PER_SEC);
+	if (meta->rx_timestamp) {
+		__u64 usr_clock = gettime(clock_id);
+		__u64 xdp_clock = meta->xdp_timestamp;
+		__s64 delta_X = xdp_clock - meta->rx_timestamp;
+		__s64 delta_X2U = usr_clock - xdp_clock;
+
+		printf("XDP RX-time:   %llu (sec:%0.4f) delta sec:%0.4f (%0.3f usec)\n",
+		       xdp_clock, (double)xdp_clock / NANOSEC_PER_SEC,
+		       (double)delta_X / NANOSEC_PER_SEC,
+		       (double)delta_X / 1000);
+
+		printf("AF_XDP time:   %llu (sec:%0.4f) delta sec:%0.4f (%0.3f usec)\n",
+		       usr_clock, (double)usr_clock / NANOSEC_PER_SEC,
+		       (double)delta_X2U / NANOSEC_PER_SEC,
+		       (double)delta_X2U / 1000);
+	}
+
 }
 
 static void verify_skb_metadata(int fd)
@@ -193,7 +228,7 @@ static void verify_skb_metadata(int fd)
 	printf("skb hwtstamp is not found!\n");
 }
 
-static int verify_metadata(struct xsk *rx_xsk, int rxq, int server_fd)
+static int verify_metadata(struct xsk *rx_xsk, int rxq, int server_fd, clockid_t clock_id)
 {
 	const struct xdp_desc *rx_desc;
 	struct pollfd fds[rxq + 1];
@@ -243,7 +278,8 @@ static int verify_metadata(struct xsk *rx_xsk, int rxq, int server_fd)
 			addr = xsk_umem__add_offset_to_addr(rx_desc->addr);
 			printf("%p: rx_desc[%u]->addr=%llx addr=%llx comp_addr=%llx\n",
 			       xsk, idx, rx_desc->addr, addr, comp_addr);
-			verify_xdp_metadata(xsk_umem__get_data(xsk->umem_area, addr));
+			verify_xdp_metadata(xsk_umem__get_data(xsk->umem_area, addr),
+					    clock_id);
 			xsk_ring_cons__release(&xsk->rx, 1);
 			refill_rx(xsk, comp_addr);
 		}
@@ -370,6 +406,7 @@ static void timestamping_enable(int fd, int val)
 
 int main(int argc, char *argv[])
 {
+	clockid_t clock_id = CLOCK_TAI;
 	int server_fd = -1;
 	int ret;
 	int i;
@@ -443,7 +480,7 @@ int main(int argc, char *argv[])
 		error(1, -ret, "bpf_xdp_attach");
 
 	signal(SIGINT, handle_signal);
-	ret = verify_metadata(rx_xsk, rxq, server_fd);
+	ret = verify_metadata(rx_xsk, rxq, server_fd, clock_id);
 	close(server_fd);
 	cleanup();
 	if (ret)
diff --git a/tools/testing/selftests/bpf/xdp_metadata.h b/tools/testing/selftests/bpf/xdp_metadata.h
index 0c4624dc6f2f..938a729bd307 100644
--- a/tools/testing/selftests/bpf/xdp_metadata.h
+++ b/tools/testing/selftests/bpf/xdp_metadata.h
@@ -11,6 +11,7 @@
 
 struct xdp_meta {
 	__u64 rx_timestamp;
+	__u64 xdp_timestamp;
 	__u32 rx_hash;
 	union {
 		__u32 rx_hash_type;

From a4644119208444f84d9ffcd4d0af630046c2de98 Mon Sep 17 00:00:00 2001
From: Florent Revest <revest@chromium.org>
Date: Thu, 27 Apr 2023 16:32:07 +0200
Subject: [PATCH 15/57] selftests/bpf: Update the aarch64 tests deny list

Now that ftrace supports direct call on arm64, BPF tracing programs work
on that architecture. This fixes the vast majority of BPF selftests
except for:

- multi_kprobe programs which require fprobe, not available on arm64 yet
- tracing_struct which requires trampoline support to access struct args

This patch updates the list of BPF selftests which are known to fail so
the BPF CI can validate the tests which pass now.

Signed-off-by: Florent Revest <revest@chromium.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/bpf/20230427143207.635263-1-revest@chromium.org
---
 tools/testing/selftests/bpf/DENYLIST.aarch64 | 81 +-------------------
 1 file changed, 4 insertions(+), 77 deletions(-)

diff --git a/tools/testing/selftests/bpf/DENYLIST.aarch64 b/tools/testing/selftests/bpf/DENYLIST.aarch64
index 0a6837f97c32..4b6b18424140 100644
--- a/tools/testing/selftests/bpf/DENYLIST.aarch64
+++ b/tools/testing/selftests/bpf/DENYLIST.aarch64
@@ -1,33 +1,5 @@
-bloom_filter_map                                 # libbpf: prog 'check_bloom': failed to attach: ERROR: strerror_r(-524)=22
-bpf_cookie/lsm
-bpf_cookie/multi_kprobe_attach_api
-bpf_cookie/multi_kprobe_link_api
-bpf_cookie/trampoline
-bpf_loop/check_callback_fn_stop                  # link unexpected error: -524
-bpf_loop/check_invalid_flags
-bpf_loop/check_nested_calls
-bpf_loop/check_non_constant_callback
-bpf_loop/check_nr_loops
-bpf_loop/check_null_callback_ctx
-bpf_loop/check_stack
-bpf_mod_race                                     # bpf_mod_kfunc_race__attach unexpected error: -524 (errno 524)
-bpf_tcp_ca/dctcp_fallback
-btf_dump/btf_dump: var_data                      # find type id unexpected find type id: actual -2 < expected 0
-cgroup_hierarchical_stats                        # attach unexpected error: -524 (errno 524)
-d_path/basic                                     # setup attach failed: -524
-deny_namespace                                   # attach unexpected error: -524 (errno 524)
-fentry_fexit                                     # fentry_attach unexpected error: -1 (errno 524)
-fentry_test                                      # fentry_attach unexpected error: -1 (errno 524)
-fexit_sleep                                      # fexit_attach fexit attach failed: -1
-fexit_stress                                     # fexit attach unexpected fexit attach: actual -524 < expected 0
-fexit_test                                       # fexit_attach unexpected error: -1 (errno 524)
-get_func_args_test                               # get_func_args_test__attach unexpected error: -524 (errno 524) (trampoline)
-get_func_ip_test                                 # get_func_ip_test__attach unexpected error: -524 (errno 524) (trampoline)
-htab_update/reenter_update
-kfree_skb                                        # attach fentry unexpected error: -524 (trampoline)
-kfunc_call/subprog                               # extern (var ksym) 'bpf_prog_active': not found in kernel BTF
-kfunc_call/subprog_lskel                         # skel unexpected error: -2
-kfunc_dynptr_param/dynptr_data_null              # libbpf: prog 'dynptr_data_null': failed to attach: ERROR: strerror_r(-524)=22
+bpf_cookie/multi_kprobe_attach_api               # kprobe_multi_link_api_subtest:FAIL:fentry_raw_skel_load unexpected error: -3
+bpf_cookie/multi_kprobe_link_api                 # kprobe_multi_link_api_subtest:FAIL:fentry_raw_skel_load unexpected error: -3
 kprobe_multi_bench_attach                        # bpf_program__attach_kprobe_multi_opts unexpected error: -95
 kprobe_multi_test/attach_api_addrs               # bpf_program__attach_kprobe_multi_opts unexpected error: -95
 kprobe_multi_test/attach_api_pattern             # bpf_program__attach_kprobe_multi_opts unexpected error: -95
@@ -35,51 +7,6 @@ kprobe_multi_test/attach_api_syms                # bpf_program__attach_kprobe_mu
 kprobe_multi_test/bench_attach                   # bpf_program__attach_kprobe_multi_opts unexpected error: -95
 kprobe_multi_test/link_api_addrs                 # link_fd unexpected link_fd: actual -95 < expected 0
 kprobe_multi_test/link_api_syms                  # link_fd unexpected link_fd: actual -95 < expected 0
-kprobe_multi_test/skel_api                       # kprobe_multi__attach unexpected error: -524 (errno 524)
-ksyms_module/libbpf                              # 'bpf_testmod_ksym_percpu': not found in kernel BTF
-ksyms_module/lskel                               # test_ksyms_module_lskel__open_and_load unexpected error: -2
-libbpf_get_fd_by_id_opts                         # test_libbpf_get_fd_by_id_opts__attach unexpected error: -524 (errno 524)
-linked_list
-lookup_key                                       # test_lookup_key__attach unexpected error: -524 (errno 524)
-lru_bug                                          # lru_bug__attach unexpected error: -524 (errno 524)
-modify_return                                    # modify_return__attach failed unexpected error: -524 (errno 524)
-module_attach                                    # skel_attach skeleton attach failed: -524
-module_fentry_shadow                             # bpf_link_create unexpected bpf_link_create: actual -524 < expected 0
-mptcp/base                                       # run_test mptcp unexpected error: -524 (errno 524)
-netcnt                                           # packets unexpected packets: actual 10001 != expected 10000
-rcu_read_lock                                    # failed to attach: ERROR: strerror_r(-524)=22
-recursion                                        # skel_attach unexpected error: -524 (errno 524)
-ringbuf                                          # skel_attach skeleton attachment failed: -1
-setget_sockopt                                   # attach_cgroup unexpected error: -524
-sk_storage_tracing                               # test_sk_storage_tracing__attach unexpected error: -524 (errno 524)
-skc_to_unix_sock                                 # could not attach BPF object unexpected error: -524 (errno 524)
-socket_cookie                                    # prog_attach unexpected error: -524
-stacktrace_build_id                              # compare_stack_ips stackmap vs. stack_amap err -1 errno 2
-task_local_storage/exit_creds                    # skel_attach unexpected error: -524 (errno 524)
-task_local_storage/recursion                     # skel_attach unexpected error: -524 (errno 524)
-test_bprm_opts                                   # attach attach failed: -524
-test_ima                                         # attach attach failed: -524
-test_local_storage                               # attach lsm attach failed: -524
-test_lsm                                         # test_lsm_first_attach unexpected error: -524 (errno 524)
-test_overhead                                    # attach_fentry unexpected error: -524
-timer                                            # timer unexpected error: -524 (errno 524)
-timer_crash                                      # timer_crash__attach unexpected error: -524 (errno 524)
-timer_mim                                        # timer_mim unexpected error: -524 (errno 524)
-trace_printk                                     # trace_printk__attach unexpected error: -1 (errno 524)
-trace_vprintk                                    # trace_vprintk__attach unexpected error: -1 (errno 524)
+kprobe_multi_test/skel_api                       # libbpf: failed to load BPF skeleton 'kprobe_multi': -3
+module_attach                                    # prog 'kprobe_multi': failed to auto-attach: -95
 tracing_struct                                   # tracing_struct__attach unexpected error: -524 (errno 524)
-trampoline_count                                 # attach_prog unexpected error: -524
-unpriv_bpf_disabled                              # skel_attach unexpected error: -524 (errno 524)
-user_ringbuf/test_user_ringbuf_post_misaligned   # misaligned_skel unexpected error: -524 (errno 524)
-user_ringbuf/test_user_ringbuf_post_producer_wrong_offset
-user_ringbuf/test_user_ringbuf_post_larger_than_ringbuf_sz
-user_ringbuf/test_user_ringbuf_basic             # ringbuf_basic_skel unexpected error: -524 (errno 524)
-user_ringbuf/test_user_ringbuf_sample_full_ring_buffer
-user_ringbuf/test_user_ringbuf_post_alignment_autoadjust
-user_ringbuf/test_user_ringbuf_overfill
-user_ringbuf/test_user_ringbuf_discards_properly_ignored
-user_ringbuf/test_user_ringbuf_loop
-user_ringbuf/test_user_ringbuf_msg_protocol
-user_ringbuf/test_user_ringbuf_blocking_reserve
-verify_pkcs7_sig                                 # test_verify_pkcs7_sig__attach unexpected error: -524 (errno 524)
-vmlinux                                          # skel_attach skeleton attach failed: -524

From 6ec7be9a2d2b09815f69fc2183ec31857eaa6e27 Mon Sep 17 00:00:00 2001
From: Kal Conley <kal.conley@dectris.com>
Date: Sun, 23 Apr 2023 20:01:56 +0200
Subject: [PATCH 16/57] xsk: Use pool->dma_pages to check for DMA

Compare pool->dma_pages instead of pool->dma_pages_cnt to check for an
active DMA mapping. pool->dma_pages needs to be read anyway to access
the map so this compiles to more efficient code.

Signed-off-by: Kal Conley <kal.conley@dectris.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
Link: https://lore.kernel.org/bpf/20230423180157.93559-1-kal.conley@dectris.com
---
 include/net/xsk_buff_pool.h | 2 +-
 net/xdp/xsk_buff_pool.c     | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index d318c769b445..a8d7b8a3688a 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -180,7 +180,7 @@ static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool,
 	if (likely(!cross_pg))
 		return false;
 
-	return pool->dma_pages_cnt &&
+	return pool->dma_pages &&
 	       !(pool->dma_pages[addr >> PAGE_SHIFT] & XSK_NEXT_PG_CONTIG_MASK);
 }
 
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index b2df1e0f8153..26f6d304451e 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -350,7 +350,7 @@ void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs)
 {
 	struct xsk_dma_map *dma_map;
 
-	if (pool->dma_pages_cnt == 0)
+	if (!pool->dma_pages)
 		return;
 
 	dma_map = xp_find_dma_map(pool);
@@ -364,6 +364,7 @@ void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs)
 
 	__xp_dma_unmap(dma_map, attrs);
 	kvfree(pool->dma_pages);
+	pool->dma_pages = NULL;
 	pool->dma_pages_cnt = 0;
 	pool->dev = NULL;
 }
@@ -503,7 +504,7 @@ static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool)
 	if (pool->unaligned) {
 		xskb = pool->free_heads[--pool->free_heads_cnt];
 		xp_init_xskb_addr(xskb, pool, addr);
-		if (pool->dma_pages_cnt)
+		if (pool->dma_pages)
 			xp_init_xskb_dma(xskb, pool, pool->dma_pages, addr);
 	} else {
 		xskb = &pool->heads[xp_aligned_extract_idx(pool, addr)];
@@ -569,7 +570,7 @@ static u32 xp_alloc_new_from_fq(struct xsk_buff_pool *pool, struct xdp_buff **xd
 		if (pool->unaligned) {
 			xskb = pool->free_heads[--pool->free_heads_cnt];
 			xp_init_xskb_addr(xskb, pool, addr);
-			if (pool->dma_pages_cnt)
+			if (pool->dma_pages)
 				xp_init_xskb_dma(xskb, pool, pool->dma_pages, addr);
 		} else {
 			xskb = &pool->heads[xp_aligned_extract_idx(pool, addr)];

From bf06c9393493a2862d4670beda2928423c774ff3 Mon Sep 17 00:00:00 2001
From: Xueming Feng <kuro@kuroa.me>
Date: Thu, 27 Apr 2023 20:03:13 +0800
Subject: [PATCH 17/57] bpftool: Dump map id instead of value for map_of_maps
 types

When using `bpftool map dump` with map_of_maps, it is usually
more convenient to show the inner map id instead of raw value.

We are changing the plain print behavior to show inner_map_id
instead of hex value, this would help with quick look up of
inner map with `bpftool map dump id <inner_map_id>`.
To avoid disrupting scripted behavior, we will add a new
`inner_map_id` field to json output instead of replacing value.

plain print:
```
$ bpftool map dump id 138

Without Patch:
key:
fc 00 00 00 00 00 00 00  00 00 00 00 00 00 00 05
27 16 06 00
value:
8b 00 00 00
Found 1 element

With Patch:
key:
fc 00 00 00 00 00 00 00  00 00 00 00 00 00 00 05
27 16 06 00
inner_map_id:
139
Found 1 element
```

json print:
```
$ bpftool -p map dump id 567

Without Patch:
[{
        "key": ["0xc0","0x00","0x02","0x05","0x27","0x16","0x06","0x00"
        ],
        "value": ["0x38","0x02","0x00","0x00"
        ]
    }
]

With Patch:
[{
        "key": ["0xc0","0x00","0x02","0x05","0x27","0x16","0x06","0x00"
        ],
        "value": ["0x38","0x02","0x00","0x00"
        ],
        "inner_map_id": 568
    }
]
```

Signed-off-by: Xueming Feng <kuro@kuroa.me>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/r/20230427120313.43574-1-kuro@kuroa.me
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/bpf/bpftool/map.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index aaeb8939e137..ae9e822aa3fe 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -139,6 +139,9 @@ static void print_entry_json(struct bpf_map_info *info, unsigned char *key,
 		print_hex_data_json(key, info->key_size);
 		jsonw_name(json_wtr, "value");
 		print_hex_data_json(value, info->value_size);
+		if (map_is_map_of_maps(info->type))
+			jsonw_uint_field(json_wtr, "inner_map_id",
+					 *(unsigned int *)value);
 		if (btf) {
 			struct btf_dumper d = {
 				.btf = btf,
@@ -259,8 +262,13 @@ static void print_entry_plain(struct bpf_map_info *info, unsigned char *key,
 		}
 
 		if (info->value_size) {
-			printf("value:%c", break_names ? '\n' : ' ');
-			fprint_hex(stdout, value, info->value_size, " ");
+			if (map_is_map_of_maps(info->type)) {
+				printf("inner_map_id:%c", break_names ? '\n' : ' ');
+				printf("%u ", *(unsigned int *)value);
+			} else {
+				printf("value:%c", break_names ? '\n' : ' ');
+				fprint_hex(stdout, value, info->value_size, " ");
+			}
 		}
 
 		printf("\n");

From f1f5553d91a11663a5761b78e61f70c1db0bbd2f Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Tue, 25 Apr 2023 10:47:44 -0700
Subject: [PATCH 18/57] selftests/bpf: Fix selftest
 test_global_funcs/global_func1 failure with latest clang

The selftest test_global_funcs/global_func1 failed with the latest clang17.
The reason is due to upstream ArgumentPromotionPass ([1]),
which may manipulate static function parameters and cause inlining
although the funciton is marked as noinline.

The original code:
  static __attribute__ ((noinline))
  int f0(int var, struct __sk_buff *skb)
  {
        return skb->len;
  }

  __attribute__ ((noinline))
  int f1(struct __sk_buff *skb)
  {
	...
        return f0(0, skb) + skb->len;
  }

  ...

  SEC("tc")
  __failure __msg("combined stack size of 4 calls is 544")
  int global_func1(struct __sk_buff *skb)
  {
        return f0(1, skb) + f1(skb) + f2(2, skb) + f3(3, skb, 4);
  }

After ArgumentPromotionPass, the code is translated to
  static __attribute__ ((noinline))
  int f0(int var, int skb_len)
  {
        return skb_len;
  }

  __attribute__ ((noinline))
  int f1(struct __sk_buff *skb)
  {
	...
        return f0(0, skb->len) + skb->len;
  }

  ...

  SEC("tc")
  __failure __msg("combined stack size of 4 calls is 544")
  int global_func1(struct __sk_buff *skb)
  {
        return f0(1, skb->len) + f1(skb) + f2(2, skb) + f3(3, skb, 4);
  }

And later llvm InstCombine phase recognized that f0()
simplify returns the value of the second argument and removed f0()
completely and the final code looks like:
  __attribute__ ((noinline))
  int f1(struct __sk_buff *skb)
  {
	...
        return skb->len + skb->len;
  }

  ...

  SEC("tc")
  __failure __msg("combined stack size of 4 calls is 544")
  int global_func1(struct __sk_buff *skb)
  {
        return skb->len + f1(skb) + f2(2, skb) + f3(3, skb, 4);
  }

If f0() is not inlined, the verification will fail with stack size
544 for a particular callchain. With f0() inlined, the maximum
stack size is 512 which is in the limit.

Let us add a `asm volatile ("")` in f0() to prevent ArgumentPromotionPass
from hoisting the code to its caller, and this fixed the test failure.

  [1] https://reviews.llvm.org/D148269

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20230425174744.1758515-1-yhs@fb.com
---
 tools/testing/selftests/bpf/progs/test_global_func1.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/testing/selftests/bpf/progs/test_global_func1.c b/tools/testing/selftests/bpf/progs/test_global_func1.c
index b85fc8c423ba..17a9f59bf5f3 100644
--- a/tools/testing/selftests/bpf/progs/test_global_func1.c
+++ b/tools/testing/selftests/bpf/progs/test_global_func1.c
@@ -10,6 +10,8 @@
 static __attribute__ ((noinline))
 int f0(int var, struct __sk_buff *skb)
 {
+	asm volatile ("");
+
 	return skb->len;
 }
 

From 31f4f810d533e7ed9a835c5f946eceaec015ce10 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Thu, 27 Apr 2023 20:37:44 -0700
Subject: [PATCH 19/57] selftests/bpf: Add fexit_sleep to DENYLIST.aarch64

It is reported that the fexit_sleep never returns in aarch64.
The remaining tests cannot start. Put this test into DENYLIST.aarch64
for now so that other tests can continue to run in the CI.

Acked-by: Manu Bretelle <chantr4@gmail.com>
Reported-by: Manu Bretelle <chantra@meta.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/testing/selftests/bpf/DENYLIST.aarch64 | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/bpf/DENYLIST.aarch64 b/tools/testing/selftests/bpf/DENYLIST.aarch64
index 4b6b18424140..cd42e2825bd2 100644
--- a/tools/testing/selftests/bpf/DENYLIST.aarch64
+++ b/tools/testing/selftests/bpf/DENYLIST.aarch64
@@ -1,5 +1,6 @@
 bpf_cookie/multi_kprobe_attach_api               # kprobe_multi_link_api_subtest:FAIL:fentry_raw_skel_load unexpected error: -3
 bpf_cookie/multi_kprobe_link_api                 # kprobe_multi_link_api_subtest:FAIL:fentry_raw_skel_load unexpected error: -3
+fexit_sleep                                      # The test never returns. The remaining tests cannot start.
 kprobe_multi_bench_attach                        # bpf_program__attach_kprobe_multi_opts unexpected error: -95
 kprobe_multi_test/attach_api_addrs               # bpf_program__attach_kprobe_multi_opts unexpected error: -95
 kprobe_multi_test/attach_api_pattern             # bpf_program__attach_kprobe_multi_opts unexpected error: -95

From c39028b333f3a3a765c5c0b9726b8e38aedf0ba1 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Thu, 27 Apr 2023 18:36:38 -0700
Subject: [PATCH 20/57] libbpf: btf_dump_type_data_check_overflow needs to
 consider BTF_MEMBER_BITFIELD_SIZE

The btf_dump/struct_data selftest is failing with:

  [...]
  test_btf_dump_struct_data:FAIL:unexpected return value dumping fs_context unexpected unexpected return value dumping fs_context: actual -7 != expected 264
  [...]

The reason is in btf_dump_type_data_check_overflow(). It does not use
BTF_MEMBER_BITFIELD_SIZE from the struct's member (btf_member). Instead,
it is using the enum size which is 4. It had been working till the recent
commit 4e04143c869c ("fs_context: drop the unused lsm_flags member")
removed an integer member which also removed the 4 bytes padding at the
end of the fs_context. Missing this 4 bytes padding exposed this bug. In
particular, when btf_dump_type_data_check_overflow() reaches the member
'phase', -E2BIG is returned.

The fix is to pass bit_sz to btf_dump_type_data_check_overflow(). In
btf_dump_type_data_check_overflow(), it does a different size check when
bit_sz is not zero.

The current fs_context:

[3600] ENUM 'fs_context_purpose' encoding=UNSIGNED size=4 vlen=3
	'FS_CONTEXT_FOR_MOUNT' val=0
	'FS_CONTEXT_FOR_SUBMOUNT' val=1
	'FS_CONTEXT_FOR_RECONFIGURE' val=2
[3601] ENUM 'fs_context_phase' encoding=UNSIGNED size=4 vlen=7
	'FS_CONTEXT_CREATE_PARAMS' val=0
	'FS_CONTEXT_CREATING' val=1
	'FS_CONTEXT_AWAITING_MOUNT' val=2
	'FS_CONTEXT_AWAITING_RECONF' val=3
	'FS_CONTEXT_RECONF_PARAMS' val=4
	'FS_CONTEXT_RECONFIGURING' val=5
	'FS_CONTEXT_FAILED' val=6
[3602] STRUCT 'fs_context' size=264 vlen=21
	'ops' type_id=3603 bits_offset=0
	'uapi_mutex' type_id=235 bits_offset=64
	'fs_type' type_id=872 bits_offset=1216
	'fs_private' type_id=21 bits_offset=1280
	'sget_key' type_id=21 bits_offset=1344
	'root' type_id=781 bits_offset=1408
	'user_ns' type_id=251 bits_offset=1472
	'net_ns' type_id=984 bits_offset=1536
	'cred' type_id=1785 bits_offset=1600
	'log' type_id=3621 bits_offset=1664
	'source' type_id=42 bits_offset=1792
	'security' type_id=21 bits_offset=1856
	's_fs_info' type_id=21 bits_offset=1920
	'sb_flags' type_id=20 bits_offset=1984
	'sb_flags_mask' type_id=20 bits_offset=2016
	's_iflags' type_id=20 bits_offset=2048
	'purpose' type_id=3600 bits_offset=2080 bitfield_size=8
	'phase' type_id=3601 bits_offset=2088 bitfield_size=8
	'need_free' type_id=67 bits_offset=2096 bitfield_size=1
	'global' type_id=67 bits_offset=2097 bitfield_size=1
	'oldapi' type_id=67 bits_offset=2098 bitfield_size=1

Fixes: 920d16af9b42 ("libbpf: BTF dumper support for typed data")
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20230428013638.1581263-1-martin.lau@linux.dev
---
 tools/lib/bpf/btf_dump.c | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c
index 580985ee5545..4d9f30bf7f01 100644
--- a/tools/lib/bpf/btf_dump.c
+++ b/tools/lib/bpf/btf_dump.c
@@ -2250,9 +2250,25 @@ static int btf_dump_type_data_check_overflow(struct btf_dump *d,
 					     const struct btf_type *t,
 					     __u32 id,
 					     const void *data,
-					     __u8 bits_offset)
+					     __u8 bits_offset,
+					     __u8 bit_sz)
 {
-	__s64 size = btf__resolve_size(d->btf, id);
+	__s64 size;
+
+	if (bit_sz) {
+		/* bits_offset is at most 7. bit_sz is at most 128. */
+		__u8 nr_bytes = (bits_offset + bit_sz + 7) / 8;
+
+		/* When bit_sz is non zero, it is called from
+		 * btf_dump_struct_data() where it only cares about
+		 * negative error value.
+		 * Return nr_bytes in success case to make it
+		 * consistent as the regular integer case below.
+		 */
+		return data + nr_bytes > d->typed_dump->data_end ? -E2BIG : nr_bytes;
+	}
+
+	size = btf__resolve_size(d->btf, id);
 
 	if (size < 0 || size >= INT_MAX) {
 		pr_warn("unexpected size [%zu] for id [%u]\n",
@@ -2407,7 +2423,7 @@ static int btf_dump_dump_type_data(struct btf_dump *d,
 {
 	int size, err = 0;
 
-	size = btf_dump_type_data_check_overflow(d, t, id, data, bits_offset);
+	size = btf_dump_type_data_check_overflow(d, t, id, data, bits_offset, bit_sz);
 	if (size < 0)
 		return size;
 	err = btf_dump_type_data_check_zero(d, t, id, data, bits_offset, bit_sz);

From 0a5c0de8b66f9789422f04de7304e374a2ea8df5 Mon Sep 17 00:00:00 2001
From: Stephen Veiss <sveiss@meta.com>
Date: Thu, 27 Apr 2023 15:53:32 -0700
Subject: [PATCH 21/57] selftests/bpf: Extract insert_test from parse_test_list

Split the logic to insert new tests into test filter sets out from
parse_test_list.

Fix the subtest insertion logic to reuse an existing top-level test
filter, which prevents the creation of duplicate top-level test filters
each with a single subtest.

Signed-off-by: Stephen Veiss <sveiss@meta.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20230427225333.3506052-2-sveiss@meta.com
---
 .../selftests/bpf/prog_tests/arg_parsing.c    |  13 ++
 tools/testing/selftests/bpf/testing_helpers.c | 178 ++++++++++--------
 2 files changed, 117 insertions(+), 74 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/arg_parsing.c b/tools/testing/selftests/bpf/prog_tests/arg_parsing.c
index b17bfa0e0aac..3754cd5f8c0a 100644
--- a/tools/testing/selftests/bpf/prog_tests/arg_parsing.c
+++ b/tools/testing/selftests/bpf/prog_tests/arg_parsing.c
@@ -96,6 +96,19 @@ static void test_parse_test_list(void)
 		goto error;
 	ASSERT_OK(strcmp("*bpf_cookie*", set.tests[0].name), "test name");
 	ASSERT_OK(strcmp("*trace*", set.tests[0].subtests[0]), "subtest name");
+	free_test_filter_set(&set);
+
+	ASSERT_OK(parse_test_list("t/subtest1,t/subtest2", &set, true),
+		  "parsing");
+	if (!ASSERT_EQ(set.cnt, 1, "count of test filters"))
+		goto error;
+	if (!ASSERT_OK_PTR(set.tests, "test filters initialized"))
+		goto error;
+	if (!ASSERT_EQ(set.tests[0].subtest_cnt, 2, "subtest filters count"))
+		goto error;
+	ASSERT_OK(strcmp("t", set.tests[0].name), "test name");
+	ASSERT_OK(strcmp("subtest1", set.tests[0].subtests[0]), "subtest name");
+	ASSERT_OK(strcmp("subtest2", set.tests[0].subtests[1]), "subtest name");
 error:
 	free_test_filter_set(&set);
 }
diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c
index 0b5e0829e5be..fca617e87710 100644
--- a/tools/testing/selftests/bpf/testing_helpers.c
+++ b/tools/testing/selftests/bpf/testing_helpers.c
@@ -70,92 +70,122 @@ int parse_num_list(const char *s, bool **num_set, int *num_set_len)
 	return 0;
 }
 
+static int do_insert_test(struct test_filter_set *set,
+			  char *test_str,
+			  char *subtest_str)
+{
+	struct test_filter *tmp, *test;
+	char **ctmp;
+	int i;
+
+	for (i = 0; i < set->cnt; i++) {
+		test = &set->tests[i];
+
+		if (strcmp(test_str, test->name) == 0) {
+			free(test_str);
+			goto subtest;
+		}
+	}
+
+	tmp = realloc(set->tests, sizeof(*test) * (set->cnt + 1));
+	if (!tmp)
+		return -ENOMEM;
+
+	set->tests = tmp;
+	test = &set->tests[set->cnt];
+
+	test->name = test_str;
+	test->subtests = NULL;
+	test->subtest_cnt = 0;
+
+	set->cnt++;
+
+subtest:
+	if (!subtest_str)
+		return 0;
+
+	for (i = 0; i < test->subtest_cnt; i++) {
+		if (strcmp(subtest_str, test->subtests[i]) == 0) {
+			free(subtest_str);
+			return 0;
+		}
+	}
+
+	ctmp = realloc(test->subtests,
+		       sizeof(*test->subtests) * (test->subtest_cnt + 1));
+	if (!ctmp)
+		return -ENOMEM;
+
+	test->subtests = ctmp;
+	test->subtests[test->subtest_cnt] = subtest_str;
+
+	test->subtest_cnt++;
+
+	return 0;
+}
+
+static int insert_test(struct test_filter_set *set,
+		       char *test_spec,
+		       bool is_glob_pattern)
+{
+	char *pattern, *subtest_str, *ext_test_str, *ext_subtest_str = NULL;
+	int glob_chars = 0;
+
+	if (is_glob_pattern) {
+		pattern = "%s";
+	} else {
+		pattern = "*%s*";
+		glob_chars = 2;
+	}
+
+	subtest_str = strchr(test_spec, '/');
+	if (subtest_str) {
+		*subtest_str = '\0';
+		subtest_str += 1;
+	}
+
+	ext_test_str = malloc(strlen(test_spec) + glob_chars + 1);
+	if (!ext_test_str)
+		goto err;
+
+	sprintf(ext_test_str, pattern, test_spec);
+
+	if (subtest_str) {
+		ext_subtest_str = malloc(strlen(subtest_str) + glob_chars + 1);
+		if (!ext_subtest_str)
+			goto err;
+
+		sprintf(ext_subtest_str, pattern, subtest_str);
+	}
+
+	return do_insert_test(set, ext_test_str, ext_subtest_str);
+
+err:
+	free(ext_test_str);
+	free(ext_subtest_str);
+
+	return -ENOMEM;
+}
+
 int parse_test_list(const char *s,
 		    struct test_filter_set *set,
 		    bool is_glob_pattern)
 {
-	char *input, *state = NULL, *next;
-	struct test_filter *tmp, *tests = NULL;
-	int i, j, cnt = 0;
+	char *input, *state = NULL, *test_spec;
+	int err = 0;
 
 	input = strdup(s);
 	if (!input)
 		return -ENOMEM;
 
-	while ((next = strtok_r(state ? NULL : input, ",", &state))) {
-		char *subtest_str = strchr(next, '/');
-		char *pattern = NULL;
-		int glob_chars = 0;
-
-		tmp = realloc(tests, sizeof(*tests) * (cnt + 1));
-		if (!tmp)
-			goto err;
-		tests = tmp;
-
-		tests[cnt].subtest_cnt = 0;
-		tests[cnt].subtests = NULL;
-
-		if (is_glob_pattern) {
-			pattern = "%s";
-		} else {
-			pattern = "*%s*";
-			glob_chars = 2;
-		}
-
-		if (subtest_str) {
-			char **tmp_subtests = NULL;
-			int subtest_cnt = tests[cnt].subtest_cnt;
-
-			*subtest_str = '\0';
-			subtest_str += 1;
-			tmp_subtests = realloc(tests[cnt].subtests,
-					       sizeof(*tmp_subtests) *
-					       (subtest_cnt + 1));
-			if (!tmp_subtests)
-				goto err;
-			tests[cnt].subtests = tmp_subtests;
-
-			tests[cnt].subtests[subtest_cnt] =
-				malloc(strlen(subtest_str) + glob_chars + 1);
-			if (!tests[cnt].subtests[subtest_cnt])
-				goto err;
-			sprintf(tests[cnt].subtests[subtest_cnt],
-				pattern,
-				subtest_str);
-
-			tests[cnt].subtest_cnt++;
-		}
-
-		tests[cnt].name = malloc(strlen(next) + glob_chars + 1);
-		if (!tests[cnt].name)
-			goto err;
-		sprintf(tests[cnt].name, pattern, next);
-
-		cnt++;
+	while ((test_spec = strtok_r(state ? NULL : input, ",", &state))) {
+		err = insert_test(set, test_spec, is_glob_pattern);
+		if (err)
+			break;
 	}
 
-	tmp = realloc(set->tests, sizeof(*tests) * (cnt + set->cnt));
-	if (!tmp)
-		goto err;
-
-	memcpy(tmp +  set->cnt, tests, sizeof(*tests) * cnt);
-	set->tests = tmp;
-	set->cnt += cnt;
-
-	free(tests);
 	free(input);
-	return 0;
-
-err:
-	for (i = 0; i < cnt; i++) {
-		for (j = 0; j < tests[i].subtest_cnt; j++)
-			free(tests[i].subtests[j]);
-
-		free(tests[i].name);
-	}
-	free(tests);
-	free(input);
-	return -ENOMEM;
+	return err;
 }
 
 __u32 link_info_prog_id(const struct bpf_link *link, struct bpf_link_info *info)

From 64276f01dce85f72853617e513622b5645497d81 Mon Sep 17 00:00:00 2001
From: Stephen Veiss <sveiss@meta.com>
Date: Thu, 27 Apr 2023 15:53:33 -0700
Subject: [PATCH 22/57] selftests/bpf: Test_progs can read test lists from file

Improve test selection logic when using -a/-b/-d/-t options.
The list of tests to include or exclude can now be read from a file,
specified as @<filename>.

The file contains one name (or wildcard pattern) per line, and
comments beginning with # are ignored.

These options can be passed multiple times to read more than one file.

Signed-off-by: Stephen Veiss <sveiss@meta.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20230427225333.3506052-3-sveiss@meta.com
---
 .../selftests/bpf/prog_tests/arg_parsing.c    | 55 +++++++++++++++++++
 tools/testing/selftests/bpf/test_progs.c      | 37 +++++++++----
 tools/testing/selftests/bpf/testing_helpers.c | 47 ++++++++++++++++
 tools/testing/selftests/bpf/testing_helpers.h |  3 +
 4 files changed, 132 insertions(+), 10 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/arg_parsing.c b/tools/testing/selftests/bpf/prog_tests/arg_parsing.c
index 3754cd5f8c0a..bb143de68875 100644
--- a/tools/testing/selftests/bpf/prog_tests/arg_parsing.c
+++ b/tools/testing/selftests/bpf/prog_tests/arg_parsing.c
@@ -113,8 +113,63 @@ error:
 	free_test_filter_set(&set);
 }
 
+static void test_parse_test_list_file(void)
+{
+	struct test_filter_set set;
+	char tmpfile[80];
+	FILE *fp;
+	int fd;
+
+	snprintf(tmpfile, sizeof(tmpfile), "/tmp/bpf_arg_parsing_test.XXXXXX");
+	fd = mkstemp(tmpfile);
+	if (!ASSERT_GE(fd, 0, "create tmp"))
+		return;
+
+	fp = fdopen(fd, "w");
+	if (!ASSERT_NEQ(fp, NULL, "fdopen tmp")) {
+		close(fd);
+		goto out_remove;
+	}
+
+	fprintf(fp, "# comment\n");
+	fprintf(fp, "  test_with_spaces    \n");
+	fprintf(fp, "testA/subtest    # comment\n");
+	fprintf(fp, "testB#comment with no space\n");
+	fprintf(fp, "testB # duplicate\n");
+	fprintf(fp, "testA/subtest # subtest duplicate\n");
+	fprintf(fp, "testA/subtest2\n");
+	fprintf(fp, "testC_no_eof_newline");
+	fflush(fp);
+
+	if (!ASSERT_OK(ferror(fp), "prepare tmp"))
+		goto out_fclose;
+
+	init_test_filter_set(&set);
+
+	ASSERT_OK(parse_test_list_file(tmpfile, &set, true), "parse file");
+
+	ASSERT_EQ(set.cnt, 4, "test  count");
+	ASSERT_OK(strcmp("test_with_spaces", set.tests[0].name), "test 0 name");
+	ASSERT_EQ(set.tests[0].subtest_cnt, 0, "test 0 subtest count");
+	ASSERT_OK(strcmp("testA", set.tests[1].name), "test 1 name");
+	ASSERT_EQ(set.tests[1].subtest_cnt, 2, "test 1 subtest count");
+	ASSERT_OK(strcmp("subtest", set.tests[1].subtests[0]), "test 1 subtest 0");
+	ASSERT_OK(strcmp("subtest2", set.tests[1].subtests[1]), "test 1 subtest 1");
+	ASSERT_OK(strcmp("testB", set.tests[2].name), "test 2 name");
+	ASSERT_OK(strcmp("testC_no_eof_newline", set.tests[3].name), "test 3 name");
+
+	free_test_filter_set(&set);
+
+out_fclose:
+	fclose(fp);
+out_remove:
+	remove(tmpfile);
+}
+
 void test_arg_parsing(void)
 {
 	if (test__start_subtest("test_parse_test_list"))
 		test_parse_test_list();
+	if (test__start_subtest("test_parse_test_list_file"))
+		test_parse_test_list_file();
 }
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index ea82921110da..793689dcc170 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -714,7 +714,13 @@ static struct test_state test_states[ARRAY_SIZE(prog_test_defs)];
 
 const char *argp_program_version = "test_progs 0.1";
 const char *argp_program_bug_address = "<bpf@vger.kernel.org>";
-static const char argp_program_doc[] = "BPF selftests test runner";
+static const char argp_program_doc[] =
+"BPF selftests test runner\v"
+"Options accepting the NAMES parameter take either a comma-separated list\n"
+"of test names, or a filename prefixed with @. The file contains one name\n"
+"(or wildcard pattern) per line, and comments beginning with # are ignored.\n"
+"\n"
+"These options can be passed repeatedly to read multiple files.\n";
 
 enum ARG_KEYS {
 	ARG_TEST_NUM = 'n',
@@ -797,6 +803,7 @@ extern int extra_prog_load_log_flags;
 static error_t parse_arg(int key, char *arg, struct argp_state *state)
 {
 	struct test_env *env = state->input;
+	int err = 0;
 
 	switch (key) {
 	case ARG_TEST_NUM: {
@@ -821,18 +828,28 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state)
 	}
 	case ARG_TEST_NAME_GLOB_ALLOWLIST:
 	case ARG_TEST_NAME: {
-		if (parse_test_list(arg,
-				    &env->test_selector.whitelist,
-				    key == ARG_TEST_NAME_GLOB_ALLOWLIST))
-			return -ENOMEM;
+		if (arg[0] == '@')
+			err = parse_test_list_file(arg + 1,
+						   &env->test_selector.whitelist,
+						   key == ARG_TEST_NAME_GLOB_ALLOWLIST);
+		else
+			err = parse_test_list(arg,
+					      &env->test_selector.whitelist,
+					      key == ARG_TEST_NAME_GLOB_ALLOWLIST);
+
 		break;
 	}
 	case ARG_TEST_NAME_GLOB_DENYLIST:
 	case ARG_TEST_NAME_BLACKLIST: {
-		if (parse_test_list(arg,
-				    &env->test_selector.blacklist,
-				    key == ARG_TEST_NAME_GLOB_DENYLIST))
-			return -ENOMEM;
+		if (arg[0] == '@')
+			err = parse_test_list_file(arg + 1,
+						   &env->test_selector.blacklist,
+						   key == ARG_TEST_NAME_GLOB_DENYLIST);
+		else
+			err = parse_test_list(arg,
+					      &env->test_selector.blacklist,
+					      key == ARG_TEST_NAME_GLOB_DENYLIST);
+
 		break;
 	}
 	case ARG_VERIFIER_STATS:
@@ -900,7 +917,7 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state)
 	default:
 		return ARGP_ERR_UNKNOWN;
 	}
-	return 0;
+	return err;
 }
 
 /*
diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c
index fca617e87710..dc9595ade8de 100644
--- a/tools/testing/selftests/bpf/testing_helpers.c
+++ b/tools/testing/selftests/bpf/testing_helpers.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
 /* Copyright (C) 2019 Netronome Systems, Inc. */
 /* Copyright (C) 2020 Facebook, Inc. */
+#include <ctype.h>
 #include <stdlib.h>
 #include <string.h>
 #include <errno.h>
@@ -167,6 +168,52 @@ err:
 	return -ENOMEM;
 }
 
+int parse_test_list_file(const char *path,
+			 struct test_filter_set *set,
+			 bool is_glob_pattern)
+{
+	char *buf = NULL, *capture_start, *capture_end, *scan_end;
+	size_t buflen = 0;
+	int err = 0;
+	FILE *f;
+
+	f = fopen(path, "r");
+	if (!f) {
+		err = -errno;
+		fprintf(stderr, "Failed to open '%s': %d\n", path, err);
+		return err;
+	}
+
+	while (getline(&buf, &buflen, f) != -1) {
+		capture_start = buf;
+
+		while (isspace(*capture_start))
+			++capture_start;
+
+		capture_end = capture_start;
+		scan_end = capture_start;
+
+		while (*scan_end && *scan_end != '#') {
+			if (!isspace(*scan_end))
+				capture_end = scan_end;
+
+			++scan_end;
+		}
+
+		if (capture_end == capture_start)
+			continue;
+
+		*(++capture_end) = '\0';
+
+		err = insert_test(set, capture_start, is_glob_pattern);
+		if (err)
+			break;
+	}
+
+	fclose(f);
+	return err;
+}
+
 int parse_test_list(const char *s,
 		    struct test_filter_set *set,
 		    bool is_glob_pattern)
diff --git a/tools/testing/selftests/bpf/testing_helpers.h b/tools/testing/selftests/bpf/testing_helpers.h
index eb8790f928e4..98f09bbae86f 100644
--- a/tools/testing/selftests/bpf/testing_helpers.h
+++ b/tools/testing/selftests/bpf/testing_helpers.h
@@ -20,5 +20,8 @@ struct test_filter_set;
 int parse_test_list(const char *s,
 		    struct test_filter_set *test_set,
 		    bool is_glob_pattern);
+int parse_test_list_file(const char *path,
+			 struct test_filter_set *test_set,
+			 bool is_glob_pattern);
 
 __u64 read_perf_max_sample_freq(void);

From bf6882aebd0ea67558764d44ddeffc7d253c6eb8 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Tue, 2 May 2023 11:05:43 -0700
Subject: [PATCH 23/57] bpf: Emit struct bpf_tcp_sock type in vmlinux BTF

In one of our internal testing, we found a case where
  - uapi struct bpf_tcp_sock is in vmlinux.h where vmlinux.h is not
    generated from the testing kernel
  - struct bpf_tcp_sock is not in vmlinux BTF

The above combination caused bpf load failure as the following
memory access
  struct bpf_tcp_sock *tcp_sock = ...;
  ... tcp_sock->snd_cwnd ...
needs CORE relocation but the relocation cannot be resolved since
the kernel BTF does not have corresponding type.

Similar to other previous cases (nf_conn___init, tcp6_sock, mctcp_sock, etc.),
add the type to vmlinux BTF with BTF_EMIT_TYPE macro.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20230502180543.1832140-1-yhs@fb.com
---
 net/core/filter.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/core/filter.c b/net/core/filter.c
index d9ce04ca22ce..451b0ec7f242 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -6916,6 +6916,8 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
 					FIELD));			\
 	} while (0)
 
+	BTF_TYPE_EMIT(struct bpf_tcp_sock);
+
 	switch (si->off) {
 	case offsetof(struct bpf_tcp_sock, rtt_min):
 		BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) !=

From fedf99200ab086c42a572fca1d7266b06cdc3e3f Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <thinker.li@gmail.com>
Date: Tue, 2 May 2023 11:14:18 -0700
Subject: [PATCH 24/57] bpf: Print a warning only if writing to
 unprivileged_bpf_disabled.

Only print the warning message if you are writing to
"/proc/sys/kernel/unprivileged_bpf_disabled".

The kernel may print an annoying warning when you read
"/proc/sys/kernel/unprivileged_bpf_disabled" saying

  WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible
  via Spectre v2 BHB attacks!

However, this message is only meaningful when the feature is
disabled or enabled.

Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20230502181418.308479-1-kuifeng@meta.com
---
 kernel/bpf/syscall.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 14f39c1e573e..909c112ef537 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -5380,7 +5380,8 @@ static int bpf_unpriv_handler(struct ctl_table *table, int write,
 		*(int *)table->data = unpriv_enable;
 	}
 
-	unpriv_ebpf_notify(unpriv_enable);
+	if (write)
+		unpriv_ebpf_notify(unpriv_enable);
 
 	return ret;
 }

From 7866fc6aa0deb2294eb781fc3ac94b7d4c6eb242 Mon Sep 17 00:00:00 2001
From: Kenjiro Nakayama <nakayamakenjiro@gmail.com>
Date: Thu, 4 May 2023 12:54:43 +0900
Subject: [PATCH 25/57] libbpf: Fix comment about arc and riscv arch in
 bpf_tracing.h

To make comments about arc and riscv arch in bpf_tracing.h accurate,
this patch fixes the comment about arc and adds the comment for riscv.

Signed-off-by: Kenjiro Nakayama <nakayamakenjiro@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20230504035443.427927-1-nakayamakenjiro@gmail.com
---
 tools/lib/bpf/bpf_tracing.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index 6fb3d0f9af17..be076a4041ab 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -351,6 +351,7 @@ struct pt_regs___arm64 {
  * https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/master/riscv-cc.adoc#risc-v-calling-conventions
  */
 
+/* riscv provides struct user_regs_struct instead of struct pt_regs to userspace */
 #define __PT_REGS_CAST(x) ((const struct user_regs_struct *)(x))
 #define __PT_PARM1_REG a0
 #define __PT_PARM2_REG a1
@@ -383,7 +384,7 @@ struct pt_regs___arm64 {
  * https://raw.githubusercontent.com/wiki/foss-for-synopsys-dwc-arc-processors/toolchain/files/ARCv2_ABI.pdf
  */
 
-/* arc provides struct user_pt_regs instead of struct pt_regs to userspace */
+/* arc provides struct user_regs_struct instead of struct pt_regs to userspace */
 #define __PT_REGS_CAST(x) ((const struct user_regs_struct *)(x))
 #define __PT_PARM1_REG scratch.r0
 #define __PT_PARM2_REG scratch.r1

From 5956f3011604f03be073cba0fbe5f399b4d779ec Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 4 May 2023 21:33:08 -0700
Subject: [PATCH 26/57] veristat: add -t flag for adding BPF_F_TEST_STATE_FREQ
 program flag

Sometimes during debugging it's important that BPF program is loaded
with BPF_F_TEST_STATE_FREQ flag set to force verifier to do frequent
state checkpointing. Teach veristat to do this when -t ("test state")
flag is specified.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230505043317.3629845-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/veristat.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c
index 1db7185181da..655095810d4a 100644
--- a/tools/testing/selftests/bpf/veristat.c
+++ b/tools/testing/selftests/bpf/veristat.c
@@ -141,6 +141,7 @@ static struct env {
 	bool verbose;
 	bool debug;
 	bool quiet;
+	bool force_checkpoints;
 	enum resfmt out_fmt;
 	bool show_version;
 	bool comparison_mode;
@@ -209,6 +210,8 @@ static const struct argp_option opts[] = {
 	{ "log-level", 'l', "LEVEL", 0, "Verifier log level (default 0 for normal mode, 1 for verbose mode)" },
 	{ "log-fixed", OPT_LOG_FIXED, NULL, 0, "Disable verifier log rotation" },
 	{ "log-size", OPT_LOG_SIZE, "BYTES", 0, "Customize verifier log size (default to 16MB)" },
+	{ "test-states", 't', NULL, 0,
+	  "Force frequent BPF verifier state checkpointing (set BPF_F_TEST_STATE_FREQ program flag)" },
 	{ "quiet", 'q', NULL, 0, "Quiet mode" },
 	{ "emit", 'e', "SPEC", 0, "Specify stats to be emitted" },
 	{ "sort", 's', "SPEC", 0, "Specify sort order" },
@@ -284,6 +287,9 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state)
 			argp_usage(state);
 		}
 		break;
+	case 't':
+		env.force_checkpoints = true;
+		break;
 	case 'C':
 		env.comparison_mode = true;
 		break;
@@ -989,6 +995,9 @@ static int process_prog(const char *filename, struct bpf_object *obj, struct bpf
 	/* increase chances of successful BPF object loading */
 	fixup_obj(obj, prog, base_filename);
 
+	if (env.force_checkpoints)
+		bpf_program__set_flags(prog, bpf_program__flags(prog) | BPF_F_TEST_STATE_FREQ);
+
 	err = bpf_object__load(obj);
 	env.progs_processed++;
 

From e0bf462276b6ee23203365eacb5c599f42a5a084 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 4 May 2023 21:33:09 -0700
Subject: [PATCH 27/57] bpf: mark relevant stack slots scratched for register
 read instructions

When handling instructions that read register slots, mark relevant stack
slots as scratched so that verifier log would contain those slots' states, in
addition to currently emitted registers with stack slot offsets.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230505043317.3629845-3-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ff4a8ab99f08..da8a5834f2ca 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4109,6 +4109,7 @@ static void mark_reg_stack_read(struct bpf_verifier_env *env,
 	for (i = min_off; i < max_off; i++) {
 		slot = -i - 1;
 		spi = slot / BPF_REG_SIZE;
+		mark_stack_slot_scratched(env, spi);
 		stype = ptr_state->stack[spi].slot_type;
 		if (stype[slot % BPF_REG_SIZE] != STACK_ZERO)
 			break;
@@ -4160,6 +4161,8 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
 	stype = reg_state->stack[spi].slot_type;
 	reg = &reg_state->stack[spi].spilled_ptr;
 
+	mark_stack_slot_scratched(env, spi);
+
 	if (is_spilled_reg(&reg_state->stack[spi])) {
 		u8 spill_size = 1;
 

From 407958a0e980b9e1842ab87b5a1040521e1e24e9 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 4 May 2023 21:33:10 -0700
Subject: [PATCH 28/57] bpf: encapsulate precision backtracking bookkeeping

Add struct backtrack_state and straightforward API around it to keep
track of register and stack masks used and maintained during precision
backtracking process. Having this logic separately allow to keep
high-level backtracking algorithm cleaner, but also it sets us up to
cleanly keep track of register and stack masks per frame, allowing (with
some further logic adjustments) to perform precision backpropagation
across multiple frames (i.e., subprog calls).

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230505043317.3629845-4-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  14 ++
 kernel/bpf/verifier.c        | 249 +++++++++++++++++++++++++----------
 2 files changed, 196 insertions(+), 67 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 3dd29a53b711..33f541366f4e 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -238,6 +238,10 @@ enum bpf_stack_slot_type {
 
 #define BPF_REG_SIZE 8	/* size of eBPF register in bytes */
 
+#define BPF_REGMASK_ARGS ((1 << BPF_REG_1) | (1 << BPF_REG_2) | \
+			  (1 << BPF_REG_3) | (1 << BPF_REG_4) | \
+			  (1 << BPF_REG_5))
+
 #define BPF_DYNPTR_SIZE		sizeof(struct bpf_dynptr_kern)
 #define BPF_DYNPTR_NR_SLOTS		(BPF_DYNPTR_SIZE / BPF_REG_SIZE)
 
@@ -541,6 +545,15 @@ struct bpf_subprog_info {
 	bool is_async_cb;
 };
 
+struct bpf_verifier_env;
+
+struct backtrack_state {
+	struct bpf_verifier_env *env;
+	u32 frame;
+	u32 reg_masks[MAX_CALL_FRAMES];
+	u64 stack_masks[MAX_CALL_FRAMES];
+};
+
 /* single container for all structs
  * one verifier_env per bpf_check() call
  */
@@ -578,6 +591,7 @@ struct bpf_verifier_env {
 		int *insn_stack;
 		int cur_stack;
 	} cfg;
+	struct backtrack_state bt;
 	u32 pass_cnt; /* number of times do_check() was called */
 	u32 subprog_cnt;
 	/* number of instructions analyzed by the verifier */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index da8a5834f2ca..9b2e571250e1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1296,6 +1296,12 @@ static bool is_spilled_reg(const struct bpf_stack_state *stack)
 	return stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL;
 }
 
+static bool is_spilled_scalar_reg(const struct bpf_stack_state *stack)
+{
+	return stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL &&
+	       stack->spilled_ptr.type == SCALAR_VALUE;
+}
+
 static void scrub_spilled_slot(u8 *stype)
 {
 	if (*stype != STACK_INVALID)
@@ -3186,12 +3192,128 @@ static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
 	return btf_name_by_offset(desc_btf, func->name_off);
 }
 
+static inline void bt_init(struct backtrack_state *bt, u32 frame)
+{
+	bt->frame = frame;
+}
+
+static inline void bt_reset(struct backtrack_state *bt)
+{
+	struct bpf_verifier_env *env = bt->env;
+
+	memset(bt, 0, sizeof(*bt));
+	bt->env = env;
+}
+
+static inline u32 bt_empty(struct backtrack_state *bt)
+{
+	u64 mask = 0;
+	int i;
+
+	for (i = 0; i <= bt->frame; i++)
+		mask |= bt->reg_masks[i] | bt->stack_masks[i];
+
+	return mask == 0;
+}
+
+static inline int bt_subprog_enter(struct backtrack_state *bt)
+{
+	if (bt->frame == MAX_CALL_FRAMES - 1) {
+		verbose(bt->env, "BUG subprog enter from frame %d\n", bt->frame);
+		WARN_ONCE(1, "verifier backtracking bug");
+		return -EFAULT;
+	}
+	bt->frame++;
+	return 0;
+}
+
+static inline int bt_subprog_exit(struct backtrack_state *bt)
+{
+	if (bt->frame == 0) {
+		verbose(bt->env, "BUG subprog exit from frame 0\n");
+		WARN_ONCE(1, "verifier backtracking bug");
+		return -EFAULT;
+	}
+	bt->frame--;
+	return 0;
+}
+
+static inline void bt_set_frame_reg(struct backtrack_state *bt, u32 frame, u32 reg)
+{
+	bt->reg_masks[frame] |= 1 << reg;
+}
+
+static inline void bt_clear_frame_reg(struct backtrack_state *bt, u32 frame, u32 reg)
+{
+	bt->reg_masks[frame] &= ~(1 << reg);
+}
+
+static inline void bt_set_reg(struct backtrack_state *bt, u32 reg)
+{
+	bt_set_frame_reg(bt, bt->frame, reg);
+}
+
+static inline void bt_clear_reg(struct backtrack_state *bt, u32 reg)
+{
+	bt_clear_frame_reg(bt, bt->frame, reg);
+}
+
+static inline void bt_set_frame_slot(struct backtrack_state *bt, u32 frame, u32 slot)
+{
+	bt->stack_masks[frame] |= 1ull << slot;
+}
+
+static inline void bt_clear_frame_slot(struct backtrack_state *bt, u32 frame, u32 slot)
+{
+	bt->stack_masks[frame] &= ~(1ull << slot);
+}
+
+static inline void bt_set_slot(struct backtrack_state *bt, u32 slot)
+{
+	bt_set_frame_slot(bt, bt->frame, slot);
+}
+
+static inline void bt_clear_slot(struct backtrack_state *bt, u32 slot)
+{
+	bt_clear_frame_slot(bt, bt->frame, slot);
+}
+
+static inline u32 bt_frame_reg_mask(struct backtrack_state *bt, u32 frame)
+{
+	return bt->reg_masks[frame];
+}
+
+static inline u32 bt_reg_mask(struct backtrack_state *bt)
+{
+	return bt->reg_masks[bt->frame];
+}
+
+static inline u64 bt_frame_stack_mask(struct backtrack_state *bt, u32 frame)
+{
+	return bt->stack_masks[frame];
+}
+
+static inline u64 bt_stack_mask(struct backtrack_state *bt)
+{
+	return bt->stack_masks[bt->frame];
+}
+
+static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg)
+{
+	return bt->reg_masks[bt->frame] & (1 << reg);
+}
+
+static inline bool bt_is_slot_set(struct backtrack_state *bt, u32 slot)
+{
+	return bt->stack_masks[bt->frame] & (1ull << slot);
+}
+
 /* For given verifier state backtrack_insn() is called from the last insn to
  * the first insn. Its purpose is to compute a bitmask of registers and
  * stack slots that needs precision in the parent verifier state.
  */
 static int backtrack_insn(struct bpf_verifier_env *env, int idx,
-			  u32 *reg_mask, u64 *stack_mask)
+			  struct backtrack_state *bt)
 {
 	const struct bpf_insn_cbs cbs = {
 		.cb_call	= disasm_kfunc_name,
@@ -3202,20 +3324,20 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
 	u8 class = BPF_CLASS(insn->code);
 	u8 opcode = BPF_OP(insn->code);
 	u8 mode = BPF_MODE(insn->code);
-	u32 dreg = 1u << insn->dst_reg;
-	u32 sreg = 1u << insn->src_reg;
+	u32 dreg = insn->dst_reg;
+	u32 sreg = insn->src_reg;
 	u32 spi;
 
 	if (insn->code == 0)
 		return 0;
 	if (env->log.level & BPF_LOG_LEVEL2) {
-		verbose(env, "regs=%x stack=%llx before ", *reg_mask, *stack_mask);
+		verbose(env, "regs=%x stack=%llx before ", bt_reg_mask(bt), bt_stack_mask(bt));
 		verbose(env, "%d: ", idx);
 		print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
 	}
 
 	if (class == BPF_ALU || class == BPF_ALU64) {
-		if (!(*reg_mask & dreg))
+		if (!bt_is_reg_set(bt, dreg))
 			return 0;
 		if (opcode == BPF_MOV) {
 			if (BPF_SRC(insn->code) == BPF_X) {
@@ -3223,8 +3345,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
 				 * dreg needs precision after this insn
 				 * sreg needs precision before this insn
 				 */
-				*reg_mask &= ~dreg;
-				*reg_mask |= sreg;
+				bt_clear_reg(bt, dreg);
+				bt_set_reg(bt, sreg);
 			} else {
 				/* dreg = K
 				 * dreg needs precision after this insn.
@@ -3232,7 +3354,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
 				 * as precise=true in this verifier state.
 				 * No further markings in parent are necessary
 				 */
-				*reg_mask &= ~dreg;
+				bt_clear_reg(bt, dreg);
 			}
 		} else {
 			if (BPF_SRC(insn->code) == BPF_X) {
@@ -3240,15 +3362,15 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
 				 * both dreg and sreg need precision
 				 * before this insn
 				 */
-				*reg_mask |= sreg;
+				bt_set_reg(bt, sreg);
 			} /* else dreg += K
 			   * dreg still needs precision before this insn
 			   */
 		}
 	} else if (class == BPF_LDX) {
-		if (!(*reg_mask & dreg))
+		if (!bt_is_reg_set(bt, dreg))
 			return 0;
-		*reg_mask &= ~dreg;
+		bt_clear_reg(bt, dreg);
 
 		/* scalars can only be spilled into stack w/o losing precision.
 		 * Load from any other memory can be zero extended.
@@ -3269,9 +3391,9 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
 			WARN_ONCE(1, "verifier backtracking bug");
 			return -EFAULT;
 		}
-		*stack_mask |= 1ull << spi;
+		bt_set_slot(bt, spi);
 	} else if (class == BPF_STX || class == BPF_ST) {
-		if (*reg_mask & dreg)
+		if (bt_is_reg_set(bt, dreg))
 			/* stx & st shouldn't be using _scalar_ dst_reg
 			 * to access memory. It means backtracking
 			 * encountered a case of pointer subtraction.
@@ -3286,11 +3408,11 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
 			WARN_ONCE(1, "verifier backtracking bug");
 			return -EFAULT;
 		}
-		if (!(*stack_mask & (1ull << spi)))
+		if (!bt_is_slot_set(bt, spi))
 			return 0;
-		*stack_mask &= ~(1ull << spi);
+		bt_clear_slot(bt, spi);
 		if (class == BPF_STX)
-			*reg_mask |= sreg;
+			bt_set_reg(bt, sreg);
 	} else if (class == BPF_JMP || class == BPF_JMP32) {
 		if (opcode == BPF_CALL) {
 			if (insn->src_reg == BPF_PSEUDO_CALL)
@@ -3307,19 +3429,19 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
 			if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && insn->imm == 0)
 				return -ENOTSUPP;
 			/* regular helper call sets R0 */
-			*reg_mask &= ~1;
-			if (*reg_mask & 0x3f) {
+			bt_clear_reg(bt, BPF_REG_0);
+			if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
 				/* if backtracing was looking for registers R1-R5
 				 * they should have been found already.
 				 */
-				verbose(env, "BUG regs %x\n", *reg_mask);
+				verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
 				WARN_ONCE(1, "verifier backtracking bug");
 				return -EFAULT;
 			}
 		} else if (opcode == BPF_EXIT) {
 			return -ENOTSUPP;
 		} else if (BPF_SRC(insn->code) == BPF_X) {
-			if (!(*reg_mask & (dreg | sreg)))
+			if (!bt_is_reg_set(bt, dreg) && !bt_is_reg_set(bt, sreg))
 				return 0;
 			/* dreg <cond> sreg
 			 * Both dreg and sreg need precision before
@@ -3327,7 +3449,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
 			 * before it would be equally necessary to
 			 * propagate it to dreg.
 			 */
-			*reg_mask |= (sreg | dreg);
+			bt_set_reg(bt, dreg);
+			bt_set_reg(bt, sreg);
 			 /* else dreg <cond> K
 			  * Only dreg still needs precision before
 			  * this insn, so for the K-based conditional
@@ -3335,9 +3458,9 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
 			  */
 		}
 	} else if (class == BPF_LD) {
-		if (!(*reg_mask & dreg))
+		if (!bt_is_reg_set(bt, dreg))
 			return 0;
-		*reg_mask &= ~dreg;
+		bt_clear_reg(bt, dreg);
 		/* It's ld_imm64 or ld_abs or ld_ind.
 		 * For ld_imm64 no further tracking of precision
 		 * into parent is necessary
@@ -3550,20 +3673,21 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_
 static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int regno,
 				  int spi)
 {
+	struct backtrack_state *bt = &env->bt;
 	struct bpf_verifier_state *st = env->cur_state;
 	int first_idx = st->first_insn_idx;
 	int last_idx = env->insn_idx;
 	struct bpf_func_state *func;
 	struct bpf_reg_state *reg;
-	u32 reg_mask = regno >= 0 ? 1u << regno : 0;
-	u64 stack_mask = spi >= 0 ? 1ull << spi : 0;
 	bool skip_first = true;
-	bool new_marks = false;
 	int i, err;
 
 	if (!env->bpf_capable)
 		return 0;
 
+	/* set frame number from which we are starting to backtrack */
+	bt_init(bt, frame);
+
 	/* Do sanity checks against current state of register and/or stack
 	 * slot, but don't set precise flag in current state, as precision
 	 * tracking in the current state is unnecessary.
@@ -3575,26 +3699,17 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r
 			WARN_ONCE(1, "backtracing misuse");
 			return -EFAULT;
 		}
-		new_marks = true;
+		bt_set_reg(bt, regno);
 	}
 
 	while (spi >= 0) {
-		if (!is_spilled_reg(&func->stack[spi])) {
-			stack_mask = 0;
+		if (!is_spilled_scalar_reg(&func->stack[spi]))
 			break;
-		}
-		reg = &func->stack[spi].spilled_ptr;
-		if (reg->type != SCALAR_VALUE) {
-			stack_mask = 0;
-			break;
-		}
-		new_marks = true;
+		bt_set_slot(bt, spi);
 		break;
 	}
 
-	if (!new_marks)
-		return 0;
-	if (!reg_mask && !stack_mask)
+	if (bt_empty(bt))
 		return 0;
 
 	for (;;) {
@@ -3613,12 +3728,13 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r
 			if (st->curframe == 0 &&
 			    st->frame[0]->subprogno > 0 &&
 			    st->frame[0]->callsite == BPF_MAIN_FUNC &&
-			    stack_mask == 0 && (reg_mask & ~0x3e) == 0) {
-				bitmap_from_u64(mask, reg_mask);
+			    bt_stack_mask(bt) == 0 &&
+			    (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) == 0) {
+				bitmap_from_u64(mask, bt_reg_mask(bt));
 				for_each_set_bit(i, mask, 32) {
 					reg = &st->frame[0]->regs[i];
 					if (reg->type != SCALAR_VALUE) {
-						reg_mask &= ~(1u << i);
+						bt_clear_reg(bt, i);
 						continue;
 					}
 					reg->precise = true;
@@ -3626,8 +3742,8 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r
 				return 0;
 			}
 
-			verbose(env, "BUG backtracing func entry subprog %d reg_mask %x stack_mask %llx\n",
-				st->frame[0]->subprogno, reg_mask, stack_mask);
+			verbose(env, "BUG backtracking func entry subprog %d reg_mask %x stack_mask %llx\n",
+				st->frame[0]->subprogno, bt_reg_mask(bt), bt_stack_mask(bt));
 			WARN_ONCE(1, "verifier backtracking bug");
 			return -EFAULT;
 		}
@@ -3637,15 +3753,16 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r
 				err = 0;
 				skip_first = false;
 			} else {
-				err = backtrack_insn(env, i, &reg_mask, &stack_mask);
+				err = backtrack_insn(env, i, bt);
 			}
 			if (err == -ENOTSUPP) {
 				mark_all_scalars_precise(env, st);
+				bt_reset(bt);
 				return 0;
 			} else if (err) {
 				return err;
 			}
-			if (!reg_mask && !stack_mask)
+			if (bt_empty(bt))
 				/* Found assignment(s) into tracked register in this state.
 				 * Since this state is already marked, just return.
 				 * Nothing to be tracked further in the parent state.
@@ -3670,21 +3787,21 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r
 		if (!st)
 			break;
 
-		new_marks = false;
 		func = st->frame[frame];
-		bitmap_from_u64(mask, reg_mask);
+		bitmap_from_u64(mask, bt_reg_mask(bt));
 		for_each_set_bit(i, mask, 32) {
 			reg = &func->regs[i];
 			if (reg->type != SCALAR_VALUE) {
-				reg_mask &= ~(1u << i);
+				bt_clear_reg(bt, i);
 				continue;
 			}
-			if (!reg->precise)
-				new_marks = true;
-			reg->precise = true;
+			if (reg->precise)
+				bt_clear_reg(bt, i);
+			else
+				reg->precise = true;
 		}
 
-		bitmap_from_u64(mask, stack_mask);
+		bitmap_from_u64(mask, bt_stack_mask(bt));
 		for_each_set_bit(i, mask, 64) {
 			if (i >= func->allocated_stack / BPF_REG_SIZE) {
 				/* the sequence of instructions:
@@ -3701,32 +3818,28 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r
 				 * In such case fallback to conservative.
 				 */
 				mark_all_scalars_precise(env, st);
+				bt_reset(bt);
 				return 0;
 			}
 
-			if (!is_spilled_reg(&func->stack[i])) {
-				stack_mask &= ~(1ull << i);
+			if (!is_spilled_scalar_reg(&func->stack[i])) {
+				bt_clear_slot(bt, i);
 				continue;
 			}
 			reg = &func->stack[i].spilled_ptr;
-			if (reg->type != SCALAR_VALUE) {
-				stack_mask &= ~(1ull << i);
-				continue;
-			}
-			if (!reg->precise)
-				new_marks = true;
-			reg->precise = true;
+			if (reg->precise)
+				bt_clear_slot(bt, i);
+			else
+				reg->precise = true;
 		}
 		if (env->log.level & BPF_LOG_LEVEL2) {
 			verbose(env, "parent %s regs=%x stack=%llx marks:",
-				new_marks ? "didn't have" : "already had",
-				reg_mask, stack_mask);
+				!bt_empty(bt) ? "didn't have" : "already had",
+				bt_reg_mask(bt), bt_stack_mask(bt));
 			print_verifier_state(env, func, true);
 		}
 
-		if (!reg_mask && !stack_mask)
-			break;
-		if (!new_marks)
+		if (bt_empty(bt))
 			break;
 
 		last_idx = st->last_insn_idx;
@@ -18872,6 +18985,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 	if (!env)
 		return -ENOMEM;
 
+	env->bt.env = env;
+
 	len = (*prog)->len;
 	env->insn_aux_data =
 		vzalloc(array_size(sizeof(struct bpf_insn_aux_data), len));

From d9439c21a9e4769bfd83a03ab39056164d44ac31 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 4 May 2023 21:33:11 -0700
Subject: [PATCH 29/57] bpf: improve precision backtrack logging

Add helper to format register and stack masks in more human-readable
format. Adjust logging a bit during backtrack propagation and especially
during forcing precision fallback logic to make it clearer what's going
on (with log_level=2, of course), and also start reporting affected
frame depth. This is in preparation for having more than one active
frame later when precision propagation between subprog calls is added.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230505043317.3629845-5-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h                  |  13 ++-
 kernel/bpf/verifier.c                         |  72 ++++++++++--
 .../testing/selftests/bpf/verifier/precise.c  | 106 +++++++++---------
 3 files changed, 128 insertions(+), 63 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 33f541366f4e..5b11a3b0fec0 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -18,8 +18,11 @@
  * that converting umax_value to int cannot overflow.
  */
 #define BPF_MAX_VAR_SIZ	(1 << 29)
-/* size of type_str_buf in bpf_verifier. */
-#define TYPE_STR_BUF_LEN 128
+/* size of tmp_str_buf in bpf_verifier.
+ * we need at least 306 bytes to fit full stack mask representation
+ * (in the "-8,-16,...,-512" form)
+ */
+#define TMP_STR_BUF_LEN 320
 
 /* Liveness marks, used for registers and spilled-regs (in stack slots).
  * Read marks propagate upwards until they find a write mark; they record that
@@ -620,8 +623,10 @@ struct bpf_verifier_env {
 	/* Same as scratched_regs but for stack slots */
 	u64 scratched_stack_slots;
 	u64 prev_log_pos, prev_insn_print_pos;
-	/* buffer used in reg_type_str() to generate reg_type string */
-	char type_str_buf[TYPE_STR_BUF_LEN];
+	/* buffer used to generate temporary string representations,
+	 * e.g., in reg_type_str() to generate reg_type string
+	 */
+	char tmp_str_buf[TMP_STR_BUF_LEN];
 };
 
 __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9b2e571250e1..5412c8c8511d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -605,9 +605,9 @@ static const char *reg_type_str(struct bpf_verifier_env *env,
 		 type & PTR_TRUSTED ? "trusted_" : ""
 	);
 
-	snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s",
+	snprintf(env->tmp_str_buf, TMP_STR_BUF_LEN, "%s%s%s",
 		 prefix, str[base_type(type)], postfix);
-	return env->type_str_buf;
+	return env->tmp_str_buf;
 }
 
 static char slot_type_char[] = {
@@ -3308,6 +3308,45 @@ static inline bool bt_is_slot_set(struct backtrack_state *bt, u32 slot)
 	return bt->stack_masks[bt->frame] & (1ull << slot);
 }
 
+/* format registers bitmask, e.g., "r0,r2,r4" for 0x15 mask */
+static void fmt_reg_mask(char *buf, ssize_t buf_sz, u32 reg_mask)
+{
+	DECLARE_BITMAP(mask, 64);
+	bool first = true;
+	int i, n;
+
+	buf[0] = '\0';
+
+	bitmap_from_u64(mask, reg_mask);
+	for_each_set_bit(i, mask, 32) {
+		n = snprintf(buf, buf_sz, "%sr%d", first ? "" : ",", i);
+		first = false;
+		buf += n;
+		buf_sz -= n;
+		if (buf_sz < 0)
+			break;
+	}
+}
+/* format stack slots bitmask, e.g., "-8,-24,-40" for 0x15 mask */
+static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask)
+{
+	DECLARE_BITMAP(mask, 64);
+	bool first = true;
+	int i, n;
+
+	buf[0] = '\0';
+
+	bitmap_from_u64(mask, stack_mask);
+	for_each_set_bit(i, mask, 64) {
+		n = snprintf(buf, buf_sz, "%s%d", first ? "" : ",", -(i + 1) * 8);
+		first = false;
+		buf += n;
+		buf_sz -= n;
+		if (buf_sz < 0)
+			break;
+	}
+}
+
 /* For given verifier state backtrack_insn() is called from the last insn to
  * the first insn. Its purpose is to compute a bitmask of registers and
  * stack slots that needs precision in the parent verifier state.
@@ -3331,7 +3370,11 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
 	if (insn->code == 0)
 		return 0;
 	if (env->log.level & BPF_LOG_LEVEL2) {
-		verbose(env, "regs=%x stack=%llx before ", bt_reg_mask(bt), bt_stack_mask(bt));
+		fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_reg_mask(bt));
+		verbose(env, "mark_precise: frame%d: regs=%s ",
+			bt->frame, env->tmp_str_buf);
+		fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt));
+		verbose(env, "stack=%s before ", env->tmp_str_buf);
 		verbose(env, "%d: ", idx);
 		print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
 	}
@@ -3531,6 +3574,11 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env,
 	struct bpf_reg_state *reg;
 	int i, j;
 
+	if (env->log.level & BPF_LOG_LEVEL2) {
+		verbose(env, "mark_precise: frame%d: falling back to forcing all scalars precise\n",
+			st->curframe);
+	}
+
 	/* big hammer: mark all scalars precise in this path.
 	 * pop_stack may still get !precise scalars.
 	 * We also skip current state and go straight to first parent state,
@@ -3542,17 +3590,25 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env,
 			func = st->frame[i];
 			for (j = 0; j < BPF_REG_FP; j++) {
 				reg = &func->regs[j];
-				if (reg->type != SCALAR_VALUE)
+				if (reg->type != SCALAR_VALUE || reg->precise)
 					continue;
 				reg->precise = true;
+				if (env->log.level & BPF_LOG_LEVEL2) {
+					verbose(env, "force_precise: frame%d: forcing r%d to be precise\n",
+						i, j);
+				}
 			}
 			for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
 				if (!is_spilled_reg(&func->stack[j]))
 					continue;
 				reg = &func->stack[j].spilled_ptr;
-				if (reg->type != SCALAR_VALUE)
+				if (reg->type != SCALAR_VALUE || reg->precise)
 					continue;
 				reg->precise = true;
+				if (env->log.level & BPF_LOG_LEVEL2) {
+					verbose(env, "force_precise: frame%d: forcing fp%d to be precise\n",
+						i, -(j + 1) * 8);
+				}
 			}
 		}
 	}
@@ -3716,8 +3772,10 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r
 		DECLARE_BITMAP(mask, 64);
 		u32 history = st->jmp_history_cnt;
 
-		if (env->log.level & BPF_LOG_LEVEL2)
-			verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx);
+		if (env->log.level & BPF_LOG_LEVEL2) {
+			verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d\n",
+				bt->frame, last_idx, first_idx);
+		}
 
 		if (last_idx < 0) {
 			/* we are at the entry into subprog, which
diff --git a/tools/testing/selftests/bpf/verifier/precise.c b/tools/testing/selftests/bpf/verifier/precise.c
index 8f0340eed696..a22fabd404ed 100644
--- a/tools/testing/selftests/bpf/verifier/precise.c
+++ b/tools/testing/selftests/bpf/verifier/precise.c
@@ -38,25 +38,24 @@
 	.fixup_map_array_48b = { 1 },
 	.result = VERBOSE_ACCEPT,
 	.errstr =
-	"26: (85) call bpf_probe_read_kernel#113\
-	last_idx 26 first_idx 20\
-	regs=4 stack=0 before 25\
-	regs=4 stack=0 before 24\
-	regs=4 stack=0 before 23\
-	regs=4 stack=0 before 22\
-	regs=4 stack=0 before 20\
-	parent didn't have regs=4 stack=0 marks\
-	last_idx 19 first_idx 10\
-	regs=4 stack=0 before 19\
-	regs=200 stack=0 before 18\
-	regs=300 stack=0 before 17\
-	regs=201 stack=0 before 15\
-	regs=201 stack=0 before 14\
-	regs=200 stack=0 before 13\
-	regs=200 stack=0 before 12\
-	regs=200 stack=0 before 11\
-	regs=200 stack=0 before 10\
-	parent already had regs=0 stack=0 marks",
+	"mark_precise: frame0: last_idx 26 first_idx 20\
+	mark_precise: frame0: regs=r2 stack= before 25\
+	mark_precise: frame0: regs=r2 stack= before 24\
+	mark_precise: frame0: regs=r2 stack= before 23\
+	mark_precise: frame0: regs=r2 stack= before 22\
+	mark_precise: frame0: regs=r2 stack= before 20\
+	parent didn't have regs=4 stack=0 marks:\
+	mark_precise: frame0: last_idx 19 first_idx 10\
+	mark_precise: frame0: regs=r2 stack= before 19\
+	mark_precise: frame0: regs=r9 stack= before 18\
+	mark_precise: frame0: regs=r8,r9 stack= before 17\
+	mark_precise: frame0: regs=r0,r9 stack= before 15\
+	mark_precise: frame0: regs=r0,r9 stack= before 14\
+	mark_precise: frame0: regs=r9 stack= before 13\
+	mark_precise: frame0: regs=r9 stack= before 12\
+	mark_precise: frame0: regs=r9 stack= before 11\
+	mark_precise: frame0: regs=r9 stack= before 10\
+	parent already had regs=0 stack=0 marks:",
 },
 {
 	"precise: test 2",
@@ -100,20 +99,20 @@
 	.flags = BPF_F_TEST_STATE_FREQ,
 	.errstr =
 	"26: (85) call bpf_probe_read_kernel#113\
-	last_idx 26 first_idx 22\
-	regs=4 stack=0 before 25\
-	regs=4 stack=0 before 24\
-	regs=4 stack=0 before 23\
-	regs=4 stack=0 before 22\
-	parent didn't have regs=4 stack=0 marks\
-	last_idx 20 first_idx 20\
-	regs=4 stack=0 before 20\
-	parent didn't have regs=4 stack=0 marks\
-	last_idx 19 first_idx 17\
-	regs=4 stack=0 before 19\
-	regs=200 stack=0 before 18\
-	regs=300 stack=0 before 17\
-	parent already had regs=0 stack=0 marks",
+	mark_precise: frame0: last_idx 26 first_idx 22\
+	mark_precise: frame0: regs=r2 stack= before 25\
+	mark_precise: frame0: regs=r2 stack= before 24\
+	mark_precise: frame0: regs=r2 stack= before 23\
+	mark_precise: frame0: regs=r2 stack= before 22\
+	parent didn't have regs=4 stack=0 marks:\
+	mark_precise: frame0: last_idx 20 first_idx 20\
+	mark_precise: frame0: regs=r2 stack= before 20\
+	parent didn't have regs=4 stack=0 marks:\
+	mark_precise: frame0: last_idx 19 first_idx 17\
+	mark_precise: frame0: regs=r2 stack= before 19\
+	mark_precise: frame0: regs=r9 stack= before 18\
+	mark_precise: frame0: regs=r8,r9 stack= before 17\
+	parent already had regs=0 stack=0 marks:",
 },
 {
 	"precise: cross frame pruning",
@@ -153,15 +152,15 @@
 	},
 	.prog_type = BPF_PROG_TYPE_XDP,
 	.flags = BPF_F_TEST_STATE_FREQ,
-	.errstr = "5: (2d) if r4 > r0 goto pc+0\
-	last_idx 5 first_idx 5\
-	parent didn't have regs=10 stack=0 marks\
-	last_idx 4 first_idx 2\
-	regs=10 stack=0 before 4\
-	regs=10 stack=0 before 3\
-	regs=0 stack=1 before 2\
-	last_idx 5 first_idx 5\
-	parent didn't have regs=1 stack=0 marks",
+	.errstr = "mark_precise: frame0: last_idx 5 first_idx 5\
+	parent didn't have regs=10 stack=0 marks:\
+	mark_precise: frame0: last_idx 4 first_idx 2\
+	mark_precise: frame0: regs=r4 stack= before 4\
+	mark_precise: frame0: regs=r4 stack= before 3\
+	mark_precise: frame0: regs= stack=-8 before 2\
+	mark_precise: frame0: falling back to forcing all scalars precise\
+	mark_precise: frame0: last_idx 5 first_idx 5\
+	parent didn't have regs=1 stack=0 marks:",
 	.result = VERBOSE_ACCEPT,
 	.retval = -1,
 },
@@ -179,16 +178,19 @@
 	},
 	.prog_type = BPF_PROG_TYPE_XDP,
 	.flags = BPF_F_TEST_STATE_FREQ,
-	.errstr = "last_idx 6 first_idx 6\
-	parent didn't have regs=10 stack=0 marks\
-	last_idx 5 first_idx 3\
-	regs=10 stack=0 before 5\
-	regs=10 stack=0 before 4\
-	regs=0 stack=1 before 3\
-	last_idx 6 first_idx 6\
-	parent didn't have regs=1 stack=0 marks\
-	last_idx 5 first_idx 3\
-	regs=1 stack=0 before 5",
+	.errstr = "mark_precise: frame0: last_idx 6 first_idx 6\
+	parent didn't have regs=10 stack=0 marks:\
+	mark_precise: frame0: last_idx 5 first_idx 3\
+	mark_precise: frame0: regs=r4 stack= before 5\
+	mark_precise: frame0: regs=r4 stack= before 4\
+	mark_precise: frame0: regs= stack=-8 before 3\
+	mark_precise: frame0: falling back to forcing all scalars precise\
+	force_precise: frame0: forcing r0 to be precise\
+	force_precise: frame0: forcing r0 to be precise\
+	mark_precise: frame0: last_idx 6 first_idx 6\
+	parent didn't have regs=1 stack=0 marks:\
+	mark_precise: frame0: last_idx 5 first_idx 3\
+	mark_precise: frame0: regs=r0 stack= before 5",
 	.result = VERBOSE_ACCEPT,
 	.retval = -1,
 },

From 1ef22b6865a73a8aed36d43375fe8c7b30869326 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 4 May 2023 21:33:12 -0700
Subject: [PATCH 30/57] bpf: maintain bitmasks across all active frames in
 __mark_chain_precision

Teach __mark_chain_precision logic to maintain register/stack masks
across all active frames when going from child state to parent state.
Currently this should be mostly no-op, as precision backtracking usually
bails out when encountering subprog entry/exit.

It's not very apparent from the diff due to increased indentation, but
the logic remains the same, except everything is done on specific `fr`
frame index. Calls to bt_clear_reg() and bt_clear_slot() are replaced
with frame-specific bt_clear_frame_reg() and bt_clear_frame_slot(),
where frame index is passed explicitly, instead of using current frame
number.

We also adjust logging to emit affected frame number. And we also add
better logging of human-readable register and stack slot masks, similar
to previous patch.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230505043317.3629845-6-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c                         | 102 +++++++++---------
 .../testing/selftests/bpf/verifier/precise.c  |  18 ++--
 2 files changed, 63 insertions(+), 57 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5412c8c8511d..5a7997bc96f5 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3736,7 +3736,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r
 	struct bpf_func_state *func;
 	struct bpf_reg_state *reg;
 	bool skip_first = true;
-	int i, err;
+	int i, fr, err;
 
 	if (!env->bpf_capable)
 		return 0;
@@ -3845,56 +3845,62 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r
 		if (!st)
 			break;
 
-		func = st->frame[frame];
-		bitmap_from_u64(mask, bt_reg_mask(bt));
-		for_each_set_bit(i, mask, 32) {
-			reg = &func->regs[i];
-			if (reg->type != SCALAR_VALUE) {
-				bt_clear_reg(bt, i);
-				continue;
-			}
-			if (reg->precise)
-				bt_clear_reg(bt, i);
-			else
-				reg->precise = true;
-		}
-
-		bitmap_from_u64(mask, bt_stack_mask(bt));
-		for_each_set_bit(i, mask, 64) {
-			if (i >= func->allocated_stack / BPF_REG_SIZE) {
-				/* the sequence of instructions:
-				 * 2: (bf) r3 = r10
-				 * 3: (7b) *(u64 *)(r3 -8) = r0
-				 * 4: (79) r4 = *(u64 *)(r10 -8)
-				 * doesn't contain jmps. It's backtracked
-				 * as a single block.
-				 * During backtracking insn 3 is not recognized as
-				 * stack access, so at the end of backtracking
-				 * stack slot fp-8 is still marked in stack_mask.
-				 * However the parent state may not have accessed
-				 * fp-8 and it's "unallocated" stack space.
-				 * In such case fallback to conservative.
-				 */
-				mark_all_scalars_precise(env, st);
-				bt_reset(bt);
-				return 0;
+		for (fr = bt->frame; fr >= 0; fr--) {
+			func = st->frame[fr];
+			bitmap_from_u64(mask, bt_frame_reg_mask(bt, fr));
+			for_each_set_bit(i, mask, 32) {
+				reg = &func->regs[i];
+				if (reg->type != SCALAR_VALUE) {
+					bt_clear_frame_reg(bt, fr, i);
+					continue;
+				}
+				if (reg->precise)
+					bt_clear_frame_reg(bt, fr, i);
+				else
+					reg->precise = true;
 			}
 
-			if (!is_spilled_scalar_reg(&func->stack[i])) {
-				bt_clear_slot(bt, i);
-				continue;
+			bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr));
+			for_each_set_bit(i, mask, 64) {
+				if (i >= func->allocated_stack / BPF_REG_SIZE) {
+					/* the sequence of instructions:
+					 * 2: (bf) r3 = r10
+					 * 3: (7b) *(u64 *)(r3 -8) = r0
+					 * 4: (79) r4 = *(u64 *)(r10 -8)
+					 * doesn't contain jmps. It's backtracked
+					 * as a single block.
+					 * During backtracking insn 3 is not recognized as
+					 * stack access, so at the end of backtracking
+					 * stack slot fp-8 is still marked in stack_mask.
+					 * However the parent state may not have accessed
+					 * fp-8 and it's "unallocated" stack space.
+					 * In such case fallback to conservative.
+					 */
+					mark_all_scalars_precise(env, st);
+					bt_reset(bt);
+					return 0;
+				}
+
+				if (!is_spilled_scalar_reg(&func->stack[i])) {
+					bt_clear_frame_slot(bt, fr, i);
+					continue;
+				}
+				reg = &func->stack[i].spilled_ptr;
+				if (reg->precise)
+					bt_clear_frame_slot(bt, fr, i);
+				else
+					reg->precise = true;
+			}
+			if (env->log.level & BPF_LOG_LEVEL2) {
+				fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN,
+					     bt_frame_reg_mask(bt, fr));
+				verbose(env, "mark_precise: frame%d: parent state regs=%s ",
+					fr, env->tmp_str_buf);
+				fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN,
+					       bt_frame_stack_mask(bt, fr));
+				verbose(env, "stack=%s: ", env->tmp_str_buf);
+				print_verifier_state(env, func, true);
 			}
-			reg = &func->stack[i].spilled_ptr;
-			if (reg->precise)
-				bt_clear_slot(bt, i);
-			else
-				reg->precise = true;
-		}
-		if (env->log.level & BPF_LOG_LEVEL2) {
-			verbose(env, "parent %s regs=%x stack=%llx marks:",
-				!bt_empty(bt) ? "didn't have" : "already had",
-				bt_reg_mask(bt), bt_stack_mask(bt));
-			print_verifier_state(env, func, true);
 		}
 
 		if (bt_empty(bt))
diff --git a/tools/testing/selftests/bpf/verifier/precise.c b/tools/testing/selftests/bpf/verifier/precise.c
index a22fabd404ed..77ea018582c5 100644
--- a/tools/testing/selftests/bpf/verifier/precise.c
+++ b/tools/testing/selftests/bpf/verifier/precise.c
@@ -44,7 +44,7 @@
 	mark_precise: frame0: regs=r2 stack= before 23\
 	mark_precise: frame0: regs=r2 stack= before 22\
 	mark_precise: frame0: regs=r2 stack= before 20\
-	parent didn't have regs=4 stack=0 marks:\
+	mark_precise: frame0: parent state regs=r2 stack=:\
 	mark_precise: frame0: last_idx 19 first_idx 10\
 	mark_precise: frame0: regs=r2 stack= before 19\
 	mark_precise: frame0: regs=r9 stack= before 18\
@@ -55,7 +55,7 @@
 	mark_precise: frame0: regs=r9 stack= before 12\
 	mark_precise: frame0: regs=r9 stack= before 11\
 	mark_precise: frame0: regs=r9 stack= before 10\
-	parent already had regs=0 stack=0 marks:",
+	mark_precise: frame0: parent state regs= stack=:",
 },
 {
 	"precise: test 2",
@@ -104,15 +104,15 @@
 	mark_precise: frame0: regs=r2 stack= before 24\
 	mark_precise: frame0: regs=r2 stack= before 23\
 	mark_precise: frame0: regs=r2 stack= before 22\
-	parent didn't have regs=4 stack=0 marks:\
+	mark_precise: frame0: parent state regs=r2 stack=:\
 	mark_precise: frame0: last_idx 20 first_idx 20\
 	mark_precise: frame0: regs=r2 stack= before 20\
-	parent didn't have regs=4 stack=0 marks:\
+	mark_precise: frame0: parent state regs=r2 stack=:\
 	mark_precise: frame0: last_idx 19 first_idx 17\
 	mark_precise: frame0: regs=r2 stack= before 19\
 	mark_precise: frame0: regs=r9 stack= before 18\
 	mark_precise: frame0: regs=r8,r9 stack= before 17\
-	parent already had regs=0 stack=0 marks:",
+	mark_precise: frame0: parent state regs= stack=:",
 },
 {
 	"precise: cross frame pruning",
@@ -153,14 +153,14 @@
 	.prog_type = BPF_PROG_TYPE_XDP,
 	.flags = BPF_F_TEST_STATE_FREQ,
 	.errstr = "mark_precise: frame0: last_idx 5 first_idx 5\
-	parent didn't have regs=10 stack=0 marks:\
+	mark_precise: frame0: parent state regs=r4 stack=:\
 	mark_precise: frame0: last_idx 4 first_idx 2\
 	mark_precise: frame0: regs=r4 stack= before 4\
 	mark_precise: frame0: regs=r4 stack= before 3\
 	mark_precise: frame0: regs= stack=-8 before 2\
 	mark_precise: frame0: falling back to forcing all scalars precise\
 	mark_precise: frame0: last_idx 5 first_idx 5\
-	parent didn't have regs=1 stack=0 marks:",
+	mark_precise: frame0: parent state regs=r0 stack=:",
 	.result = VERBOSE_ACCEPT,
 	.retval = -1,
 },
@@ -179,7 +179,7 @@
 	.prog_type = BPF_PROG_TYPE_XDP,
 	.flags = BPF_F_TEST_STATE_FREQ,
 	.errstr = "mark_precise: frame0: last_idx 6 first_idx 6\
-	parent didn't have regs=10 stack=0 marks:\
+	mark_precise: frame0: parent state regs=r4 stack=:\
 	mark_precise: frame0: last_idx 5 first_idx 3\
 	mark_precise: frame0: regs=r4 stack= before 5\
 	mark_precise: frame0: regs=r4 stack= before 4\
@@ -188,7 +188,7 @@
 	force_precise: frame0: forcing r0 to be precise\
 	force_precise: frame0: forcing r0 to be precise\
 	mark_precise: frame0: last_idx 6 first_idx 6\
-	parent didn't have regs=1 stack=0 marks:\
+	mark_precise: frame0: parent state regs=r0 stack=:\
 	mark_precise: frame0: last_idx 5 first_idx 3\
 	mark_precise: frame0: regs=r0 stack= before 5",
 	.result = VERBOSE_ACCEPT,

From f655badf2a8fc028433d9583bf86a6b473721f09 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 4 May 2023 21:33:13 -0700
Subject: [PATCH 31/57] bpf: fix propagate_precision() logic for inner frames

Fix propagate_precision() logic to perform propagation of all necessary
registers and stack slots across all active frames *in one batch step*.

Doing this for each register/slot in each individual frame is wasteful,
but the main problem is that backtracking of instruction in any frame
except the deepest one just doesn't work. This is due to backtracking
logic relying on jump history, and available jump history always starts
(or ends, depending how you view it) in current frame. So, if
prog A (frame #0) called subprog B (frame #1) and we need to propagate
precision of, say, register R6 (callee-saved) within frame #0, we
actually don't even know where jump history that corresponds to prog
A even starts. We'd need to skip subprog part of jump history first to
be able to do this.

Luckily, with struct backtrack_state and __mark_chain_precision()
handling bitmasks tracking/propagation across all active frames at the
same time (added in previous patch), propagate_precision() can be both
fixed and sped up by setting all the necessary bits across all frames
and then performing one __mark_chain_precision() pass. This makes it
unnecessary to skip subprog parts of jump history.

We also improve logging along the way, to clearly specify which
registers' and slots' precision markings are propagated within which
frame. Each frame will have dedicated line and all registers and stack
slots from that frame will be reported in format similar to precision
backtrack regs/stack logging. E.g.:

frame 1: propagating r1,r2,r3,fp-8,fp-16
frame 0: propagating r3,r9,fp-120

Fixes: 529409ea92d5 ("bpf: propagate precision across all frames, not just the last one")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230505043317.3629845-7-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 65 +++++++++++++++++++++++--------------------
 1 file changed, 35 insertions(+), 30 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5a7997bc96f5..13bbaa2485fc 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3726,8 +3726,7 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_
  * mark_all_scalars_imprecise() to hopefully get more permissive and generic
  * finalized states which help in short circuiting more future states.
  */
-static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int regno,
-				  int spi)
+static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
 {
 	struct backtrack_state *bt = &env->bt;
 	struct bpf_verifier_state *st = env->cur_state;
@@ -3742,13 +3741,13 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r
 		return 0;
 
 	/* set frame number from which we are starting to backtrack */
-	bt_init(bt, frame);
+	bt_init(bt, env->cur_state->curframe);
 
 	/* Do sanity checks against current state of register and/or stack
 	 * slot, but don't set precise flag in current state, as precision
 	 * tracking in the current state is unnecessary.
 	 */
-	func = st->frame[frame];
+	func = st->frame[bt->frame];
 	if (regno >= 0) {
 		reg = &func->regs[regno];
 		if (reg->type != SCALAR_VALUE) {
@@ -3758,13 +3757,6 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r
 		bt_set_reg(bt, regno);
 	}
 
-	while (spi >= 0) {
-		if (!is_spilled_scalar_reg(&func->stack[spi]))
-			break;
-		bt_set_slot(bt, spi);
-		break;
-	}
-
 	if (bt_empty(bt))
 		return 0;
 
@@ -3914,17 +3906,15 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r
 
 int mark_chain_precision(struct bpf_verifier_env *env, int regno)
 {
-	return __mark_chain_precision(env, env->cur_state->curframe, regno, -1);
+	return __mark_chain_precision(env, regno);
 }
 
-static int mark_chain_precision_frame(struct bpf_verifier_env *env, int frame, int regno)
+/* mark_chain_precision_batch() assumes that env->bt is set in the caller to
+ * desired reg and stack masks across all relevant frames
+ */
+static int mark_chain_precision_batch(struct bpf_verifier_env *env)
 {
-	return __mark_chain_precision(env, frame, regno, -1);
-}
-
-static int mark_chain_precision_stack_frame(struct bpf_verifier_env *env, int frame, int spi)
-{
-	return __mark_chain_precision(env, frame, -1, spi);
+	return __mark_chain_precision(env, -1);
 }
 
 static bool is_spillable_regtype(enum bpf_reg_type type)
@@ -15361,20 +15351,25 @@ static int propagate_precision(struct bpf_verifier_env *env,
 	struct bpf_reg_state *state_reg;
 	struct bpf_func_state *state;
 	int i, err = 0, fr;
+	bool first;
 
 	for (fr = old->curframe; fr >= 0; fr--) {
 		state = old->frame[fr];
 		state_reg = state->regs;
+		first = true;
 		for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
 			if (state_reg->type != SCALAR_VALUE ||
 			    !state_reg->precise ||
 			    !(state_reg->live & REG_LIVE_READ))
 				continue;
-			if (env->log.level & BPF_LOG_LEVEL2)
-				verbose(env, "frame %d: propagating r%d\n", fr, i);
-			err = mark_chain_precision_frame(env, fr, i);
-			if (err < 0)
-				return err;
+			if (env->log.level & BPF_LOG_LEVEL2) {
+				if (first)
+					verbose(env, "frame %d: propagating r%d", fr, i);
+				else
+					verbose(env, ",r%d", i);
+			}
+			bt_set_frame_reg(&env->bt, fr, i);
+			first = false;
 		}
 
 		for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
@@ -15385,14 +15380,24 @@ static int propagate_precision(struct bpf_verifier_env *env,
 			    !state_reg->precise ||
 			    !(state_reg->live & REG_LIVE_READ))
 				continue;
-			if (env->log.level & BPF_LOG_LEVEL2)
-				verbose(env, "frame %d: propagating fp%d\n",
-					fr, (-i - 1) * BPF_REG_SIZE);
-			err = mark_chain_precision_stack_frame(env, fr, i);
-			if (err < 0)
-				return err;
+			if (env->log.level & BPF_LOG_LEVEL2) {
+				if (first)
+					verbose(env, "frame %d: propagating fp%d",
+						fr, (-i - 1) * BPF_REG_SIZE);
+				else
+					verbose(env, ",fp%d", (-i - 1) * BPF_REG_SIZE);
+			}
+			bt_set_frame_slot(&env->bt, fr, i);
+			first = false;
 		}
+		if (!first)
+			verbose(env, "\n");
 	}
+
+	err = mark_chain_precision_batch(env);
+	if (err < 0)
+		return err;
+
 	return 0;
 }
 

From c50c0b57a515826b5d2e1ce85cd85f24f0da10c2 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 4 May 2023 21:33:14 -0700
Subject: [PATCH 32/57] bpf: fix mark_all_scalars_precise use in
 mark_chain_precision

When precision backtracking bails out due to some unsupported sequence
of instructions (e.g., stack access through register other than r10), we
need to mark all SCALAR registers as precise to be safe. Currently,
though, we mark SCALARs precise only starting from the state we detected
unsupported condition, which could be one of the parent states of the
actual current state. This will leave some registers potentially not
marked as precise, even though they should. So make sure we start
marking scalars as precise from current state (env->cur_state).

Further, we don't currently detect a situation when we end up with some
stack slots marked as needing precision, but we ran out of available
states to find the instructions that populate those stack slots. This is
akin the `i >= func->allocated_stack / BPF_REG_SIZE` check and should be
handled similarly by falling back to marking all SCALARs precise. Add
this check when we run out of states.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230505043317.3629845-8-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c                          | 16 +++++++++++++---
 tools/testing/selftests/bpf/verifier/precise.c |  9 +++++----
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 13bbaa2485fc..899122832d8e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3806,7 +3806,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
 				err = backtrack_insn(env, i, bt);
 			}
 			if (err == -ENOTSUPP) {
-				mark_all_scalars_precise(env, st);
+				mark_all_scalars_precise(env, env->cur_state);
 				bt_reset(bt);
 				return 0;
 			} else if (err) {
@@ -3868,7 +3868,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
 					 * fp-8 and it's "unallocated" stack space.
 					 * In such case fallback to conservative.
 					 */
-					mark_all_scalars_precise(env, st);
+					mark_all_scalars_precise(env, env->cur_state);
 					bt_reset(bt);
 					return 0;
 				}
@@ -3896,11 +3896,21 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
 		}
 
 		if (bt_empty(bt))
-			break;
+			return 0;
 
 		last_idx = st->last_insn_idx;
 		first_idx = st->first_insn_idx;
 	}
+
+	/* if we still have requested precise regs or slots, we missed
+	 * something (e.g., stack access through non-r10 register), so
+	 * fallback to marking all precise
+	 */
+	if (!bt_empty(bt)) {
+		mark_all_scalars_precise(env, env->cur_state);
+		bt_reset(bt);
+	}
+
 	return 0;
 }
 
diff --git a/tools/testing/selftests/bpf/verifier/precise.c b/tools/testing/selftests/bpf/verifier/precise.c
index 77ea018582c5..b8c0aae8e7ec 100644
--- a/tools/testing/selftests/bpf/verifier/precise.c
+++ b/tools/testing/selftests/bpf/verifier/precise.c
@@ -159,8 +159,9 @@
 	mark_precise: frame0: regs=r4 stack= before 3\
 	mark_precise: frame0: regs= stack=-8 before 2\
 	mark_precise: frame0: falling back to forcing all scalars precise\
+	force_precise: frame0: forcing r0 to be precise\
 	mark_precise: frame0: last_idx 5 first_idx 5\
-	mark_precise: frame0: parent state regs=r0 stack=:",
+	mark_precise: frame0: parent state regs= stack=:",
 	.result = VERBOSE_ACCEPT,
 	.retval = -1,
 },
@@ -187,10 +188,10 @@
 	mark_precise: frame0: falling back to forcing all scalars precise\
 	force_precise: frame0: forcing r0 to be precise\
 	force_precise: frame0: forcing r0 to be precise\
+	force_precise: frame0: forcing r0 to be precise\
+	force_precise: frame0: forcing r0 to be precise\
 	mark_precise: frame0: last_idx 6 first_idx 6\
-	mark_precise: frame0: parent state regs=r0 stack=:\
-	mark_precise: frame0: last_idx 5 first_idx 3\
-	mark_precise: frame0: regs=r0 stack= before 5",
+	mark_precise: frame0: parent state regs= stack=:",
 	.result = VERBOSE_ACCEPT,
 	.retval = -1,
 },

From fde2a3882bd07876c144f2e00f7ae6893c378180 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 4 May 2023 21:33:15 -0700
Subject: [PATCH 33/57] bpf: support precision propagation in the presence of
 subprogs

Add support precision backtracking in the presence of subprogram frames in
jump history.

This means supporting a few different kinds of subprogram invocation
situations, all requiring a slightly different handling in precision
backtracking handling logic:
  - static subprogram calls;
  - global subprogram calls;
  - callback-calling helpers/kfuncs.

For each of those we need to handle a few precision propagation cases:
  - what to do with precision of subprog returns (r0);
  - what to do with precision of input arguments;
  - for all of them callee-saved registers in caller function should be
    propagated ignoring subprog/callback part of jump history.

N.B. Async callback-calling helpers (currently only
bpf_timer_set_callback()) are transparent to all this because they set
a separate async callback environment and thus callback's history is not
shared with main program's history. So as far as all the changes in this
commit goes, such helper is just a regular helper.

Let's look at all these situation in more details. Let's start with
static subprogram being called, using an exxerpt of a simple main
program and its static subprog, indenting subprog's frame slightly to
make everything clear.

frame 0				frame 1			precision set
=======				=======			=============

 9: r6 = 456;
10: r1 = 123;						fr0: r6
11: call pc+10;						fr0: r1, r6
				22: r0 = r1;		fr0: r6;     fr1: r1
				23: exit		fr0: r6;     fr1: r0
12: r1 = <map_pointer>					fr0: r0, r6
13: r1 += r0;						fr0: r0, r6
14: r1 += r6;						fr0: r6
15: exit

As can be seen above main function is passing 123 as single argument to
an identity (`return x;`) subprog. Returned value is used to adjust map
pointer offset, which forces r0 to be marked as precise. Then
instruction #14 does the same for callee-saved r6, which will have to be
backtracked all the way to instruction #9. For brevity, precision sets
for instruction #13 and #14 are combined in the diagram above.

First, for subprog calls, r0 returned from subprog (in frame 0) has to
go into subprog's frame 1, and should be cleared from frame 0. So we go
back into subprog's frame knowing we need to mark r0 precise. We then
see that insn #22 sets r0 from r1, so now we care about marking r1
precise.  When we pop up from subprog's frame back into caller at
insn #11 we keep r1, as it's an argument-passing register, so we eventually
find `10: r1 = 123;` and satify precision propagation chain for insn #13.

This example demonstrates two sets of rules:
  - r0 returned after subprog call has to be moved into subprog's r0 set;
  - *static* subprog arguments (r1-r5) are moved back to caller precision set.

Let's look at what happens with callee-saved precision propagation. Insn #14
mark r6 as precise. When we get into subprog's frame, we keep r6 in
frame 0's precision set *only*. Subprog itself has its own set of
independent r6-r10 registers and is not affected. When we eventually
made our way out of subprog frame we keep r6 in precision set until we
reach `9: r6 = 456;`, satisfying propagation. r6-r10 propagation is
perhaps the simplest aspect, it always stays in its original frame.

That's pretty much all we have to do to support precision propagation
across *static subprog* invocation.

Let's look at what happens when we have global subprog invocation.

frame 0				frame 1			precision set
=======				=======			=============

 9: r6 = 456;
10: r1 = 123;						fr0: r6
11: call pc+10; # global subprog			fr0: r6
12: r1 = <map_pointer>					fr0: r0, r6
13: r1 += r0;						fr0: r0, r6
14: r1 += r6;						fr0: r6;
15: exit

Starting from insn #13, r0 has to be precise. We backtrack all the way
to insn #11 (call pc+10) and see that subprog is global, so was already
validated in isolation. As opposed to static subprog, global subprog
always returns unknown scalar r0, so that satisfies precision
propagation and we drop r0 from precision set. We are done for insns #13.

Now for insn #14. r6 is in precision set, we backtrack to `call pc+10;`.
Here we need to recognize that this is effectively both exit and entry
to global subprog, which means we stay in caller's frame. So we carry on
with r6 still in precision set, until we satisfy it at insn #9. The only
hard part with global subprogs is just knowing when it's a global func.

Lastly, callback-calling helpers and kfuncs do simulate subprog calls,
so jump history will have subprog instructions in between caller
program's instructions, but the rules of propagating r0 and r1-r5
differ, because we don't actually directly call callback. We actually
call helper/kfunc, which at runtime will call subprog, so the only
difference between normal helper/kfunc handling is that we need to make
sure to skip callback simulatinog part of jump history.
Let's look at an example to make this clearer.

frame 0				frame 1			precision set
=======				=======			=============

 8: r6 = 456;
 9: r1 = 123;						fr0: r6
10: r2 = &callback;					fr0: r6
11: call bpf_loop;					fr0: r6
				22: r0 = r1;		fr0: r6      fr1:
				23: exit		fr0: r6      fr1:
12: r1 = <map_pointer>					fr0: r0, r6
13: r1 += r0;						fr0: r0, r6
14: r1 += r6;						fr0: r6;
15: exit

Again, insn #13 forces r0 to be precise. As soon as we get to `23: exit`
we see that this isn't actually a static subprog call (it's `call
bpf_loop;` helper call instead). So we clear r0 from precision set.

For callee-saved register, there is no difference: it stays in frame 0's
precision set, we go through insn #22 and #23, ignoring them until we
get back to caller frame 0, eventually satisfying precision backtrack
logic at insn #8 (`r6 = 456;`).

Assuming callback needed to set r0 as precise at insn #23, we'd
backtrack to insn #22, switching from r0 to r1, and then at the point
when we pop back to frame 0 at insn #11, we'll clear r1-r5 from
precision set, as we don't really do a subprog call directly, so there
is no input argument precision propagation.

That's pretty much it. With these changes, it seems like the only still
unsupported situation for precision backpropagation is the case when
program is accessing stack through registers other than r10. This is
still left as unsupported (though rare) case for now.

As for results. For selftests, few positive changes for bigger programs,
cls_redirect in dynptr variant benefitting the most:

[vmuser@archvm bpf]$ ./veristat -C ~/subprog-precise-before-results.csv ~/subprog-precise-after-results.csv -f @veristat.cfg -e file,prog,insns -f 'insns_diff!=0'
File                                      Program        Insns (A)  Insns (B)  Insns     (DIFF)
----------------------------------------  -------------  ---------  ---------  ----------------
pyperf600_bpf_loop.bpf.linked1.o          on_event            2060       2002      -58 (-2.82%)
test_cls_redirect_dynptr.bpf.linked1.o    cls_redirect       15660       2914  -12746 (-81.39%)
test_cls_redirect_subprogs.bpf.linked1.o  cls_redirect       61620      59088    -2532 (-4.11%)
xdp_synproxy_kern.bpf.linked1.o           syncookie_tc      109980      86278  -23702 (-21.55%)
xdp_synproxy_kern.bpf.linked1.o           syncookie_xdp      97716      85147  -12569 (-12.86%)

Cilium progress don't really regress. They don't use subprogs and are
mostly unaffected, but some other fixes and improvements could have
changed something. This doesn't appear to be the case:

[vmuser@archvm bpf]$ ./veristat -C ~/subprog-precise-before-results-cilium.csv ~/subprog-precise-after-results-cilium.csv -e file,prog,insns -f 'insns_diff!=0'
File           Program                         Insns (A)  Insns (B)  Insns (DIFF)
-------------  ------------------------------  ---------  ---------  ------------
bpf_host.o     tail_nodeport_nat_ingress_ipv6       4983       5003  +20 (+0.40%)
bpf_lxc.o      tail_nodeport_nat_ingress_ipv6       4983       5003  +20 (+0.40%)
bpf_overlay.o  tail_nodeport_nat_ingress_ipv6       4983       5003  +20 (+0.40%)
bpf_xdp.o      tail_handle_nat_fwd_ipv6            12475      12504  +29 (+0.23%)
bpf_xdp.o      tail_nodeport_nat_ingress_ipv6       6363       6371   +8 (+0.13%)

Looking at (somewhat anonymized) Meta production programs, we see mostly
insignificant variation in number of instructions, with one program
(syar_bind6_protect6) benefitting the most at -17%.

[vmuser@archvm bpf]$ ./veristat -C ~/subprog-precise-before-results-fbcode.csv ~/subprog-precise-after-results-fbcode.csv -e prog,insns -f 'insns_diff!=0'
Program                   Insns (A)  Insns (B)  Insns     (DIFF)
------------------------  ---------  ---------  ----------------
on_request_context_event        597        585      -12 (-2.01%)
read_async_py_stack           43789      43657     -132 (-0.30%)
read_sync_py_stack            35041      37599    +2558 (+7.30%)
rrm_usdt                        946        940       -6 (-0.63%)
sysarmor_inet6_bind           28863      28249     -614 (-2.13%)
sysarmor_inet_bind            28845      28240     -605 (-2.10%)
syar_bind4_protect4          154145     147640    -6505 (-4.22%)
syar_bind6_protect6          165242     137088  -28154 (-17.04%)
syar_task_exit_setgid         21289      19720    -1569 (-7.37%)
syar_task_exit_setuid         21290      19721    -1569 (-7.37%)
do_uprobe                     19967      19413     -554 (-2.77%)
tw_twfw_ingress              215877     204833   -11044 (-5.12%)
tw_twfw_tc_in                215877     204833   -11044 (-5.12%)

But checking duration (wall clock) differences, that is the actual time taken
by verifier to validate programs, we see a sometimes dramatic improvements, all
the way to about 16x improvements:

[vmuser@archvm bpf]$ ./veristat -C ~/subprog-precise-before-results-meta.csv ~/subprog-precise-after-results-meta.csv -e prog,duration -s duration_diff^ | head -n20
Program                                   Duration (us) (A)  Duration (us) (B)  Duration (us) (DIFF)
----------------------------------------  -----------------  -----------------  --------------------
tw_twfw_ingress                                     4488374             272836    -4215538 (-93.92%)
tw_twfw_tc_in                                       4339111             268175    -4070936 (-93.82%)
tw_twfw_egress                                      3521816             270751    -3251065 (-92.31%)
tw_twfw_tc_eg                                       3472878             284294    -3188584 (-91.81%)
balancer_ingress                                     343119             291391      -51728 (-15.08%)
syar_bind6_protect6                                   78992              64782      -14210 (-17.99%)
ttls_tc_ingress                                       11739               8176       -3563 (-30.35%)
kprobe__security_inode_link                           13864              11341       -2523 (-18.20%)
read_sync_py_stack                                    21927              19442       -2485 (-11.33%)
read_async_py_stack                                   30444              28136        -2308 (-7.58%)
syar_task_exit_setuid                                 10256               8440       -1816 (-17.71%)

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230505043317.3629845-9-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 163 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 143 insertions(+), 20 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 899122832d8e..0fa96581eb77 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -240,6 +240,12 @@ static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state)
 			     (poisoned ? BPF_MAP_KEY_POISON : 0ULL);
 }
 
+static bool bpf_helper_call(const struct bpf_insn *insn)
+{
+	return insn->code == (BPF_JMP | BPF_CALL) &&
+	       insn->src_reg == 0;
+}
+
 static bool bpf_pseudo_call(const struct bpf_insn *insn)
 {
 	return insn->code == (BPF_JMP | BPF_CALL) &&
@@ -469,6 +475,13 @@ static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg)
 	return rec;
 }
 
+static bool subprog_is_global(const struct bpf_verifier_env *env, int subprog)
+{
+	struct bpf_func_info_aux *aux = env->prog->aux->func_info_aux;
+
+	return aux && aux[subprog].linkage == BTF_FUNC_GLOBAL;
+}
+
 static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
 {
 	return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK);
@@ -516,6 +529,8 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id)
 	return func_id == BPF_FUNC_dynptr_data;
 }
 
+static bool is_callback_calling_kfunc(u32 btf_id);
+
 static bool is_callback_calling_function(enum bpf_func_id func_id)
 {
 	return func_id == BPF_FUNC_for_each_map_elem ||
@@ -525,6 +540,11 @@ static bool is_callback_calling_function(enum bpf_func_id func_id)
 	       func_id == BPF_FUNC_user_ringbuf_drain;
 }
 
+static bool is_async_callback_calling_function(enum bpf_func_id func_id)
+{
+	return func_id == BPF_FUNC_timer_set_callback;
+}
+
 static bool is_storage_get_function(enum bpf_func_id func_id)
 {
 	return func_id == BPF_FUNC_sk_storage_get ||
@@ -3350,8 +3370,13 @@ static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask)
 /* For given verifier state backtrack_insn() is called from the last insn to
  * the first insn. Its purpose is to compute a bitmask of registers and
  * stack slots that needs precision in the parent verifier state.
+ *
+ * @idx is an index of the instruction we are currently processing;
+ * @subseq_idx is an index of the subsequent instruction that:
+ *   - *would be* executed next, if jump history is viewed in forward order;
+ *   - *was* processed previously during backtracking.
  */
-static int backtrack_insn(struct bpf_verifier_env *env, int idx,
+static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
 			  struct backtrack_state *bt)
 {
 	const struct bpf_insn_cbs cbs = {
@@ -3365,7 +3390,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
 	u8 mode = BPF_MODE(insn->code);
 	u32 dreg = insn->dst_reg;
 	u32 sreg = insn->src_reg;
-	u32 spi;
+	u32 spi, i;
 
 	if (insn->code == 0)
 		return 0;
@@ -3457,14 +3482,86 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
 		if (class == BPF_STX)
 			bt_set_reg(bt, sreg);
 	} else if (class == BPF_JMP || class == BPF_JMP32) {
-		if (opcode == BPF_CALL) {
-			if (insn->src_reg == BPF_PSEUDO_CALL)
-				return -ENOTSUPP;
-			/* BPF helpers that invoke callback subprogs are
-			 * equivalent to BPF_PSEUDO_CALL above
+		if (bpf_pseudo_call(insn)) {
+			int subprog_insn_idx, subprog;
+
+			subprog_insn_idx = idx + insn->imm + 1;
+			subprog = find_subprog(env, subprog_insn_idx);
+			if (subprog < 0)
+				return -EFAULT;
+
+			if (subprog_is_global(env, subprog)) {
+				/* check that jump history doesn't have any
+				 * extra instructions from subprog; the next
+				 * instruction after call to global subprog
+				 * should be literally next instruction in
+				 * caller program
+				 */
+				WARN_ONCE(idx + 1 != subseq_idx, "verifier backtracking bug");
+				/* r1-r5 are invalidated after subprog call,
+				 * so for global func call it shouldn't be set
+				 * anymore
+				 */
+				if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
+					verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
+					WARN_ONCE(1, "verifier backtracking bug");
+					return -EFAULT;
+				}
+				/* global subprog always sets R0 */
+				bt_clear_reg(bt, BPF_REG_0);
+				return 0;
+			} else {
+				/* static subprog call instruction, which
+				 * means that we are exiting current subprog,
+				 * so only r1-r5 could be still requested as
+				 * precise, r0 and r6-r10 or any stack slot in
+				 * the current frame should be zero by now
+				 */
+				if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
+					verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
+					WARN_ONCE(1, "verifier backtracking bug");
+					return -EFAULT;
+				}
+				/* we don't track register spills perfectly,
+				 * so fallback to force-precise instead of failing */
+				if (bt_stack_mask(bt) != 0)
+					return -ENOTSUPP;
+				/* propagate r1-r5 to the caller */
+				for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
+					if (bt_is_reg_set(bt, i)) {
+						bt_clear_reg(bt, i);
+						bt_set_frame_reg(bt, bt->frame - 1, i);
+					}
+				}
+				if (bt_subprog_exit(bt))
+					return -EFAULT;
+				return 0;
+			}
+		} else if ((bpf_helper_call(insn) &&
+			    is_callback_calling_function(insn->imm) &&
+			    !is_async_callback_calling_function(insn->imm)) ||
+			   (bpf_pseudo_kfunc_call(insn) && is_callback_calling_kfunc(insn->imm))) {
+			/* callback-calling helper or kfunc call, which means
+			 * we are exiting from subprog, but unlike the subprog
+			 * call handling above, we shouldn't propagate
+			 * precision of r1-r5 (if any requested), as they are
+			 * not actually arguments passed directly to callback
+			 * subprogs
 			 */
-			if (insn->src_reg == 0 && is_callback_calling_function(insn->imm))
+			if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
+				verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
+				WARN_ONCE(1, "verifier backtracking bug");
+				return -EFAULT;
+			}
+			if (bt_stack_mask(bt) != 0)
 				return -ENOTSUPP;
+			/* clear r1-r5 in callback subprog's mask */
+			for (i = BPF_REG_1; i <= BPF_REG_5; i++)
+				bt_clear_reg(bt, i);
+			if (bt_subprog_exit(bt))
+				return -EFAULT;
+			return 0;
+		} else if (opcode == BPF_CALL) {
 			/* kfunc with imm==0 is invalid and fixup_kfunc_call will
 			 * catch this error later. Make backtracking conservative
 			 * with ENOTSUPP.
@@ -3482,7 +3579,39 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
 				return -EFAULT;
 			}
 		} else if (opcode == BPF_EXIT) {
-			return -ENOTSUPP;
+			bool r0_precise;
+
+			if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
+				/* if backtracing was looking for registers R1-R5
+				 * they should have been found already.
+				 */
+				verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
+				WARN_ONCE(1, "verifier backtracking bug");
+				return -EFAULT;
+			}
+
+			/* BPF_EXIT in subprog or callback always returns
+			 * right after the call instruction, so by checking
+			 * whether the instruction at subseq_idx-1 is subprog
+			 * call or not we can distinguish actual exit from
+			 * *subprog* from exit from *callback*. In the former
+			 * case, we need to propagate r0 precision, if
+			 * necessary. In the former we never do that.
+			 */
+			r0_precise = subseq_idx - 1 >= 0 &&
+				     bpf_pseudo_call(&env->prog->insnsi[subseq_idx - 1]) &&
+				     bt_is_reg_set(bt, BPF_REG_0);
+
+			bt_clear_reg(bt, BPF_REG_0);
+			if (bt_subprog_enter(bt))
+				return -EFAULT;
+
+			if (r0_precise)
+				bt_set_reg(bt, BPF_REG_0);
+			/* r6-r9 and stack slots will stay set in caller frame
+			 * bitmasks until we return back from callee(s)
+			 */
+			return 0;
 		} else if (BPF_SRC(insn->code) == BPF_X) {
 			if (!bt_is_reg_set(bt, dreg) && !bt_is_reg_set(bt, sreg))
 				return 0;
@@ -3735,7 +3864,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
 	struct bpf_func_state *func;
 	struct bpf_reg_state *reg;
 	bool skip_first = true;
-	int i, fr, err;
+	int i, prev_i, fr, err;
 
 	if (!env->bpf_capable)
 		return 0;
@@ -3798,12 +3927,12 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
 			return -EFAULT;
 		}
 
-		for (i = last_idx;;) {
+		for (i = last_idx, prev_i = -1;;) {
 			if (skip_first) {
 				err = 0;
 				skip_first = false;
 			} else {
-				err = backtrack_insn(env, i, bt);
+				err = backtrack_insn(env, i, prev_i, bt);
 			}
 			if (err == -ENOTSUPP) {
 				mark_all_scalars_precise(env, env->cur_state);
@@ -3820,6 +3949,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
 				return 0;
 			if (i == first_idx)
 				break;
+			prev_i = i;
 			i = get_prev_insn_idx(st, i, &history);
 			if (i >= env->prog->len) {
 				/* This can happen if backtracking reached insn 0
@@ -8400,17 +8530,13 @@ static int set_callee_state(struct bpf_verifier_env *env,
 			    struct bpf_func_state *caller,
 			    struct bpf_func_state *callee, int insn_idx);
 
-static bool is_callback_calling_kfunc(u32 btf_id);
-
 static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			     int *insn_idx, int subprog,
 			     set_callee_state_fn set_callee_state_cb)
 {
 	struct bpf_verifier_state *state = env->cur_state;
-	struct bpf_func_info_aux *func_info_aux;
 	struct bpf_func_state *caller, *callee;
 	int err;
-	bool is_global = false;
 
 	if (state->curframe + 1 >= MAX_CALL_FRAMES) {
 		verbose(env, "the call stack of %d frames is too deep\n",
@@ -8425,13 +8551,10 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		return -EFAULT;
 	}
 
-	func_info_aux = env->prog->aux->func_info_aux;
-	if (func_info_aux)
-		is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
 	err = btf_check_subprog_call(env, subprog, caller->regs);
 	if (err == -EFAULT)
 		return err;
-	if (is_global) {
+	if (subprog_is_global(env, subprog)) {
 		if (err) {
 			verbose(env, "Caller passes invalid args into func#%d\n",
 				subprog);

From 3ef3d2177b1a5484908c53d19269b964c488c20f Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 4 May 2023 21:33:16 -0700
Subject: [PATCH 34/57] selftests/bpf: add precision propagation tests in the
 presence of subprogs

Add a bunch of tests validating verifier's precision backpropagation
logic in the presence of subprog calls and/or callback-calling
helpers/kfuncs.

We validate the following conditions:
  - subprog_result_precise: static subprog r0 result precision handling;
  - global_subprog_result_precise: global subprog r0 precision
    shortcutting, similar to BPF helper handling;
  - callback_result_precise: similarly r0 marking precise for
    callback-calling helpers;
  - parent_callee_saved_reg_precise, parent_callee_saved_reg_precise_global:
    propagation of precision for callee-saved registers bypassing
    static/global subprogs;
  - parent_callee_saved_reg_precise_with_callback: same as above, but in
    the presence of callback-calling helper;
  - parent_stack_slot_precise, parent_stack_slot_precise_global:
    similar to above, but instead propagating precision of stack slot
    (spilled SCALAR reg);
  - parent_stack_slot_precise_with_callback: same as above, but in the
    presence of callback-calling helper;
  - subprog_arg_precise: propagation of precision of static subprog's
    input argument back to caller;
  - subprog_spill_into_parent_stack_slot_precise: negative test
    validating that verifier currently can't support backtracking of stack
    access with non-r10 register, we validate that we fallback to
    forcing precision for all SCALARs.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230505043317.3629845-10-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/prog_tests/verifier.c       |   2 +
 tools/testing/selftests/bpf/progs/bpf_misc.h  |   4 +
 .../bpf/progs/verifier_subprog_precision.c    | 536 ++++++++++++++++++
 3 files changed, 542 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/verifier_subprog_precision.c

diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c
index 2497716ee379..531621adef42 100644
--- a/tools/testing/selftests/bpf/prog_tests/verifier.c
+++ b/tools/testing/selftests/bpf/prog_tests/verifier.c
@@ -55,6 +55,7 @@
 #include "verifier_spill_fill.skel.h"
 #include "verifier_spin_lock.skel.h"
 #include "verifier_stack_ptr.skel.h"
+#include "verifier_subprog_precision.skel.h"
 #include "verifier_subreg.skel.h"
 #include "verifier_uninit.skel.h"
 #include "verifier_unpriv.skel.h"
@@ -154,6 +155,7 @@ void test_verifier_sock(void)                 { RUN(verifier_sock); }
 void test_verifier_spill_fill(void)           { RUN(verifier_spill_fill); }
 void test_verifier_spin_lock(void)            { RUN(verifier_spin_lock); }
 void test_verifier_stack_ptr(void)            { RUN(verifier_stack_ptr); }
+void test_verifier_subprog_precision(void)    { RUN(verifier_subprog_precision); }
 void test_verifier_subreg(void)               { RUN(verifier_subreg); }
 void test_verifier_uninit(void)               { RUN(verifier_uninit); }
 void test_verifier_unpriv(void)               { RUN(verifier_unpriv); }
diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h
index d3c1217ba79a..38a57a2e70db 100644
--- a/tools/testing/selftests/bpf/progs/bpf_misc.h
+++ b/tools/testing/selftests/bpf/progs/bpf_misc.h
@@ -86,6 +86,10 @@
 #define POINTER_VALUE	0xcafe4all
 #define TEST_DATA_LEN	64
 
+#ifndef __used
+#define __used __attribute__((used))
+#endif
+
 #if defined(__TARGET_ARCH_x86)
 #define SYSCALL_WRAPPER 1
 #define SYS_PREFIX "__x64_"
diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
new file mode 100644
index 000000000000..db6b3143338b
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
@@ -0,0 +1,536 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */
+
+#include <errno.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
+
+int vals[] SEC(".data.vals") = {1, 2, 3, 4};
+
+__naked __noinline __used
+static unsigned long identity_subprog()
+{
+	/* the simplest *static* 64-bit identity function */
+	asm volatile (
+		"r0 = r1;"
+		"exit;"
+	);
+}
+
+__noinline __used
+unsigned long global_identity_subprog(__u64 x)
+{
+	/* the simplest *global* 64-bit identity function */
+	return x;
+}
+
+__naked __noinline __used
+static unsigned long callback_subprog()
+{
+	/* the simplest callback function */
+	asm volatile (
+		"r0 = 0;"
+		"exit;"
+	);
+}
+
+SEC("?raw_tp")
+__success __log_level(2)
+__msg("7: (0f) r1 += r0")
+__msg("mark_precise: frame0: regs=r0 stack= before 6: (bf) r1 = r7")
+__msg("mark_precise: frame0: regs=r0 stack= before 5: (27) r0 *= 4")
+__msg("mark_precise: frame0: regs=r0 stack= before 11: (95) exit")
+__msg("mark_precise: frame1: regs=r0 stack= before 10: (bf) r0 = r1")
+__msg("mark_precise: frame1: regs=r1 stack= before 4: (85) call pc+5")
+__msg("mark_precise: frame0: regs=r1 stack= before 3: (bf) r1 = r6")
+__msg("mark_precise: frame0: regs=r6 stack= before 2: (b7) r6 = 3")
+__naked int subprog_result_precise(void)
+{
+	asm volatile (
+		"r6 = 3;"
+		/* pass r6 through r1 into subprog to get it back as r0;
+		 * this whole chain will have to be marked as precise later
+		 */
+		"r1 = r6;"
+		"call identity_subprog;"
+		/* now use subprog's returned value (which is a
+		 * r6 -> r1 -> r0 chain), as index into vals array, forcing
+		 * all of that to be known precisely
+		 */
+		"r0 *= 4;"
+		"r1 = %[vals];"
+		/* here r0->r1->r6 chain is forced to be precise and has to be
+		 * propagated back to the beginning, including through the
+		 * subprog call
+		 */
+		"r1 += r0;"
+		"r0 = *(u32 *)(r1 + 0);"
+		"exit;"
+		:
+		: __imm_ptr(vals)
+		: __clobber_common, "r6"
+	);
+}
+
+SEC("?raw_tp")
+__success __log_level(2)
+__msg("9: (0f) r1 += r0")
+__msg("mark_precise: frame0: last_idx 9 first_idx 0")
+__msg("mark_precise: frame0: regs=r0 stack= before 8: (bf) r1 = r7")
+__msg("mark_precise: frame0: regs=r0 stack= before 7: (27) r0 *= 4")
+__msg("mark_precise: frame0: regs=r0 stack= before 5: (a5) if r0 < 0x4 goto pc+1")
+__msg("mark_precise: frame0: regs=r0 stack= before 4: (85) call pc+7")
+__naked int global_subprog_result_precise(void)
+{
+	asm volatile (
+		"r6 = 3;"
+		/* pass r6 through r1 into subprog to get it back as r0;
+		 * given global_identity_subprog is global, precision won't
+		 * propagate all the way back to r6
+		 */
+		"r1 = r6;"
+		"call global_identity_subprog;"
+		/* now use subprog's returned value (which is unknown now, so
+		 * we need to clamp it), as index into vals array, forcing r0
+		 * to be marked precise (with no effect on r6, though)
+		 */
+		"if r0 < %[vals_arr_sz] goto 1f;"
+		"r0 = %[vals_arr_sz] - 1;"
+	"1:"
+		"r0 *= 4;"
+		"r1 = %[vals];"
+		/* here r0 is forced to be precise and has to be
+		 * propagated back to the global subprog call, but it
+		 * shouldn't go all the way to mark r6 as precise
+		 */
+		"r1 += r0;"
+		"r0 = *(u32 *)(r1 + 0);"
+		"exit;"
+		:
+		: __imm_ptr(vals),
+		  __imm_const(vals_arr_sz, ARRAY_SIZE(vals))
+		: __clobber_common, "r6"
+	);
+}
+
+SEC("?raw_tp")
+__success __log_level(2)
+__msg("14: (0f) r1 += r6")
+__msg("mark_precise: frame0: last_idx 14 first_idx 10")
+__msg("mark_precise: frame0: regs=r6 stack= before 13: (bf) r1 = r7")
+__msg("mark_precise: frame0: regs=r6 stack= before 12: (27) r6 *= 4")
+__msg("mark_precise: frame0: regs=r6 stack= before 11: (25) if r6 > 0x3 goto pc+4")
+__msg("mark_precise: frame0: regs=r6 stack= before 10: (bf) r6 = r0")
+__msg("mark_precise: frame0: parent state regs=r0 stack=:")
+__msg("mark_precise: frame0: last_idx 18 first_idx 0")
+__msg("mark_precise: frame0: regs=r0 stack= before 18: (95) exit")
+__naked int callback_result_precise(void)
+{
+	asm volatile (
+		"r6 = 3;"
+
+		/* call subprog and use result; r0 shouldn't propagate back to
+		 * callback_subprog
+		 */
+		"r1 = r6;"			/* nr_loops */
+		"r2 = %[callback_subprog];"	/* callback_fn */
+		"r3 = 0;"			/* callback_ctx */
+		"r4 = 0;"			/* flags */
+		"call %[bpf_loop];"
+
+		"r6 = r0;"
+		"if r6 > 3 goto 1f;"
+		"r6 *= 4;"
+		"r1 = %[vals];"
+		/* here r6 is forced to be precise and has to be propagated
+		 * back to the bpf_loop() call, but not beyond
+		 */
+		"r1 += r6;"
+		"r0 = *(u32 *)(r1 + 0);"
+	"1:"
+		"exit;"
+		:
+		: __imm_ptr(vals),
+		  __imm_ptr(callback_subprog),
+		  __imm(bpf_loop)
+		: __clobber_common, "r6"
+	);
+}
+
+SEC("?raw_tp")
+__success __log_level(2)
+__msg("7: (0f) r1 += r6")
+__msg("mark_precise: frame0: last_idx 7 first_idx 0")
+__msg("mark_precise: frame0: regs=r6 stack= before 6: (bf) r1 = r7")
+__msg("mark_precise: frame0: regs=r6 stack= before 5: (27) r6 *= 4")
+__msg("mark_precise: frame0: regs=r6 stack= before 11: (95) exit")
+__msg("mark_precise: frame1: regs= stack= before 10: (bf) r0 = r1")
+__msg("mark_precise: frame1: regs= stack= before 4: (85) call pc+5")
+__msg("mark_precise: frame0: regs=r6 stack= before 3: (b7) r1 = 0")
+__msg("mark_precise: frame0: regs=r6 stack= before 2: (b7) r6 = 3")
+__naked int parent_callee_saved_reg_precise(void)
+{
+	asm volatile (
+		"r6 = 3;"
+
+		/* call subprog and ignore result; we need this call only to
+		 * complicate jump history
+		 */
+		"r1 = 0;"
+		"call identity_subprog;"
+
+		"r6 *= 4;"
+		"r1 = %[vals];"
+		/* here r6 is forced to be precise and has to be propagated
+		 * back to the beginning, handling (and ignoring) subprog call
+		 */
+		"r1 += r6;"
+		"r0 = *(u32 *)(r1 + 0);"
+		"exit;"
+		:
+		: __imm_ptr(vals)
+		: __clobber_common, "r6"
+	);
+}
+
+SEC("?raw_tp")
+__success __log_level(2)
+__msg("7: (0f) r1 += r6")
+__msg("mark_precise: frame0: last_idx 7 first_idx 0")
+__msg("mark_precise: frame0: regs=r6 stack= before 6: (bf) r1 = r7")
+__msg("mark_precise: frame0: regs=r6 stack= before 5: (27) r6 *= 4")
+__msg("mark_precise: frame0: regs=r6 stack= before 4: (85) call pc+5")
+__msg("mark_precise: frame0: regs=r6 stack= before 3: (b7) r1 = 0")
+__msg("mark_precise: frame0: regs=r6 stack= before 2: (b7) r6 = 3")
+__naked int parent_callee_saved_reg_precise_global(void)
+{
+	asm volatile (
+		"r6 = 3;"
+
+		/* call subprog and ignore result; we need this call only to
+		 * complicate jump history
+		 */
+		"r1 = 0;"
+		"call global_identity_subprog;"
+
+		"r6 *= 4;"
+		"r1 = %[vals];"
+		/* here r6 is forced to be precise and has to be propagated
+		 * back to the beginning, handling (and ignoring) subprog call
+		 */
+		"r1 += r6;"
+		"r0 = *(u32 *)(r1 + 0);"
+		"exit;"
+		:
+		: __imm_ptr(vals)
+		: __clobber_common, "r6"
+	);
+}
+
+SEC("?raw_tp")
+__success __log_level(2)
+__msg("12: (0f) r1 += r6")
+__msg("mark_precise: frame0: last_idx 12 first_idx 10")
+__msg("mark_precise: frame0: regs=r6 stack= before 11: (bf) r1 = r7")
+__msg("mark_precise: frame0: regs=r6 stack= before 10: (27) r6 *= 4")
+__msg("mark_precise: frame0: parent state regs=r6 stack=:")
+__msg("mark_precise: frame0: last_idx 16 first_idx 0")
+__msg("mark_precise: frame0: regs=r6 stack= before 16: (95) exit")
+__msg("mark_precise: frame1: regs= stack= before 15: (b7) r0 = 0")
+__msg("mark_precise: frame1: regs= stack= before 9: (85) call bpf_loop#181")
+__msg("mark_precise: frame0: regs=r6 stack= before 8: (b7) r4 = 0")
+__msg("mark_precise: frame0: regs=r6 stack= before 7: (b7) r3 = 0")
+__msg("mark_precise: frame0: regs=r6 stack= before 6: (bf) r2 = r8")
+__msg("mark_precise: frame0: regs=r6 stack= before 5: (b7) r1 = 1")
+__msg("mark_precise: frame0: regs=r6 stack= before 4: (b7) r6 = 3")
+__naked int parent_callee_saved_reg_precise_with_callback(void)
+{
+	asm volatile (
+		"r6 = 3;"
+
+		/* call subprog and ignore result; we need this call only to
+		 * complicate jump history
+		 */
+		"r1 = 1;"			/* nr_loops */
+		"r2 = %[callback_subprog];"	/* callback_fn */
+		"r3 = 0;"			/* callback_ctx */
+		"r4 = 0;"			/* flags */
+		"call %[bpf_loop];"
+
+		"r6 *= 4;"
+		"r1 = %[vals];"
+		/* here r6 is forced to be precise and has to be propagated
+		 * back to the beginning, handling (and ignoring) callback call
+		 */
+		"r1 += r6;"
+		"r0 = *(u32 *)(r1 + 0);"
+		"exit;"
+		:
+		: __imm_ptr(vals),
+		  __imm_ptr(callback_subprog),
+		  __imm(bpf_loop)
+		: __clobber_common, "r6"
+	);
+}
+
+SEC("?raw_tp")
+__success __log_level(2)
+__msg("9: (0f) r1 += r6")
+__msg("mark_precise: frame0: last_idx 9 first_idx 6")
+__msg("mark_precise: frame0: regs=r6 stack= before 8: (bf) r1 = r7")
+__msg("mark_precise: frame0: regs=r6 stack= before 7: (27) r6 *= 4")
+__msg("mark_precise: frame0: regs=r6 stack= before 6: (79) r6 = *(u64 *)(r10 -8)")
+__msg("mark_precise: frame0: parent state regs= stack=-8:")
+__msg("mark_precise: frame0: last_idx 13 first_idx 0")
+__msg("mark_precise: frame0: regs= stack=-8 before 13: (95) exit")
+__msg("mark_precise: frame1: regs= stack= before 12: (bf) r0 = r1")
+__msg("mark_precise: frame1: regs= stack= before 5: (85) call pc+6")
+__msg("mark_precise: frame0: regs= stack=-8 before 4: (b7) r1 = 0")
+__msg("mark_precise: frame0: regs= stack=-8 before 3: (7b) *(u64 *)(r10 -8) = r6")
+__msg("mark_precise: frame0: regs=r6 stack= before 2: (b7) r6 = 3")
+__naked int parent_stack_slot_precise(void)
+{
+	asm volatile (
+		/* spill reg */
+		"r6 = 3;"
+		"*(u64 *)(r10 - 8) = r6;"
+
+		/* call subprog and ignore result; we need this call only to
+		 * complicate jump history
+		 */
+		"r1 = 0;"
+		"call identity_subprog;"
+
+		/* restore reg from stack; in this case we'll be carrying
+		 * stack mask when going back into subprog through jump
+		 * history
+		 */
+		"r6 = *(u64 *)(r10 - 8);"
+
+		"r6 *= 4;"
+		"r1 = %[vals];"
+		/* here r6 is forced to be precise and has to be propagated
+		 * back to the beginning, handling (and ignoring) subprog call
+		 */
+		"r1 += r6;"
+		"r0 = *(u32 *)(r1 + 0);"
+		"exit;"
+		:
+		: __imm_ptr(vals)
+		: __clobber_common, "r6"
+	);
+}
+
+SEC("?raw_tp")
+__success __log_level(2)
+__msg("9: (0f) r1 += r6")
+__msg("mark_precise: frame0: last_idx 9 first_idx 6")
+__msg("mark_precise: frame0: regs=r6 stack= before 8: (bf) r1 = r7")
+__msg("mark_precise: frame0: regs=r6 stack= before 7: (27) r6 *= 4")
+__msg("mark_precise: frame0: regs=r6 stack= before 6: (79) r6 = *(u64 *)(r10 -8)")
+__msg("mark_precise: frame0: parent state regs= stack=-8:")
+__msg("mark_precise: frame0: last_idx 5 first_idx 0")
+__msg("mark_precise: frame0: regs= stack=-8 before 5: (85) call pc+6")
+__msg("mark_precise: frame0: regs= stack=-8 before 4: (b7) r1 = 0")
+__msg("mark_precise: frame0: regs= stack=-8 before 3: (7b) *(u64 *)(r10 -8) = r6")
+__msg("mark_precise: frame0: regs=r6 stack= before 2: (b7) r6 = 3")
+__naked int parent_stack_slot_precise_global(void)
+{
+	asm volatile (
+		/* spill reg */
+		"r6 = 3;"
+		"*(u64 *)(r10 - 8) = r6;"
+
+		/* call subprog and ignore result; we need this call only to
+		 * complicate jump history
+		 */
+		"r1 = 0;"
+		"call global_identity_subprog;"
+
+		/* restore reg from stack; in this case we'll be carrying
+		 * stack mask when going back into subprog through jump
+		 * history
+		 */
+		"r6 = *(u64 *)(r10 - 8);"
+
+		"r6 *= 4;"
+		"r1 = %[vals];"
+		/* here r6 is forced to be precise and has to be propagated
+		 * back to the beginning, handling (and ignoring) subprog call
+		 */
+		"r1 += r6;"
+		"r0 = *(u32 *)(r1 + 0);"
+		"exit;"
+		:
+		: __imm_ptr(vals)
+		: __clobber_common, "r6"
+	);
+}
+
+SEC("?raw_tp")
+__success __log_level(2)
+__msg("14: (0f) r1 += r6")
+__msg("mark_precise: frame0: last_idx 14 first_idx 11")
+__msg("mark_precise: frame0: regs=r6 stack= before 13: (bf) r1 = r7")
+__msg("mark_precise: frame0: regs=r6 stack= before 12: (27) r6 *= 4")
+__msg("mark_precise: frame0: regs=r6 stack= before 11: (79) r6 = *(u64 *)(r10 -8)")
+__msg("mark_precise: frame0: parent state regs= stack=-8:")
+__msg("mark_precise: frame0: last_idx 18 first_idx 0")
+__msg("mark_precise: frame0: regs= stack=-8 before 18: (95) exit")
+__msg("mark_precise: frame1: regs= stack= before 17: (b7) r0 = 0")
+__msg("mark_precise: frame1: regs= stack= before 10: (85) call bpf_loop#181")
+__msg("mark_precise: frame0: regs= stack=-8 before 9: (b7) r4 = 0")
+__msg("mark_precise: frame0: regs= stack=-8 before 8: (b7) r3 = 0")
+__msg("mark_precise: frame0: regs= stack=-8 before 7: (bf) r2 = r8")
+__msg("mark_precise: frame0: regs= stack=-8 before 6: (bf) r1 = r6")
+__msg("mark_precise: frame0: regs= stack=-8 before 5: (7b) *(u64 *)(r10 -8) = r6")
+__msg("mark_precise: frame0: regs=r6 stack= before 4: (b7) r6 = 3")
+__naked int parent_stack_slot_precise_with_callback(void)
+{
+	asm volatile (
+		/* spill reg */
+		"r6 = 3;"
+		"*(u64 *)(r10 - 8) = r6;"
+
+		/* ensure we have callback frame in jump history */
+		"r1 = r6;"			/* nr_loops */
+		"r2 = %[callback_subprog];"	/* callback_fn */
+		"r3 = 0;"			/* callback_ctx */
+		"r4 = 0;"			/* flags */
+		"call %[bpf_loop];"
+
+		/* restore reg from stack; in this case we'll be carrying
+		 * stack mask when going back into subprog through jump
+		 * history
+		 */
+		"r6 = *(u64 *)(r10 - 8);"
+
+		"r6 *= 4;"
+		"r1 = %[vals];"
+		/* here r6 is forced to be precise and has to be propagated
+		 * back to the beginning, handling (and ignoring) subprog call
+		 */
+		"r1 += r6;"
+		"r0 = *(u32 *)(r1 + 0);"
+		"exit;"
+		:
+		: __imm_ptr(vals),
+		  __imm_ptr(callback_subprog),
+		  __imm(bpf_loop)
+		: __clobber_common, "r6"
+	);
+}
+
+__noinline __used
+static __u64 subprog_with_precise_arg(__u64 x)
+{
+	return vals[x]; /* x is forced to be precise */
+}
+
+SEC("?raw_tp")
+__success __log_level(2)
+__msg("8: (0f) r2 += r1")
+__msg("mark_precise: frame1: last_idx 8 first_idx 0")
+__msg("mark_precise: frame1: regs=r1 stack= before 6: (18) r2 = ")
+__msg("mark_precise: frame1: regs=r1 stack= before 5: (67) r1 <<= 2")
+__msg("mark_precise: frame1: regs=r1 stack= before 2: (85) call pc+2")
+__msg("mark_precise: frame0: regs=r1 stack= before 1: (bf) r1 = r6")
+__msg("mark_precise: frame0: regs=r6 stack= before 0: (b7) r6 = 3")
+__naked int subprog_arg_precise(void)
+{
+	asm volatile (
+		"r6 = 3;"
+		"r1 = r6;"
+		/* subprog_with_precise_arg expects its argument to be
+		 * precise, so r1->r6 will be marked precise from inside the
+		 * subprog
+		 */
+		"call subprog_with_precise_arg;"
+		"r0 += r6;"
+		"exit;"
+		:
+		:
+		: __clobber_common, "r6"
+	);
+}
+
+/* r1 is pointer to stack slot;
+ * r2 is a register to spill into that slot
+ * subprog also spills r2 into its own stack slot
+ */
+__naked __noinline __used
+static __u64 subprog_spill_reg_precise(void)
+{
+	asm volatile (
+		/* spill to parent stack */
+		"*(u64 *)(r1 + 0) = r2;"
+		/* spill to subprog stack (we use -16 offset to avoid
+		 * accidental confusion with parent's -8 stack slot in
+		 * verifier log output)
+		 */
+		"*(u64 *)(r10 - 16) = r2;"
+		/* use both spills as return result to propagete precision everywhere */
+		"r0 = *(u64 *)(r10 - 16);"
+		"r2 = *(u64 *)(r1 + 0);"
+		"r0 += r2;"
+		"exit;"
+	);
+}
+
+SEC("?raw_tp")
+__success __log_level(2)
+/* precision backtracking can't currently handle stack access not through r10,
+ * so we won't be able to mark stack slot fp-8 as precise, and so will
+ * fallback to forcing all as precise
+ */
+__msg("mark_precise: frame0: falling back to forcing all scalars precise")
+__naked int subprog_spill_into_parent_stack_slot_precise(void)
+{
+	asm volatile (
+		"r6 = 1;"
+
+		/* pass pointer to stack slot and r6 to subprog;
+		 * r6 will be marked precise and spilled into fp-8 slot, which
+		 * also should be marked precise
+		 */
+		"r1 = r10;"
+		"r1 += -8;"
+		"r2 = r6;"
+		"call subprog_spill_reg_precise;"
+
+		/* restore reg from stack; in this case we'll be carrying
+		 * stack mask when going back into subprog through jump
+		 * history
+		 */
+		"r7 = *(u64 *)(r10 - 8);"
+
+		"r7 *= 4;"
+		"r1 = %[vals];"
+		/* here r7 is forced to be precise and has to be propagated
+		 * back to the beginning, handling subprog call and logic
+		 */
+		"r1 += r7;"
+		"r0 = *(u32 *)(r1 + 0);"
+		"exit;"
+		:
+		: __imm_ptr(vals)
+		: __clobber_common, "r6", "r7"
+	);
+}
+
+__naked __noinline __used
+static __u64 subprog_with_checkpoint(void)
+{
+	asm volatile (
+		"r0 = 0;"
+		/* guaranteed checkpoint if BPF_F_TEST_STATE_FREQ is used */
+		"goto +0;"
+		"exit;"
+	);
+}
+
+char _license[] SEC("license") = "GPL";

From c91ab90cea7ae61334c7026daf310f5875dfdee7 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 4 May 2023 21:33:17 -0700
Subject: [PATCH 35/57] selftests/bpf: revert iter test subprog precision
 workaround

Now that precision propagation is supported fully in the presence of
subprogs, there is no need to work around iter test. Revert original
workaround.

This reverts be7dbd275dc6 ("selftests/bpf: avoid mark_all_scalars_precise() trigger in one of iter tests").

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230505043317.3629845-11-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/iters.c | 26 ++++++++++-------------
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c
index be16143ae292..6b9b3c56f009 100644
--- a/tools/testing/selftests/bpf/progs/iters.c
+++ b/tools/testing/selftests/bpf/progs/iters.c
@@ -651,29 +651,25 @@ int iter_stack_array_loop(const void *ctx)
 	return sum;
 }
 
-#define ARR_SZ 16
-
-static __noinline void fill(struct bpf_iter_num *it, int *arr, int mul)
+static __noinline void fill(struct bpf_iter_num *it, int *arr, __u32 n, int mul)
 {
-	int *t;
-	__u64 i;
+	int *t, i;
 
 	while ((t = bpf_iter_num_next(it))) {
 		i = *t;
-		if (i >= ARR_SZ)
+		if (i >= n)
 			break;
 		arr[i] =  i * mul;
 	}
 }
 
-static __noinline int sum(struct bpf_iter_num *it, int *arr)
+static __noinline int sum(struct bpf_iter_num *it, int *arr, __u32 n)
 {
-	int *t, sum = 0;;
-	__u64 i;
+	int *t, i, sum = 0;;
 
 	while ((t = bpf_iter_num_next(it))) {
 		i = *t;
-		if (i >= ARR_SZ)
+		if (i >= n)
 			break;
 		sum += arr[i];
 	}
@@ -685,7 +681,7 @@ SEC("raw_tp")
 __success
 int iter_pass_iter_ptr_to_subprog(const void *ctx)
 {
-	int arr1[ARR_SZ], arr2[ARR_SZ];
+	int arr1[16], arr2[32];
 	struct bpf_iter_num it;
 	int n, sum1, sum2;
 
@@ -694,25 +690,25 @@ int iter_pass_iter_ptr_to_subprog(const void *ctx)
 	/* fill arr1 */
 	n = ARRAY_SIZE(arr1);
 	bpf_iter_num_new(&it, 0, n);
-	fill(&it, arr1, 2);
+	fill(&it, arr1, n, 2);
 	bpf_iter_num_destroy(&it);
 
 	/* fill arr2 */
 	n = ARRAY_SIZE(arr2);
 	bpf_iter_num_new(&it, 0, n);
-	fill(&it, arr2, 10);
+	fill(&it, arr2, n, 10);
 	bpf_iter_num_destroy(&it);
 
 	/* sum arr1 */
 	n = ARRAY_SIZE(arr1);
 	bpf_iter_num_new(&it, 0, n);
-	sum1 = sum(&it, arr1);
+	sum1 = sum(&it, arr1, n);
 	bpf_iter_num_destroy(&it);
 
 	/* sum arr2 */
 	n = ARRAY_SIZE(arr2);
 	bpf_iter_num_new(&it, 0, n);
-	sum2 = sum(&it, arr2);
+	sum2 = sum(&it, arr2, n);
 	bpf_iter_num_destroy(&it);
 
 	bpf_printk("sum1=%d, sum2=%d", sum1, sum2);

From 69535186297b37e6e0a16290766666f4e8a55793 Mon Sep 17 00:00:00 2001
From: Will Hawkins <whh8b@obs.cr>
Date: Thu, 27 Apr 2023 22:30:15 -0400
Subject: [PATCH 36/57] bpf, docs: Update llvm_relocs.rst with typo fixes

Correct a few typographical errors and fix some mistakes in examples.

Signed-off-by: Will Hawkins <hawkinsw@obs.cr>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/r/20230428023015.1698072-2-hawkinsw@obs.cr
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 Documentation/bpf/llvm_reloc.rst | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/Documentation/bpf/llvm_reloc.rst b/Documentation/bpf/llvm_reloc.rst
index ca8957d5b671..e4a777a6a3a2 100644
--- a/Documentation/bpf/llvm_reloc.rst
+++ b/Documentation/bpf/llvm_reloc.rst
@@ -48,7 +48,7 @@ the code with ``llvm-objdump -dr test.o``::
       14:       0f 10 00 00 00 00 00 00 r0 += r1
       15:       95 00 00 00 00 00 00 00 exit
 
-There are four relations in the above for four ``LD_imm64`` instructions.
+There are four relocations in the above for four ``LD_imm64`` instructions.
 The following ``llvm-readelf -r test.o`` shows the binary values of the four
 relocations::
 
@@ -79,14 +79,16 @@ The following is the symbol table with ``llvm-readelf -s test.o``::
 The 6th entry is global variable ``g1`` with value 0.
 
 Similarly, the second relocation is at ``.text`` offset ``0x18``, instruction 3,
-for global variable ``g2`` which has a symbol value 4, the offset
-from the start of ``.data`` section.
+has a type of ``R_BPF_64_64`` and refers to entry 7 in the symbol table.
+The second relocation resolves to global variable ``g2`` which has a symbol
+value 4. The symbol value represents the offset from the start of ``.data``
+section where the initial value of the global variable ``g2`` is stored.
 
-The third and fourth relocations refers to static variables ``l1``
-and ``l2``. From ``.rel.text`` section above, it is not clear
-which symbols they really refers to as they both refers to
+The third and fourth relocations refer to static variables ``l1``
+and ``l2``. From the ``.rel.text`` section above, it is not clear
+to which symbols they really refer as they both refer to
 symbol table entry 4, symbol ``sec``, which has ``STT_SECTION`` type
-and represents a section. So for static variable or function,
+and represents a section. So for a static variable or function,
 the section offset is written to the original insn
 buffer, which is called ``A`` (addend). Looking at
 above insn ``7`` and ``11``, they have section offset ``8`` and ``12``.

From f4dea9689c5fea3d07170c2cb0703e216f1a0922 Mon Sep 17 00:00:00 2001
From: Pengcheng Yang <yangpc@wangsu.com>
Date: Fri, 5 May 2023 16:50:58 +0800
Subject: [PATCH 37/57] samples/bpf: Fix buffer overflow in tcp_basertt

Using sizeof(nv) or strlen(nv)+1 is correct.

Fixes: c890063e4404 ("bpf: sample BPF_SOCKET_OPS_BASE_RTT program")
Signed-off-by: Pengcheng Yang <yangpc@wangsu.com>
Link: https://lore.kernel.org/r/1683276658-2860-1-git-send-email-yangpc@wangsu.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 samples/bpf/tcp_basertt_kern.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samples/bpf/tcp_basertt_kern.c b/samples/bpf/tcp_basertt_kern.c
index 8dfe09a92fec..822b0742b815 100644
--- a/samples/bpf/tcp_basertt_kern.c
+++ b/samples/bpf/tcp_basertt_kern.c
@@ -47,7 +47,7 @@ int bpf_basertt(struct bpf_sock_ops *skops)
 		case BPF_SOCK_OPS_BASE_RTT:
 			n = bpf_getsockopt(skops, SOL_TCP, TCP_CONGESTION,
 					   cong, sizeof(cong));
-			if (!n && !__builtin_memcmp(cong, nv, sizeof(nv)+1)) {
+			if (!n && !__builtin_memcmp(cong, nv, sizeof(nv))) {
 				/* Set base_rtt to 80us */
 				rv = 80;
 			} else if (n) {

From b5ad4cdc46c7d6e7f8d2c9e24b6c9a1edec95154 Mon Sep 17 00:00:00 2001
From: Feng Zhou <zhoufeng.zf@bytedance.com>
Date: Sat, 6 May 2023 11:15:44 +0800
Subject: [PATCH 38/57] bpf: Add bpf_task_under_cgroup() kfunc

Add a kfunc that's similar to the bpf_current_task_under_cgroup.
The difference is that it is a designated task.

When hook sched related functions, sometimes it is necessary to
specify a task instead of the current task.

Signed-off-by: Feng Zhou <zhoufeng.zf@bytedance.com>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/r/20230506031545.35991-2-zhoufeng.zf@bytedance.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/helpers.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index bb6b4637ebf2..a128fe0ab2d0 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2149,6 +2149,22 @@ __bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid)
 		return NULL;
 	return cgrp;
 }
+
+/**
+ * bpf_task_under_cgroup - wrap task_under_cgroup_hierarchy() as a kfunc, test
+ * task's membership of cgroup ancestry.
+ * @task: the task to be tested
+ * @ancestor: possible ancestor of @task's cgroup
+ *
+ * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor.
+ * It follows all the same rules as cgroup_is_descendant, and only applies
+ * to the default hierarchy.
+ */
+__bpf_kfunc long bpf_task_under_cgroup(struct task_struct *task,
+				       struct cgroup *ancestor)
+{
+	return task_under_cgroup_hierarchy(task, ancestor);
+}
 #endif /* CONFIG_CGROUPS */
 
 /**
@@ -2400,6 +2416,7 @@ BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE)
 BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU)
 #endif
 BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
 BTF_SET8_END(generic_btf_ids)

From 49e0263ab40f15a11cefa397bb17a0204505cead Mon Sep 17 00:00:00 2001
From: Feng Zhou <zhoufeng.zf@bytedance.com>
Date: Sat, 6 May 2023 11:15:45 +0800
Subject: [PATCH 39/57] selftests/bpf: Add testcase for bpf_task_under_cgroup

test_progs:
Tests new kfunc bpf_task_under_cgroup().

The bpf program saves the new task's pid within a given cgroup to
the remote_pid, which is convenient for the user-mode program to
verify the test correctness.

The user-mode program creates its own mount namespace, and mounts the
cgroupsv2 hierarchy in there, call the fork syscall, then check if
remote_pid and local_pid are unequal.

Signed-off-by: Feng Zhou <zhoufeng.zf@bytedance.com>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/r/20230506031545.35991-3-zhoufeng.zf@bytedance.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/DENYLIST.s390x    |  1 +
 .../bpf/prog_tests/task_under_cgroup.c        | 53 +++++++++++++++++++
 .../bpf/progs/test_task_under_cgroup.c        | 51 ++++++++++++++++++
 3 files changed, 105 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/task_under_cgroup.c
 create mode 100644 tools/testing/selftests/bpf/progs/test_task_under_cgroup.c

diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x
index c7463f3ec3c0..5061d9e24c16 100644
--- a/tools/testing/selftests/bpf/DENYLIST.s390x
+++ b/tools/testing/selftests/bpf/DENYLIST.s390x
@@ -26,3 +26,4 @@ user_ringbuf                             # failed to find kernel BTF type ID of
 verif_stats                              # trace_vprintk__open_and_load unexpected error: -9                           (?)
 xdp_bonding                              # failed to auto-attach program 'trace_on_entry': -524                        (trampoline)
 xdp_metadata                             # JIT does not support calling kernel function                                (kfunc)
+test_task_under_cgroup                   # JIT does not support calling kernel function                                (kfunc)
diff --git a/tools/testing/selftests/bpf/prog_tests/task_under_cgroup.c b/tools/testing/selftests/bpf/prog_tests/task_under_cgroup.c
new file mode 100644
index 000000000000..4224727fb364
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/task_under_cgroup.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Bytedance */
+
+#include <sys/syscall.h>
+#include <test_progs.h>
+#include <cgroup_helpers.h>
+#include "test_task_under_cgroup.skel.h"
+
+#define FOO	"/foo"
+
+void test_task_under_cgroup(void)
+{
+	struct test_task_under_cgroup *skel;
+	int ret, foo;
+	pid_t pid;
+
+	foo = test__join_cgroup(FOO);
+	if (!ASSERT_OK(foo < 0, "cgroup_join_foo"))
+		return;
+
+	skel = test_task_under_cgroup__open();
+	if (!ASSERT_OK_PTR(skel, "test_task_under_cgroup__open"))
+		goto cleanup;
+
+	skel->rodata->local_pid = getpid();
+	skel->bss->remote_pid = getpid();
+	skel->rodata->cgid = get_cgroup_id(FOO);
+
+	ret = test_task_under_cgroup__load(skel);
+	if (!ASSERT_OK(ret, "test_task_under_cgroup__load"))
+		goto cleanup;
+
+	ret = test_task_under_cgroup__attach(skel);
+	if (!ASSERT_OK(ret, "test_task_under_cgroup__attach"))
+		goto cleanup;
+
+	pid = fork();
+	if (pid == 0)
+		exit(0);
+
+	ret = (pid == -1);
+	if (ASSERT_OK(ret, "fork process"))
+		wait(NULL);
+
+	test_task_under_cgroup__detach(skel);
+
+	ASSERT_NEQ(skel->bss->remote_pid, skel->rodata->local_pid,
+		   "test task_under_cgroup");
+
+cleanup:
+	test_task_under_cgroup__destroy(skel);
+	close(foo);
+}
diff --git a/tools/testing/selftests/bpf/progs/test_task_under_cgroup.c b/tools/testing/selftests/bpf/progs/test_task_under_cgroup.c
new file mode 100644
index 000000000000..56cdc0a553f0
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_task_under_cgroup.c
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Bytedance */
+
+#include <vmlinux.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_helpers.h>
+
+#include "bpf_misc.h"
+
+struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym;
+long bpf_task_under_cgroup(struct task_struct *task, struct cgroup *ancestor) __ksym;
+void bpf_cgroup_release(struct cgroup *p) __ksym;
+struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym;
+void bpf_task_release(struct task_struct *p) __ksym;
+
+const volatile int local_pid;
+const volatile __u64 cgid;
+int remote_pid;
+
+SEC("tp_btf/task_newtask")
+int BPF_PROG(handle__task_newtask, struct task_struct *task, u64 clone_flags)
+{
+	struct cgroup *cgrp = NULL;
+	struct task_struct *acquired;
+
+	if (local_pid != (bpf_get_current_pid_tgid() >> 32))
+		return 0;
+
+	acquired = bpf_task_acquire(task);
+	if (!acquired)
+		return 0;
+
+	if (local_pid == acquired->tgid)
+		goto out;
+
+	cgrp = bpf_cgroup_from_id(cgid);
+	if (!cgrp)
+		goto out;
+
+	if (bpf_task_under_cgroup(acquired, cgrp))
+		remote_pid = acquired->tgid;
+
+out:
+	if (cgrp)
+		bpf_cgroup_release(cgrp);
+	bpf_task_release(acquired);
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";

From 3bda08b63670c39be390fcb00e7718775508e673 Mon Sep 17 00:00:00 2001
From: Daniel Rosenberg <drosen@google.com>
Date: Fri, 5 May 2023 18:31:30 -0700
Subject: [PATCH 40/57] bpf: Allow NULL buffers in bpf_dynptr_slice(_rw)

bpf_dynptr_slice(_rw) uses a user provided buffer if it can not provide
a pointer to a block of contiguous memory. This buffer is unused in the
case of local dynptrs, and may be unused in other cases as well. There
is no need to require the buffer, as the kfunc can just return NULL if
it was needed and not provided.

This adds another kfunc annotation, __opt, which combines with __sz and
__szk to allow the buffer associated with the size to be NULL. If the
buffer is NULL, the verifier does not check that the buffer is of
sufficient size.

Signed-off-by: Daniel Rosenberg <drosen@google.com>
Link: https://lore.kernel.org/r/20230506013134.2492210-2-drosen@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 Documentation/bpf/kfuncs.rst | 23 ++++++++++++++++++++++-
 include/linux/skbuff.h       |  2 +-
 kernel/bpf/helpers.c         | 30 ++++++++++++++++++------------
 kernel/bpf/verifier.c        | 17 +++++++++++++----
 4 files changed, 54 insertions(+), 18 deletions(-)

diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst
index ea2516374d92..7a3d9de5f315 100644
--- a/Documentation/bpf/kfuncs.rst
+++ b/Documentation/bpf/kfuncs.rst
@@ -100,7 +100,7 @@ Hence, whenever a constant scalar argument is accepted by a kfunc which is not a
 size parameter, and the value of the constant matters for program safety, __k
 suffix should be used.
 
-2.2.2 __uninit Annotation
+2.2.3 __uninit Annotation
 -------------------------
 
 This annotation is used to indicate that the argument will be treated as
@@ -117,6 +117,27 @@ Here, the dynptr will be treated as an uninitialized dynptr. Without this
 annotation, the verifier will reject the program if the dynptr passed in is
 not initialized.
 
+2.2.4 __opt Annotation
+-------------------------
+
+This annotation is used to indicate that the buffer associated with an __sz or __szk
+argument may be null. If the function is passed a nullptr in place of the buffer,
+the verifier will not check that length is appropriate for the buffer. The kfunc is
+responsible for checking if this buffer is null before using it.
+
+An example is given below::
+
+        __bpf_kfunc void *bpf_dynptr_slice(..., void *buffer__opt, u32 buffer__szk)
+        {
+        ...
+        }
+
+Here, the buffer may be null. If buffer is not null, it at least of size buffer_szk.
+Either way, the returned buffer is either NULL, or of size buffer_szk. Without this
+annotation, the verifier will reject the program if a null pointer is passed in with
+a nonzero size.
+
+
 .. _BPF_kfunc_nodef:
 
 2.3 Using an existing kernel function
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 738776ab8838..8ddb4af1a501 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4033,7 +4033,7 @@ __skb_header_pointer(const struct sk_buff *skb, int offset, int len,
 	if (likely(hlen - offset >= len))
 		return (void *)data + offset;
 
-	if (!skb || unlikely(skb_copy_bits(skb, offset, buffer, len) < 0))
+	if (!skb || !buffer || unlikely(skb_copy_bits(skb, offset, buffer, len) < 0))
 		return NULL;
 
 	return buffer;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index a128fe0ab2d0..4ef4c4f8a355 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2190,13 +2190,15 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
  * bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data.
  * @ptr: The dynptr whose data slice to retrieve
  * @offset: Offset into the dynptr
- * @buffer: User-provided buffer to copy contents into
- * @buffer__szk: Size (in bytes) of the buffer. This is the length of the
- *		 requested slice. This must be a constant.
+ * @buffer__opt: User-provided buffer to copy contents into.  May be NULL
+ * @buffer__szk: Size (in bytes) of the buffer if present. This is the
+ *               length of the requested slice. This must be a constant.
  *
  * For non-skb and non-xdp type dynptrs, there is no difference between
  * bpf_dynptr_slice and bpf_dynptr_data.
  *
+ *  If buffer__opt is NULL, the call will fail if buffer_opt was needed.
+ *
  * If the intention is to write to the data slice, please use
  * bpf_dynptr_slice_rdwr.
  *
@@ -2213,7 +2215,7 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
  * direct pointer)
  */
 __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset,
-				   void *buffer, u32 buffer__szk)
+				   void *buffer__opt, u32 buffer__szk)
 {
 	enum bpf_dynptr_type type;
 	u32 len = buffer__szk;
@@ -2233,15 +2235,17 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset
 	case BPF_DYNPTR_TYPE_RINGBUF:
 		return ptr->data + ptr->offset + offset;
 	case BPF_DYNPTR_TYPE_SKB:
-		return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer);
+		return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer__opt);
 	case BPF_DYNPTR_TYPE_XDP:
 	{
 		void *xdp_ptr = bpf_xdp_pointer(ptr->data, ptr->offset + offset, len);
 		if (xdp_ptr)
 			return xdp_ptr;
 
-		bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer, len, false);
-		return buffer;
+		if (!buffer__opt)
+			return NULL;
+		bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__opt, len, false);
+		return buffer__opt;
 	}
 	default:
 		WARN_ONCE(true, "unknown dynptr type %d\n", type);
@@ -2253,13 +2257,15 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset
  * bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data.
  * @ptr: The dynptr whose data slice to retrieve
  * @offset: Offset into the dynptr
- * @buffer: User-provided buffer to copy contents into
- * @buffer__szk: Size (in bytes) of the buffer. This is the length of the
- *		 requested slice. This must be a constant.
+ * @buffer__opt: User-provided buffer to copy contents into. May be NULL
+ * @buffer__szk: Size (in bytes) of the buffer if present. This is the
+ *               length of the requested slice. This must be a constant.
  *
  * For non-skb and non-xdp type dynptrs, there is no difference between
  * bpf_dynptr_slice and bpf_dynptr_data.
  *
+ * If buffer__opt is NULL, the call will fail if buffer_opt was needed.
+ *
  * The returned pointer is writable and may point to either directly the dynptr
  * data at the requested offset or to the buffer if unable to obtain a direct
  * data pointer to (example: the requested slice is to the paged area of an skb
@@ -2290,7 +2296,7 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset
  * direct pointer)
  */
 __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 offset,
-					void *buffer, u32 buffer__szk)
+					void *buffer__opt, u32 buffer__szk)
 {
 	if (!ptr->data || __bpf_dynptr_is_rdonly(ptr))
 		return NULL;
@@ -2317,7 +2323,7 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 o
 	 * will be copied out into the buffer and the user will need to call
 	 * bpf_dynptr_write() to commit changes.
 	 */
-	return bpf_dynptr_slice(ptr, offset, buffer, buffer__szk);
+	return bpf_dynptr_slice(ptr, offset, buffer__opt, buffer__szk);
 }
 
 __bpf_kfunc int bpf_dynptr_adjust(struct bpf_dynptr_kern *ptr, u32 start, u32 end)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 0fa96581eb77..7e6bbae9db81 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9743,6 +9743,11 @@ static bool is_kfunc_arg_const_mem_size(const struct btf *btf,
 	return __kfunc_param_match_suffix(btf, arg, "__szk");
 }
 
+static bool is_kfunc_arg_optional(const struct btf *btf, const struct btf_param *arg)
+{
+	return __kfunc_param_match_suffix(btf, arg, "__opt");
+}
+
 static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg)
 {
 	return __kfunc_param_match_suffix(btf, arg, "__k");
@@ -10830,13 +10835,17 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			break;
 		case KF_ARG_PTR_TO_MEM_SIZE:
 		{
+			struct bpf_reg_state *buff_reg = &regs[regno];
+			const struct btf_param *buff_arg = &args[i];
 			struct bpf_reg_state *size_reg = &regs[regno + 1];
 			const struct btf_param *size_arg = &args[i + 1];
 
-			ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1);
-			if (ret < 0) {
-				verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1);
-				return ret;
+			if (!register_is_null(buff_reg) || !is_kfunc_arg_optional(meta->btf, buff_arg)) {
+				ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1);
+				if (ret < 0) {
+					verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1);
+					return ret;
+				}
 			}
 
 			if (is_kfunc_arg_const_mem_size(meta->btf, size_arg, size_reg)) {

From 1ce33b6c846fbe0439eeee477b767de4bc3ad35f Mon Sep 17 00:00:00 2001
From: Daniel Rosenberg <drosen@google.com>
Date: Fri, 5 May 2023 18:31:31 -0700
Subject: [PATCH 41/57] selftests/bpf: Test allowing NULL buffer in dynptr
 slice

bpf_dynptr_slice(_rw) no longer requires a buffer for verification. If the
buffer is needed, but not present, the function will return NULL.

Signed-off-by: Daniel Rosenberg <drosen@google.com>
Link: https://lore.kernel.org/r/20230506013134.2492210-3-drosen@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/dynptr.c |  1 +
 .../selftests/bpf/progs/dynptr_success.c        | 17 +++++++++++++++++
 2 files changed, 18 insertions(+)

diff --git a/tools/testing/selftests/bpf/prog_tests/dynptr.c b/tools/testing/selftests/bpf/prog_tests/dynptr.c
index 0478916aff37..13d4b9ab16e7 100644
--- a/tools/testing/selftests/bpf/prog_tests/dynptr.c
+++ b/tools/testing/selftests/bpf/prog_tests/dynptr.c
@@ -26,6 +26,7 @@ static struct {
 	{"test_dynptr_is_null", SETUP_SYSCALL_SLEEP},
 	{"test_dynptr_is_rdonly", SETUP_SKB_PROG},
 	{"test_dynptr_clone", SETUP_SKB_PROG},
+	{"test_dynptr_skb_no_buff", SETUP_SKB_PROG},
 };
 
 static void verify_success(const char *prog_name, enum test_setup_type setup_type)
diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c
index be7de62de045..d299ef3b4d1f 100644
--- a/tools/testing/selftests/bpf/progs/dynptr_success.c
+++ b/tools/testing/selftests/bpf/progs/dynptr_success.c
@@ -505,3 +505,20 @@ int test_dynptr_clone(struct __sk_buff *skb)
 
 	return 0;
 }
+
+SEC("?cgroup_skb/egress")
+int test_dynptr_skb_no_buff(struct __sk_buff *skb)
+{
+	struct bpf_dynptr ptr;
+	__u64 *data;
+
+	if (bpf_dynptr_from_skb(skb, 0, &ptr)) {
+		err = 1;
+		return 1;
+	}
+
+	/* This may return NULL. SKB may require a buffer */
+	data = bpf_dynptr_slice(&ptr, 0, NULL, 1);
+
+	return !!data;
+}

From 3881fdfed21ff129a23979c0a92df6d3c5f49aa9 Mon Sep 17 00:00:00 2001
From: Daniel Rosenberg <drosen@google.com>
Date: Fri, 5 May 2023 18:31:32 -0700
Subject: [PATCH 42/57] selftests/bpf: Check overflow in optional buffer

This ensures we still reject invalid memory accesses in buffers that are
marked optional.

Signed-off-by: Daniel Rosenberg <drosen@google.com>
Link: https://lore.kernel.org/r/20230506013134.2492210-4-drosen@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../testing/selftests/bpf/progs/dynptr_fail.c | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c
index efe4ce72d00e..c2f0e18af951 100644
--- a/tools/testing/selftests/bpf/progs/dynptr_fail.c
+++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c
@@ -1665,3 +1665,23 @@ int clone_xdp_packet_data(struct xdp_md *xdp)
 
 	return 0;
 }
+
+/* Buffers that are provided must be sufficiently long */
+SEC("?cgroup_skb/egress")
+__failure __msg("memory, len pair leads to invalid memory access")
+int test_dynptr_skb_small_buff(struct __sk_buff *skb)
+{
+	struct bpf_dynptr ptr;
+	char buffer[8] = {};
+	__u64 *data;
+
+	if (bpf_dynptr_from_skb(skb, 0, &ptr)) {
+		err = 1;
+		return 1;
+	}
+
+	/* This may return NULL. SKB may require a buffer */
+	data = bpf_dynptr_slice(&ptr, 0, buffer, 9);
+
+	return !!data;
+}

From 2012c867c8005d72c949e274133df429ece78808 Mon Sep 17 00:00:00 2001
From: Daniel Rosenberg <drosen@google.com>
Date: Fri, 5 May 2023 18:31:33 -0700
Subject: [PATCH 43/57] bpf: verifier: Accept dynptr mem as mem in helpers

This allows using memory retrieved from dynptrs with helper functions
that accept ARG_PTR_TO_MEM. For instance, results from bpf_dynptr_data
can be passed along to bpf_strncmp.

Signed-off-by: Daniel Rosenberg <drosen@google.com>
Link: https://lore.kernel.org/r/20230506013134.2492210-5-drosen@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7e6bbae9db81..754129d41225 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7495,12 +7495,16 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
 	 * ARG_PTR_TO_MEM + MAYBE_NULL is compatible with PTR_TO_MEM and PTR_TO_MEM + MAYBE_NULL,
 	 * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM but NOT with PTR_TO_MEM + MAYBE_NULL
 	 *
+	 * ARG_PTR_TO_MEM is compatible with PTR_TO_MEM that is tagged with a dynptr type.
+	 *
 	 * Therefore we fold these flags depending on the arg_type before comparison.
 	 */
 	if (arg_type & MEM_RDONLY)
 		type &= ~MEM_RDONLY;
 	if (arg_type & PTR_MAYBE_NULL)
 		type &= ~PTR_MAYBE_NULL;
+	if (base_type(arg_type) == ARG_PTR_TO_MEM)
+		type &= ~DYNPTR_TYPE_FLAG_MASK;
 
 	if (meta->func_id == BPF_FUNC_kptr_xchg && type & MEM_ALLOC)
 		type &= ~MEM_ALLOC;

From 798e48fc28fa64aa4eca6e8a404fa20ac8f7c09e Mon Sep 17 00:00:00 2001
From: Daniel Rosenberg <drosen@google.com>
Date: Fri, 5 May 2023 18:31:34 -0700
Subject: [PATCH 44/57] selftests/bpf: Accept mem from dynptr in helper funcs

This ensures that buffers retrieved from dynptr_data are allowed to be
passed in to helpers that take mem, like bpf_strncmp

Signed-off-by: Daniel Rosenberg <drosen@google.com>
Link: https://lore.kernel.org/r/20230506013134.2492210-6-drosen@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../testing/selftests/bpf/prog_tests/dynptr.c |  1 +
 .../selftests/bpf/progs/dynptr_success.c      | 21 +++++++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/tools/testing/selftests/bpf/prog_tests/dynptr.c b/tools/testing/selftests/bpf/prog_tests/dynptr.c
index 13d4b9ab16e7..7cfac53c0d58 100644
--- a/tools/testing/selftests/bpf/prog_tests/dynptr.c
+++ b/tools/testing/selftests/bpf/prog_tests/dynptr.c
@@ -27,6 +27,7 @@ static struct {
 	{"test_dynptr_is_rdonly", SETUP_SKB_PROG},
 	{"test_dynptr_clone", SETUP_SKB_PROG},
 	{"test_dynptr_skb_no_buff", SETUP_SKB_PROG},
+	{"test_dynptr_skb_strcmp", SETUP_SKB_PROG},
 };
 
 static void verify_success(const char *prog_name, enum test_setup_type setup_type)
diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c
index d299ef3b4d1f..0c053976f8f9 100644
--- a/tools/testing/selftests/bpf/progs/dynptr_success.c
+++ b/tools/testing/selftests/bpf/progs/dynptr_success.c
@@ -522,3 +522,24 @@ int test_dynptr_skb_no_buff(struct __sk_buff *skb)
 
 	return !!data;
 }
+
+SEC("?cgroup_skb/egress")
+int test_dynptr_skb_strcmp(struct __sk_buff *skb)
+{
+	struct bpf_dynptr ptr;
+	char *data;
+
+	if (bpf_dynptr_from_skb(skb, 0, &ptr)) {
+		err = 1;
+		return 1;
+	}
+
+	/* This may return NULL. SKB may require a buffer */
+	data = bpf_dynptr_slice(&ptr, 0, NULL, 10);
+	if (data) {
+		bpf_strncmp(data, 10, "foo");
+		return 1;
+	}
+
+	return 1;
+}

From 7b99f75942da332e3f4f865e55a10fec95a30d4f Mon Sep 17 00:00:00 2001
From: Alan Maguire <alan.maguire@oracle.com>
Date: Wed, 10 May 2023 14:02:41 +0100
Subject: [PATCH 45/57] bpf: Add --skip_encoding_btf_inconsistent_proto,
 --btf_gen_optimized to pahole flags for v1.25

v1.25 of pahole supports filtering out functions with multiple inconsistent
function prototypes or optimized-out parameters from the BTF representation.
These present problems because there is no additional info in BTF saying which
inconsistent prototype matches which function instance to help guide attachment,
and functions with optimized-out parameters can lead to incorrect assumptions
about register contents.

So for now, filter out such functions while adding BTF representations for
functions that have "."-suffixes (foo.isra.0) but not optimized-out parameters.
This patch assumes that below linked changes land in pahole for v1.25.

Issues with pahole filtering being too aggressive in removing functions
appear to be resolved now, but CI and further testing will confirm.

Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20230510130241.1696561-1-alan.maguire@oracle.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 scripts/pahole-flags.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/scripts/pahole-flags.sh b/scripts/pahole-flags.sh
index 1f1f1d397c39..728d55190d97 100755
--- a/scripts/pahole-flags.sh
+++ b/scripts/pahole-flags.sh
@@ -23,5 +23,8 @@ if [ "${pahole_ver}" -ge "124" ]; then
 	# see PAHOLE_HAS_LANG_EXCLUDE
 	extra_paholeopt="${extra_paholeopt} --lang_exclude=rust"
 fi
+if [ "${pahole_ver}" -ge "125" ]; then
+	extra_paholeopt="${extra_paholeopt} --skip_encoding_btf_inconsistent_proto --btf_gen_optimized"
+fi
 
 echo ${extra_paholeopt}

From ee9fd0ac3017c4313be91a220a9ac4c99dde7ad4 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Wed, 10 May 2023 21:37:48 -0700
Subject: [PATCH 46/57] bpf: Address KCSAN report on bpf_lru_list

KCSAN reported a data-race when accessing node->ref.
Although node->ref does not have to be accurate,
take this chance to use a more common READ_ONCE() and WRITE_ONCE()
pattern instead of data_race().

There is an existing bpf_lru_node_is_ref() and bpf_lru_node_set_ref().
This patch also adds bpf_lru_node_clear_ref() to do the
WRITE_ONCE(node->ref, 0) also.

==================================================================
BUG: KCSAN: data-race in __bpf_lru_list_rotate / __htab_lru_percpu_map_update_elem

write to 0xffff888137038deb of 1 bytes by task 11240 on cpu 1:
__bpf_lru_node_move kernel/bpf/bpf_lru_list.c:113 [inline]
__bpf_lru_list_rotate_active kernel/bpf/bpf_lru_list.c:149 [inline]
__bpf_lru_list_rotate+0x1bf/0x750 kernel/bpf/bpf_lru_list.c:240
bpf_lru_list_pop_free_to_local kernel/bpf/bpf_lru_list.c:329 [inline]
bpf_common_lru_pop_free kernel/bpf/bpf_lru_list.c:447 [inline]
bpf_lru_pop_free+0x638/0xe20 kernel/bpf/bpf_lru_list.c:499
prealloc_lru_pop kernel/bpf/hashtab.c:290 [inline]
__htab_lru_percpu_map_update_elem+0xe7/0x820 kernel/bpf/hashtab.c:1316
bpf_percpu_hash_update+0x5e/0x90 kernel/bpf/hashtab.c:2313
bpf_map_update_value+0x2a9/0x370 kernel/bpf/syscall.c:200
generic_map_update_batch+0x3ae/0x4f0 kernel/bpf/syscall.c:1687
bpf_map_do_batch+0x2d9/0x3d0 kernel/bpf/syscall.c:4534
__sys_bpf+0x338/0x810
__do_sys_bpf kernel/bpf/syscall.c:5096 [inline]
__se_sys_bpf kernel/bpf/syscall.c:5094 [inline]
__x64_sys_bpf+0x43/0x50 kernel/bpf/syscall.c:5094
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x63/0xcd

read to 0xffff888137038deb of 1 bytes by task 11241 on cpu 0:
bpf_lru_node_set_ref kernel/bpf/bpf_lru_list.h:70 [inline]
__htab_lru_percpu_map_update_elem+0x2f1/0x820 kernel/bpf/hashtab.c:1332
bpf_percpu_hash_update+0x5e/0x90 kernel/bpf/hashtab.c:2313
bpf_map_update_value+0x2a9/0x370 kernel/bpf/syscall.c:200
generic_map_update_batch+0x3ae/0x4f0 kernel/bpf/syscall.c:1687
bpf_map_do_batch+0x2d9/0x3d0 kernel/bpf/syscall.c:4534
__sys_bpf+0x338/0x810
__do_sys_bpf kernel/bpf/syscall.c:5096 [inline]
__se_sys_bpf kernel/bpf/syscall.c:5094 [inline]
__x64_sys_bpf+0x43/0x50 kernel/bpf/syscall.c:5094
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x63/0xcd

value changed: 0x01 -> 0x00

Reported by Kernel Concurrency Sanitizer on:
CPU: 0 PID: 11241 Comm: syz-executor.3 Not tainted 6.3.0-rc7-syzkaller-00136-g6a66fdd29ea1 #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/30/2023
==================================================================

Reported-by: syzbot+ebe648a84e8784763f82@syzkaller.appspotmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/r/20230511043748.1384166-1-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/bpf_lru_list.c | 21 +++++++++++++--------
 kernel/bpf/bpf_lru_list.h |  7 ++-----
 2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index d99e89f113c4..3dabdd137d10 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -41,7 +41,12 @@ static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l)
 /* bpf_lru_node helpers */
 static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node)
 {
-	return node->ref;
+	return READ_ONCE(node->ref);
+}
+
+static void bpf_lru_node_clear_ref(struct bpf_lru_node *node)
+{
+	WRITE_ONCE(node->ref, 0);
 }
 
 static void bpf_lru_list_count_inc(struct bpf_lru_list *l,
@@ -89,7 +94,7 @@ static void __bpf_lru_node_move_in(struct bpf_lru_list *l,
 
 	bpf_lru_list_count_inc(l, tgt_type);
 	node->type = tgt_type;
-	node->ref = 0;
+	bpf_lru_node_clear_ref(node);
 	list_move(&node->list, &l->lists[tgt_type]);
 }
 
@@ -110,7 +115,7 @@ static void __bpf_lru_node_move(struct bpf_lru_list *l,
 		bpf_lru_list_count_inc(l, tgt_type);
 		node->type = tgt_type;
 	}
-	node->ref = 0;
+	bpf_lru_node_clear_ref(node);
 
 	/* If the moving node is the next_inactive_rotation candidate,
 	 * move the next_inactive_rotation pointer also.
@@ -353,7 +358,7 @@ static void __local_list_add_pending(struct bpf_lru *lru,
 	*(u32 *)((void *)node + lru->hash_offset) = hash;
 	node->cpu = cpu;
 	node->type = BPF_LRU_LOCAL_LIST_T_PENDING;
-	node->ref = 0;
+	bpf_lru_node_clear_ref(node);
 	list_add(&node->list, local_pending_list(loc_l));
 }
 
@@ -419,7 +424,7 @@ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
 	if (!list_empty(free_list)) {
 		node = list_first_entry(free_list, struct bpf_lru_node, list);
 		*(u32 *)((void *)node + lru->hash_offset) = hash;
-		node->ref = 0;
+		bpf_lru_node_clear_ref(node);
 		__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE);
 	}
 
@@ -522,7 +527,7 @@ static void bpf_common_lru_push_free(struct bpf_lru *lru,
 		}
 
 		node->type = BPF_LRU_LOCAL_LIST_T_FREE;
-		node->ref = 0;
+		bpf_lru_node_clear_ref(node);
 		list_move(&node->list, local_free_list(loc_l));
 
 		raw_spin_unlock_irqrestore(&loc_l->lock, flags);
@@ -568,7 +573,7 @@ static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf,
 
 		node = (struct bpf_lru_node *)(buf + node_offset);
 		node->type = BPF_LRU_LIST_T_FREE;
-		node->ref = 0;
+		bpf_lru_node_clear_ref(node);
 		list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
 		buf += elem_size;
 	}
@@ -594,7 +599,7 @@ again:
 		node = (struct bpf_lru_node *)(buf + node_offset);
 		node->cpu = cpu;
 		node->type = BPF_LRU_LIST_T_FREE;
-		node->ref = 0;
+		bpf_lru_node_clear_ref(node);
 		list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
 		i++;
 		buf += elem_size;
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index 4ea227c9c1ad..8f3c8b2b4490 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -64,11 +64,8 @@ struct bpf_lru {
 
 static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node)
 {
-	/* ref is an approximation on access frequency.  It does not
-	 * have to be very accurate.  Hence, no protection is used.
-	 */
-	if (!node->ref)
-		node->ref = 1;
+	if (!READ_ONCE(node->ref))
+		WRITE_ONCE(node->ref, 1);
 }
 
 int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,

From bdeeed3498c7871c17465bb4f11d1bc67f9098af Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Mon, 8 May 2023 23:55:02 -0700
Subject: [PATCH 47/57] libbpf: fix offsetof() and container_of() to work with
 CO-RE

It seems like __builtin_offset() doesn't preserve CO-RE field
relocations properly. So if offsetof() macro is defined through
__builtin_offset(), CO-RE-enabled BPF code using container_of() will be
subtly and silently broken.

To avoid this problem, redefine offsetof() and container_of() in the
form that works with CO-RE relocations more reliably.

Fixes: 5fbc220862fc ("tools/libpf: Add offsetof/container_of macro in bpf_helpers.h")
Reported-by: Lennart Poettering <lennart@poettering.net>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/r/20230509065502.2306180-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/lib/bpf/bpf_helpers.h | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h
index 929a3baca8ef..bbab9ad9dc5a 100644
--- a/tools/lib/bpf/bpf_helpers.h
+++ b/tools/lib/bpf/bpf_helpers.h
@@ -77,16 +77,21 @@
 /*
  * Helper macros to manipulate data structures
  */
-#ifndef offsetof
-#define offsetof(TYPE, MEMBER)	((unsigned long)&((TYPE *)0)->MEMBER)
-#endif
-#ifndef container_of
+
+/* offsetof() definition that uses __builtin_offset() might not preserve field
+ * offset CO-RE relocation properly, so force-redefine offsetof() using
+ * old-school approach which works with CO-RE correctly
+ */
+#undef offsetof
+#define offsetof(type, member)	((unsigned long)&((type *)0)->member)
+
+/* redefined container_of() to ensure we use the above offsetof() macro */
+#undef container_of
 #define container_of(ptr, type, member)				\
 	({							\
 		void *__mptr = (void *)(ptr);			\
 		((type *)(__mptr - offsetof(type, member)));	\
 	})
-#endif
 
 /*
  * Compiler (optimization) barrier.

From 29ebbba7d46136cba324264e513a1e964ca16c0a Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Thu, 11 May 2023 10:04:53 -0700
Subject: [PATCH 48/57] bpf: Don't EFAULT for {g,s}setsockopt with wrong optlen

With the way the hooks implemented right now, we have a special
condition: optval larger than PAGE_SIZE will expose only first 4k into
BPF; any modifications to the optval are ignored. If the BPF program
doesn't handle this condition by resetting optlen to 0,
the userspace will get EFAULT.

The intention of the EFAULT was to make it apparent to the
developers that the program is doing something wrong.
However, this inadvertently might affect production workloads
with the BPF programs that are not too careful (i.e., returning EFAULT
for perfectly valid setsockopt/getsockopt calls).

Let's try to minimize the chance of BPF program screwing up userspace
by ignoring the output of those BPF programs (instead of returning
EFAULT to the userspace). pr_info_once those cases to
the dmesg to help with figuring out what's going wrong.

Fixes: 0d01da6afc54 ("bpf: implement getsockopt and setsockopt hooks")
Suggested-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/20230511170456.1759459-2-sdf@google.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 kernel/bpf/cgroup.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index a06e118a9be5..14c870595428 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1826,6 +1826,12 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 		ret = 1;
 	} else if (ctx.optlen > max_optlen || ctx.optlen < -1) {
 		/* optlen is out of bounds */
+		if (*optlen > PAGE_SIZE && ctx.optlen >= 0) {
+			pr_info_once("bpf setsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n",
+				     ctx.optlen, max_optlen);
+			ret = 0;
+			goto out;
+		}
 		ret = -EFAULT;
 	} else {
 		/* optlen within bounds, run kernel handler */
@@ -1881,8 +1887,10 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 		.optname = optname,
 		.current_task = current,
 	};
+	int orig_optlen;
 	int ret;
 
+	orig_optlen = max_optlen;
 	ctx.optlen = max_optlen;
 	max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
 	if (max_optlen < 0)
@@ -1905,6 +1913,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 			ret = -EFAULT;
 			goto out;
 		}
+		orig_optlen = ctx.optlen;
 
 		if (copy_from_user(ctx.optval, optval,
 				   min(ctx.optlen, max_optlen)) != 0) {
@@ -1922,6 +1931,12 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 		goto out;
 
 	if (optval && (ctx.optlen > max_optlen || ctx.optlen < 0)) {
+		if (orig_optlen > PAGE_SIZE && ctx.optlen >= 0) {
+			pr_info_once("bpf getsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n",
+				     ctx.optlen, max_optlen);
+			ret = retval;
+			goto out;
+		}
 		ret = -EFAULT;
 		goto out;
 	}

From 989a4a7dbff21399da42e808a4ae134023546f41 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Thu, 11 May 2023 10:04:54 -0700
Subject: [PATCH 49/57] selftests/bpf: Update EFAULT {g,s}etsockopt selftests

Instead of assuming EFAULT, let's assume the BPF program's
output is ignored.

Remove "getsockopt: deny arbitrary ctx->retval" because it
was actually testing optlen. We have separate set of tests
for retval.

Signed-off-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/20230511170456.1759459-3-sdf@google.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 .../selftests/bpf/prog_tests/sockopt.c        | 96 +++++++++++++++++--
 1 file changed, 90 insertions(+), 6 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt.c b/tools/testing/selftests/bpf/prog_tests/sockopt.c
index aa4debf62fc6..33dd4532e642 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockopt.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockopt.c
@@ -5,10 +5,15 @@
 static char bpf_log_buf[4096];
 static bool verbose;
 
+#ifndef PAGE_SIZE
+#define PAGE_SIZE 4096
+#endif
+
 enum sockopt_test_error {
 	OK = 0,
 	DENY_LOAD,
 	DENY_ATTACH,
+	EOPNOTSUPP_GETSOCKOPT,
 	EPERM_GETSOCKOPT,
 	EFAULT_GETSOCKOPT,
 	EPERM_SETSOCKOPT,
@@ -273,10 +278,31 @@ static struct sockopt_test {
 		.error = EFAULT_GETSOCKOPT,
 	},
 	{
-		.descr = "getsockopt: deny arbitrary ctx->retval",
+		.descr = "getsockopt: ignore >PAGE_SIZE optlen",
 		.insns = {
-			/* ctx->retval = 123 */
-			BPF_MOV64_IMM(BPF_REG_0, 123),
+			/* write 0xFF to the first optval byte */
+
+			/* r6 = ctx->optval */
+			BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1,
+				    offsetof(struct bpf_sockopt, optval)),
+			/* r2 = ctx->optval */
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_6),
+			/* r6 = ctx->optval + 1 */
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1),
+
+			/* r7 = ctx->optval_end */
+			BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_1,
+				    offsetof(struct bpf_sockopt, optval_end)),
+
+			/* if (ctx->optval + 1 <= ctx->optval_end) { */
+			BPF_JMP_REG(BPF_JGT, BPF_REG_6, BPF_REG_7, 1),
+			/* ctx->optval[0] = 0xF0 */
+			BPF_ST_MEM(BPF_B, BPF_REG_2, 0, 0xFF),
+			/* } */
+
+			/* retval changes are ignored */
+			/* ctx->retval = 5 */
+			BPF_MOV64_IMM(BPF_REG_0, 5),
 			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
 				    offsetof(struct bpf_sockopt, retval)),
 
@@ -287,9 +313,11 @@ static struct sockopt_test {
 		.attach_type = BPF_CGROUP_GETSOCKOPT,
 		.expected_attach_type = BPF_CGROUP_GETSOCKOPT,
 
-		.get_optlen = 64,
-
-		.error = EFAULT_GETSOCKOPT,
+		.get_level = 1234,
+		.get_optname = 5678,
+		.get_optval = {}, /* the changes are ignored */
+		.get_optlen = PAGE_SIZE + 1,
+		.error = EOPNOTSUPP_GETSOCKOPT,
 	},
 	{
 		.descr = "getsockopt: support smaller ctx->optlen",
@@ -648,6 +676,45 @@ static struct sockopt_test {
 
 		.error = EFAULT_SETSOCKOPT,
 	},
+	{
+		.descr = "setsockopt: ignore >PAGE_SIZE optlen",
+		.insns = {
+			/* write 0xFF to the first optval byte */
+
+			/* r6 = ctx->optval */
+			BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1,
+				    offsetof(struct bpf_sockopt, optval)),
+			/* r2 = ctx->optval */
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_6),
+			/* r6 = ctx->optval + 1 */
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1),
+
+			/* r7 = ctx->optval_end */
+			BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_1,
+				    offsetof(struct bpf_sockopt, optval_end)),
+
+			/* if (ctx->optval + 1 <= ctx->optval_end) { */
+			BPF_JMP_REG(BPF_JGT, BPF_REG_6, BPF_REG_7, 1),
+			/* ctx->optval[0] = 0xF0 */
+			BPF_ST_MEM(BPF_B, BPF_REG_2, 0, 0xF0),
+			/* } */
+
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.attach_type = BPF_CGROUP_SETSOCKOPT,
+		.expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+
+		.set_level = SOL_IP,
+		.set_optname = IP_TOS,
+		.set_optval = {},
+		.set_optlen = PAGE_SIZE + 1,
+
+		.get_level = SOL_IP,
+		.get_optname = IP_TOS,
+		.get_optval = {}, /* the changes are ignored */
+		.get_optlen = 4,
+	},
 	{
 		.descr = "setsockopt: allow changing ctx->optlen within bounds",
 		.insns = {
@@ -906,6 +973,13 @@ static int run_test(int cgroup_fd, struct sockopt_test *test)
 	}
 
 	if (test->set_optlen) {
+		if (test->set_optlen >= PAGE_SIZE) {
+			int num_pages = test->set_optlen / PAGE_SIZE;
+			int remainder = test->set_optlen % PAGE_SIZE;
+
+			test->set_optlen = num_pages * sysconf(_SC_PAGESIZE) + remainder;
+		}
+
 		err = setsockopt(sock_fd, test->set_level, test->set_optname,
 				 test->set_optval, test->set_optlen);
 		if (err) {
@@ -921,7 +995,15 @@ static int run_test(int cgroup_fd, struct sockopt_test *test)
 	}
 
 	if (test->get_optlen) {
+		if (test->get_optlen >= PAGE_SIZE) {
+			int num_pages = test->get_optlen / PAGE_SIZE;
+			int remainder = test->get_optlen % PAGE_SIZE;
+
+			test->get_optlen = num_pages * sysconf(_SC_PAGESIZE) + remainder;
+		}
+
 		optval = malloc(test->get_optlen);
+		memset(optval, 0, test->get_optlen);
 		socklen_t optlen = test->get_optlen;
 		socklen_t expected_get_optlen = test->get_optlen_ret ?:
 			test->get_optlen;
@@ -929,6 +1011,8 @@ static int run_test(int cgroup_fd, struct sockopt_test *test)
 		err = getsockopt(sock_fd, test->get_level, test->get_optname,
 				 optval, &optlen);
 		if (err) {
+			if (errno == EOPNOTSUPP && test->error == EOPNOTSUPP_GETSOCKOPT)
+				goto free_optval;
 			if (errno == EPERM && test->error == EPERM_GETSOCKOPT)
 				goto free_optval;
 			if (errno == EFAULT && test->error == EFAULT_GETSOCKOPT)

From e01b4a72f132c1ca63e3ed851bef9b3c62ae6149 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Thu, 11 May 2023 10:04:55 -0700
Subject: [PATCH 50/57] selftests/bpf: Correctly handle optlen > 4096

Even though it's not relevant in selftests, the people
might still copy-paste from them. So let's take care
of optlen > 4096 cases explicitly.

Signed-off-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/20230511170456.1759459-4-sdf@google.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 .../bpf/prog_tests/cgroup_getset_retval.c     |  20 ++++
 .../bpf/prog_tests/sockopt_inherit.c          |  61 +++-------
 .../selftests/bpf/prog_tests/sockopt_multi.c  | 108 +++++-------------
 .../bpf/prog_tests/sockopt_qos_to_cc.c        |   2 +
 .../progs/cgroup_getset_retval_getsockopt.c   |  13 +++
 .../progs/cgroup_getset_retval_setsockopt.c   |  17 +++
 .../selftests/bpf/progs/sockopt_inherit.c     |  18 ++-
 .../selftests/bpf/progs/sockopt_multi.c       |  26 ++++-
 .../selftests/bpf/progs/sockopt_qos_to_cc.c   |  10 +-
 .../testing/selftests/bpf/progs/sockopt_sk.c  |  25 ++--
 10 files changed, 167 insertions(+), 133 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_getset_retval.c b/tools/testing/selftests/bpf/prog_tests/cgroup_getset_retval.c
index 4d2fa99273d8..2bb5773d6f99 100644
--- a/tools/testing/selftests/bpf/prog_tests/cgroup_getset_retval.c
+++ b/tools/testing/selftests/bpf/prog_tests/cgroup_getset_retval.c
@@ -25,6 +25,8 @@ static void test_setsockopt_set(int cgroup_fd, int sock_fd)
 	if (!ASSERT_OK_PTR(obj, "skel-load"))
 		return;
 
+	obj->bss->page_size = sysconf(_SC_PAGESIZE);
+
 	/* Attach setsockopt that sets EUNATCH, assert that
 	 * we actually get that error when we run setsockopt()
 	 */
@@ -59,6 +61,8 @@ static void test_setsockopt_set_and_get(int cgroup_fd, int sock_fd)
 	if (!ASSERT_OK_PTR(obj, "skel-load"))
 		return;
 
+	obj->bss->page_size = sysconf(_SC_PAGESIZE);
+
 	/* Attach setsockopt that sets EUNATCH, and one that gets the
 	 * previously set errno. Assert that we get the same errno back.
 	 */
@@ -100,6 +104,8 @@ static void test_setsockopt_default_zero(int cgroup_fd, int sock_fd)
 	if (!ASSERT_OK_PTR(obj, "skel-load"))
 		return;
 
+	obj->bss->page_size = sysconf(_SC_PAGESIZE);
+
 	/* Attach setsockopt that gets the previously set errno.
 	 * Assert that, without anything setting one, we get 0.
 	 */
@@ -134,6 +140,8 @@ static void test_setsockopt_default_zero_and_set(int cgroup_fd, int sock_fd)
 	if (!ASSERT_OK_PTR(obj, "skel-load"))
 		return;
 
+	obj->bss->page_size = sysconf(_SC_PAGESIZE);
+
 	/* Attach setsockopt that gets the previously set errno, and then
 	 * one that sets the errno to EUNATCH. Assert that the get does not
 	 * see EUNATCH set later, and does not prevent EUNATCH from being set.
@@ -177,6 +185,8 @@ static void test_setsockopt_override(int cgroup_fd, int sock_fd)
 	if (!ASSERT_OK_PTR(obj, "skel-load"))
 		return;
 
+	obj->bss->page_size = sysconf(_SC_PAGESIZE);
+
 	/* Attach setsockopt that sets EUNATCH, then one that sets EISCONN,
 	 * and then one that gets the exported errno. Assert both the syscall
 	 * and the helper sees the last set errno.
@@ -224,6 +234,8 @@ static void test_setsockopt_legacy_eperm(int cgroup_fd, int sock_fd)
 	if (!ASSERT_OK_PTR(obj, "skel-load"))
 		return;
 
+	obj->bss->page_size = sysconf(_SC_PAGESIZE);
+
 	/* Attach setsockopt that return a reject without setting errno
 	 * (legacy reject), and one that gets the errno. Assert that for
 	 * backward compatibility the syscall result in EPERM, and this
@@ -268,6 +280,8 @@ static void test_setsockopt_legacy_no_override(int cgroup_fd, int sock_fd)
 	if (!ASSERT_OK_PTR(obj, "skel-load"))
 		return;
 
+	obj->bss->page_size = sysconf(_SC_PAGESIZE);
+
 	/* Attach setsockopt that sets EUNATCH, then one that return a reject
 	 * without setting errno, and then one that gets the exported errno.
 	 * Assert both the syscall and the helper's errno are unaffected by
@@ -319,6 +333,8 @@ static void test_getsockopt_get(int cgroup_fd, int sock_fd)
 	if (!ASSERT_OK_PTR(obj, "skel-load"))
 		return;
 
+	obj->bss->page_size = sysconf(_SC_PAGESIZE);
+
 	/* Attach getsockopt that gets previously set errno. Assert that the
 	 * error from kernel is in both ctx_retval_value and retval_value.
 	 */
@@ -359,6 +375,8 @@ static void test_getsockopt_override(int cgroup_fd, int sock_fd)
 	if (!ASSERT_OK_PTR(obj, "skel-load"))
 		return;
 
+	obj->bss->page_size = sysconf(_SC_PAGESIZE);
+
 	/* Attach getsockopt that sets retval to -EISCONN. Assert that this
 	 * overrides the value from kernel.
 	 */
@@ -396,6 +414,8 @@ static void test_getsockopt_retval_sync(int cgroup_fd, int sock_fd)
 	if (!ASSERT_OK_PTR(obj, "skel-load"))
 		return;
 
+	obj->bss->page_size = sysconf(_SC_PAGESIZE);
+
 	/* Attach getsockopt that sets retval to -EISCONN, and one that clears
 	 * ctx retval. Assert that the clearing ctx retval is synced to helper
 	 * and clears any errors both from kernel and BPF..
diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c
index 60c17a8e2789..917f486db826 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c
@@ -2,6 +2,8 @@
 #include <test_progs.h>
 #include "cgroup_helpers.h"
 
+#include "sockopt_inherit.skel.h"
+
 #define SOL_CUSTOM			0xdeadbeef
 #define CUSTOM_INHERIT1			0
 #define CUSTOM_INHERIT2			1
@@ -132,58 +134,30 @@ static int start_server(void)
 	return fd;
 }
 
-static int prog_attach(struct bpf_object *obj, int cgroup_fd, const char *title,
-		       const char *prog_name)
-{
-	enum bpf_attach_type attach_type;
-	enum bpf_prog_type prog_type;
-	struct bpf_program *prog;
-	int err;
-
-	err = libbpf_prog_type_by_name(title, &prog_type, &attach_type);
-	if (err) {
-		log_err("Failed to deduct types for %s BPF program", prog_name);
-		return -1;
-	}
-
-	prog = bpf_object__find_program_by_name(obj, prog_name);
-	if (!prog) {
-		log_err("Failed to find %s BPF program", prog_name);
-		return -1;
-	}
-
-	err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd,
-			      attach_type, 0);
-	if (err) {
-		log_err("Failed to attach %s BPF program", prog_name);
-		return -1;
-	}
-
-	return 0;
-}
-
 static void run_test(int cgroup_fd)
 {
+	struct bpf_link *link_getsockopt = NULL;
+	struct bpf_link *link_setsockopt = NULL;
 	int server_fd = -1, client_fd;
-	struct bpf_object *obj;
+	struct sockopt_inherit *obj;
 	void *server_err;
 	pthread_t tid;
 	int err;
 
-	obj = bpf_object__open_file("sockopt_inherit.bpf.o", NULL);
-	if (!ASSERT_OK_PTR(obj, "obj_open"))
+	obj = sockopt_inherit__open_and_load();
+	if (!ASSERT_OK_PTR(obj, "skel-load"))
 		return;
 
-	err = bpf_object__load(obj);
-	if (!ASSERT_OK(err, "obj_load"))
+	obj->bss->page_size = sysconf(_SC_PAGESIZE);
+
+	link_getsockopt = bpf_program__attach_cgroup(obj->progs._getsockopt,
+						     cgroup_fd);
+	if (!ASSERT_OK_PTR(link_getsockopt, "cg-attach-getsockopt"))
 		goto close_bpf_object;
 
-	err = prog_attach(obj, cgroup_fd, "cgroup/getsockopt", "_getsockopt");
-	if (!ASSERT_OK(err, "prog_attach _getsockopt"))
-		goto close_bpf_object;
-
-	err = prog_attach(obj, cgroup_fd, "cgroup/setsockopt", "_setsockopt");
-	if (!ASSERT_OK(err, "prog_attach _setsockopt"))
+	link_setsockopt = bpf_program__attach_cgroup(obj->progs._setsockopt,
+						     cgroup_fd);
+	if (!ASSERT_OK_PTR(link_setsockopt, "cg-attach-setsockopt"))
 		goto close_bpf_object;
 
 	server_fd = start_server();
@@ -217,7 +191,10 @@ static void run_test(int cgroup_fd)
 close_server_fd:
 	close(server_fd);
 close_bpf_object:
-	bpf_object__close(obj);
+	bpf_link__destroy(link_getsockopt);
+	bpf_link__destroy(link_setsockopt);
+
+	sockopt_inherit__destroy(obj);
 }
 
 void test_sockopt_inherit(void)
diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_multi.c b/tools/testing/selftests/bpf/prog_tests/sockopt_multi.c
index 7f5659349011..759bbb6f8c5f 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockopt_multi.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockopt_multi.c
@@ -2,61 +2,13 @@
 #include <test_progs.h>
 #include "cgroup_helpers.h"
 
-static int prog_attach(struct bpf_object *obj, int cgroup_fd, const char *title, const char *name)
-{
-	enum bpf_attach_type attach_type;
-	enum bpf_prog_type prog_type;
-	struct bpf_program *prog;
-	int err;
+#include "sockopt_multi.skel.h"
 
-	err = libbpf_prog_type_by_name(title, &prog_type, &attach_type);
-	if (err) {
-		log_err("Failed to deduct types for %s BPF program", title);
-		return -1;
-	}
-
-	prog = bpf_object__find_program_by_name(obj, name);
-	if (!prog) {
-		log_err("Failed to find %s BPF program", name);
-		return -1;
-	}
-
-	err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd,
-			      attach_type, BPF_F_ALLOW_MULTI);
-	if (err) {
-		log_err("Failed to attach %s BPF program", name);
-		return -1;
-	}
-
-	return 0;
-}
-
-static int prog_detach(struct bpf_object *obj, int cgroup_fd, const char *title, const char *name)
-{
-	enum bpf_attach_type attach_type;
-	enum bpf_prog_type prog_type;
-	struct bpf_program *prog;
-	int err;
-
-	err = libbpf_prog_type_by_name(title, &prog_type, &attach_type);
-	if (err)
-		return -1;
-
-	prog = bpf_object__find_program_by_name(obj, name);
-	if (!prog)
-		return -1;
-
-	err = bpf_prog_detach2(bpf_program__fd(prog), cgroup_fd,
-			       attach_type);
-	if (err)
-		return -1;
-
-	return 0;
-}
-
-static int run_getsockopt_test(struct bpf_object *obj, int cg_parent,
+static int run_getsockopt_test(struct sockopt_multi *obj, int cg_parent,
 			       int cg_child, int sock_fd)
 {
+	struct bpf_link *link_parent = NULL;
+	struct bpf_link *link_child = NULL;
 	socklen_t optlen;
 	__u8 buf;
 	int err;
@@ -89,8 +41,9 @@ static int run_getsockopt_test(struct bpf_object *obj, int cg_parent,
 	 * - child:  0x80 -> 0x90
 	 */
 
-	err = prog_attach(obj, cg_child, "cgroup/getsockopt", "_getsockopt_child");
-	if (err)
+	link_child = bpf_program__attach_cgroup(obj->progs._getsockopt_child,
+						cg_child);
+	if (!ASSERT_OK_PTR(link_child, "cg-attach-getsockopt_child"))
 		goto detach;
 
 	buf = 0x00;
@@ -113,8 +66,9 @@ static int run_getsockopt_test(struct bpf_object *obj, int cg_parent,
 	 * - parent: 0x90 -> 0xA0
 	 */
 
-	err = prog_attach(obj, cg_parent, "cgroup/getsockopt", "_getsockopt_parent");
-	if (err)
+	link_parent = bpf_program__attach_cgroup(obj->progs._getsockopt_parent,
+						 cg_parent);
+	if (!ASSERT_OK_PTR(link_parent, "cg-attach-getsockopt_parent"))
 		goto detach;
 
 	buf = 0x00;
@@ -157,11 +111,8 @@ static int run_getsockopt_test(struct bpf_object *obj, int cg_parent,
 	 * - parent: unexpected 0x40, EPERM
 	 */
 
-	err = prog_detach(obj, cg_child, "cgroup/getsockopt", "_getsockopt_child");
-	if (err) {
-		log_err("Failed to detach child program");
-		goto detach;
-	}
+	bpf_link__destroy(link_child);
+	link_child = NULL;
 
 	buf = 0x00;
 	optlen = 1;
@@ -198,15 +149,17 @@ static int run_getsockopt_test(struct bpf_object *obj, int cg_parent,
 	}
 
 detach:
-	prog_detach(obj, cg_child, "cgroup/getsockopt", "_getsockopt_child");
-	prog_detach(obj, cg_parent, "cgroup/getsockopt", "_getsockopt_parent");
+	bpf_link__destroy(link_child);
+	bpf_link__destroy(link_parent);
 
 	return err;
 }
 
-static int run_setsockopt_test(struct bpf_object *obj, int cg_parent,
+static int run_setsockopt_test(struct sockopt_multi *obj, int cg_parent,
 			       int cg_child, int sock_fd)
 {
+	struct bpf_link *link_parent = NULL;
+	struct bpf_link *link_child = NULL;
 	socklen_t optlen;
 	__u8 buf;
 	int err;
@@ -236,8 +189,9 @@ static int run_setsockopt_test(struct bpf_object *obj, int cg_parent,
 
 	/* Attach child program and make sure it adds 0x10. */
 
-	err = prog_attach(obj, cg_child, "cgroup/setsockopt", "_setsockopt");
-	if (err)
+	link_child = bpf_program__attach_cgroup(obj->progs._setsockopt,
+						cg_child);
+	if (!ASSERT_OK_PTR(link_child, "cg-attach-setsockopt_child"))
 		goto detach;
 
 	buf = 0x80;
@@ -263,8 +217,9 @@ static int run_setsockopt_test(struct bpf_object *obj, int cg_parent,
 
 	/* Attach parent program and make sure it adds another 0x10. */
 
-	err = prog_attach(obj, cg_parent, "cgroup/setsockopt", "_setsockopt");
-	if (err)
+	link_parent = bpf_program__attach_cgroup(obj->progs._setsockopt,
+						 cg_parent);
+	if (!ASSERT_OK_PTR(link_parent, "cg-attach-setsockopt_parent"))
 		goto detach;
 
 	buf = 0x80;
@@ -289,8 +244,8 @@ static int run_setsockopt_test(struct bpf_object *obj, int cg_parent,
 	}
 
 detach:
-	prog_detach(obj, cg_child, "cgroup/setsockopt", "_setsockopt");
-	prog_detach(obj, cg_parent, "cgroup/setsockopt", "_setsockopt");
+	bpf_link__destroy(link_child);
+	bpf_link__destroy(link_parent);
 
 	return err;
 }
@@ -298,9 +253,8 @@ detach:
 void test_sockopt_multi(void)
 {
 	int cg_parent = -1, cg_child = -1;
-	struct bpf_object *obj = NULL;
+	struct sockopt_multi *obj = NULL;
 	int sock_fd = -1;
-	int err = -1;
 
 	cg_parent = test__join_cgroup("/parent");
 	if (!ASSERT_GE(cg_parent, 0, "join_cgroup /parent"))
@@ -310,13 +264,11 @@ void test_sockopt_multi(void)
 	if (!ASSERT_GE(cg_child, 0, "join_cgroup /parent/child"))
 		goto out;
 
-	obj = bpf_object__open_file("sockopt_multi.bpf.o", NULL);
-	if (!ASSERT_OK_PTR(obj, "obj_load"))
+	obj = sockopt_multi__open_and_load();
+	if (!ASSERT_OK_PTR(obj, "skel-load"))
 		goto out;
 
-	err = bpf_object__load(obj);
-	if (!ASSERT_OK(err, "obj_load"))
-		goto out;
+	obj->bss->page_size = sysconf(_SC_PAGESIZE);
 
 	sock_fd = socket(AF_INET, SOCK_STREAM, 0);
 	if (!ASSERT_GE(sock_fd, 0, "socket"))
@@ -327,7 +279,7 @@ void test_sockopt_multi(void)
 
 out:
 	close(sock_fd);
-	bpf_object__close(obj);
+	sockopt_multi__destroy(obj);
 	close(cg_child);
 	close(cg_parent);
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_qos_to_cc.c b/tools/testing/selftests/bpf/prog_tests/sockopt_qos_to_cc.c
index 6b53b3cb8dad..6b2d300e9fd4 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockopt_qos_to_cc.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockopt_qos_to_cc.c
@@ -42,6 +42,8 @@ void test_sockopt_qos_to_cc(void)
 	if (!ASSERT_OK_PTR(skel, "skel"))
 		goto done;
 
+	skel->bss->page_size = sysconf(_SC_PAGESIZE);
+
 	sock_fd = socket(AF_INET6, SOCK_STREAM, 0);
 	if (!ASSERT_GE(sock_fd, 0, "v6 socket open"))
 		goto done;
diff --git a/tools/testing/selftests/bpf/progs/cgroup_getset_retval_getsockopt.c b/tools/testing/selftests/bpf/progs/cgroup_getset_retval_getsockopt.c
index b2a409e6382a..932b8ecd4ae3 100644
--- a/tools/testing/selftests/bpf/progs/cgroup_getset_retval_getsockopt.c
+++ b/tools/testing/selftests/bpf/progs/cgroup_getset_retval_getsockopt.c
@@ -12,6 +12,7 @@ __u32 invocations = 0;
 __u32 assertion_error = 0;
 __u32 retval_value = 0;
 __u32 ctx_retval_value = 0;
+__u32 page_size = 0;
 
 SEC("cgroup/getsockopt")
 int get_retval(struct bpf_sockopt *ctx)
@@ -20,6 +21,10 @@ int get_retval(struct bpf_sockopt *ctx)
 	ctx_retval_value = ctx->retval;
 	__sync_fetch_and_add(&invocations, 1);
 
+	/* optval larger than PAGE_SIZE use kernel's buffer. */
+	if (ctx->optlen > page_size)
+		ctx->optlen = 0;
+
 	return 1;
 }
 
@@ -31,6 +36,10 @@ int set_eisconn(struct bpf_sockopt *ctx)
 	if (bpf_set_retval(-EISCONN))
 		assertion_error = 1;
 
+	/* optval larger than PAGE_SIZE use kernel's buffer. */
+	if (ctx->optlen > page_size)
+		ctx->optlen = 0;
+
 	return 1;
 }
 
@@ -41,5 +50,9 @@ int clear_retval(struct bpf_sockopt *ctx)
 
 	ctx->retval = 0;
 
+	/* optval larger than PAGE_SIZE use kernel's buffer. */
+	if (ctx->optlen > page_size)
+		ctx->optlen = 0;
+
 	return 1;
 }
diff --git a/tools/testing/selftests/bpf/progs/cgroup_getset_retval_setsockopt.c b/tools/testing/selftests/bpf/progs/cgroup_getset_retval_setsockopt.c
index d6e5903e06ba..b7fa8804e19d 100644
--- a/tools/testing/selftests/bpf/progs/cgroup_getset_retval_setsockopt.c
+++ b/tools/testing/selftests/bpf/progs/cgroup_getset_retval_setsockopt.c
@@ -11,6 +11,7 @@
 __u32 invocations = 0;
 __u32 assertion_error = 0;
 __u32 retval_value = 0;
+__u32 page_size = 0;
 
 SEC("cgroup/setsockopt")
 int get_retval(struct bpf_sockopt *ctx)
@@ -18,6 +19,10 @@ int get_retval(struct bpf_sockopt *ctx)
 	retval_value = bpf_get_retval();
 	__sync_fetch_and_add(&invocations, 1);
 
+	/* optval larger than PAGE_SIZE use kernel's buffer. */
+	if (ctx->optlen > page_size)
+		ctx->optlen = 0;
+
 	return 1;
 }
 
@@ -29,6 +34,10 @@ int set_eunatch(struct bpf_sockopt *ctx)
 	if (bpf_set_retval(-EUNATCH))
 		assertion_error = 1;
 
+	/* optval larger than PAGE_SIZE use kernel's buffer. */
+	if (ctx->optlen > page_size)
+		ctx->optlen = 0;
+
 	return 0;
 }
 
@@ -40,6 +49,10 @@ int set_eisconn(struct bpf_sockopt *ctx)
 	if (bpf_set_retval(-EISCONN))
 		assertion_error = 1;
 
+	/* optval larger than PAGE_SIZE use kernel's buffer. */
+	if (ctx->optlen > page_size)
+		ctx->optlen = 0;
+
 	return 0;
 }
 
@@ -48,5 +61,9 @@ int legacy_eperm(struct bpf_sockopt *ctx)
 {
 	__sync_fetch_and_add(&invocations, 1);
 
+	/* optval larger than PAGE_SIZE use kernel's buffer. */
+	if (ctx->optlen > page_size)
+		ctx->optlen = 0;
+
 	return 0;
 }
diff --git a/tools/testing/selftests/bpf/progs/sockopt_inherit.c b/tools/testing/selftests/bpf/progs/sockopt_inherit.c
index 9fb241b97291..c8f59caa4639 100644
--- a/tools/testing/selftests/bpf/progs/sockopt_inherit.c
+++ b/tools/testing/selftests/bpf/progs/sockopt_inherit.c
@@ -9,6 +9,8 @@ char _license[] SEC("license") = "GPL";
 #define CUSTOM_INHERIT2			1
 #define CUSTOM_LISTENER			2
 
+__u32 page_size = 0;
+
 struct sockopt_inherit {
 	__u8 val;
 };
@@ -55,7 +57,7 @@ int _getsockopt(struct bpf_sockopt *ctx)
 	__u8 *optval = ctx->optval;
 
 	if (ctx->level != SOL_CUSTOM)
-		return 1; /* only interested in SOL_CUSTOM */
+		goto out; /* only interested in SOL_CUSTOM */
 
 	if (optval + 1 > optval_end)
 		return 0; /* EPERM, bounds check */
@@ -70,6 +72,12 @@ int _getsockopt(struct bpf_sockopt *ctx)
 	ctx->optlen = 1;
 
 	return 1;
+
+out:
+	/* optval larger than PAGE_SIZE use kernel's buffer. */
+	if (ctx->optlen > page_size)
+		ctx->optlen = 0;
+	return 1;
 }
 
 SEC("cgroup/setsockopt")
@@ -80,7 +88,7 @@ int _setsockopt(struct bpf_sockopt *ctx)
 	__u8 *optval = ctx->optval;
 
 	if (ctx->level != SOL_CUSTOM)
-		return 1; /* only interested in SOL_CUSTOM */
+		goto out; /* only interested in SOL_CUSTOM */
 
 	if (optval + 1 > optval_end)
 		return 0; /* EPERM, bounds check */
@@ -93,4 +101,10 @@ int _setsockopt(struct bpf_sockopt *ctx)
 	ctx->optlen = -1;
 
 	return 1;
+
+out:
+	/* optval larger than PAGE_SIZE use kernel's buffer. */
+	if (ctx->optlen > page_size)
+		ctx->optlen = 0;
+	return 1;
 }
diff --git a/tools/testing/selftests/bpf/progs/sockopt_multi.c b/tools/testing/selftests/bpf/progs/sockopt_multi.c
index 177a59069dae..96f29fce050b 100644
--- a/tools/testing/selftests/bpf/progs/sockopt_multi.c
+++ b/tools/testing/selftests/bpf/progs/sockopt_multi.c
@@ -5,6 +5,8 @@
 
 char _license[] SEC("license") = "GPL";
 
+__u32 page_size = 0;
+
 SEC("cgroup/getsockopt")
 int _getsockopt_child(struct bpf_sockopt *ctx)
 {
@@ -12,7 +14,7 @@ int _getsockopt_child(struct bpf_sockopt *ctx)
 	__u8 *optval = ctx->optval;
 
 	if (ctx->level != SOL_IP || ctx->optname != IP_TOS)
-		return 1;
+		goto out;
 
 	if (optval + 1 > optval_end)
 		return 0; /* EPERM, bounds check */
@@ -26,6 +28,12 @@ int _getsockopt_child(struct bpf_sockopt *ctx)
 	ctx->optlen = 1;
 
 	return 1;
+
+out:
+	/* optval larger than PAGE_SIZE use kernel's buffer. */
+	if (ctx->optlen > page_size)
+		ctx->optlen = 0;
+	return 1;
 }
 
 SEC("cgroup/getsockopt")
@@ -35,7 +43,7 @@ int _getsockopt_parent(struct bpf_sockopt *ctx)
 	__u8 *optval = ctx->optval;
 
 	if (ctx->level != SOL_IP || ctx->optname != IP_TOS)
-		return 1;
+		goto out;
 
 	if (optval + 1 > optval_end)
 		return 0; /* EPERM, bounds check */
@@ -49,6 +57,12 @@ int _getsockopt_parent(struct bpf_sockopt *ctx)
 	ctx->optlen = 1;
 
 	return 1;
+
+out:
+	/* optval larger than PAGE_SIZE use kernel's buffer. */
+	if (ctx->optlen > page_size)
+		ctx->optlen = 0;
+	return 1;
 }
 
 SEC("cgroup/setsockopt")
@@ -58,7 +72,7 @@ int _setsockopt(struct bpf_sockopt *ctx)
 	__u8 *optval = ctx->optval;
 
 	if (ctx->level != SOL_IP || ctx->optname != IP_TOS)
-		return 1;
+		goto out;
 
 	if (optval + 1 > optval_end)
 		return 0; /* EPERM, bounds check */
@@ -67,4 +81,10 @@ int _setsockopt(struct bpf_sockopt *ctx)
 	ctx->optlen = 1;
 
 	return 1;
+
+out:
+	/* optval larger than PAGE_SIZE use kernel's buffer. */
+	if (ctx->optlen > page_size)
+		ctx->optlen = 0;
+	return 1;
 }
diff --git a/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c b/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c
index 1bce83b6e3a7..dbe235ede7f3 100644
--- a/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c
+++ b/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c
@@ -9,6 +9,8 @@
 
 char _license[] SEC("license") = "GPL";
 
+__u32 page_size = 0;
+
 SEC("cgroup/setsockopt")
 int sockopt_qos_to_cc(struct bpf_sockopt *ctx)
 {
@@ -19,7 +21,7 @@ int sockopt_qos_to_cc(struct bpf_sockopt *ctx)
 	char cc_cubic[TCP_CA_NAME_MAX] = "cubic";
 
 	if (ctx->level != SOL_IPV6 || ctx->optname != IPV6_TCLASS)
-		return 1;
+		goto out;
 
 	if (optval + 1 > optval_end)
 		return 0; /* EPERM, bounds check */
@@ -36,4 +38,10 @@ int sockopt_qos_to_cc(struct bpf_sockopt *ctx)
 			return 0;
 	}
 	return 1;
+
+out:
+	/* optval larger than PAGE_SIZE use kernel's buffer. */
+	if (ctx->optlen > page_size)
+		ctx->optlen = 0;
+	return 1;
 }
diff --git a/tools/testing/selftests/bpf/progs/sockopt_sk.c b/tools/testing/selftests/bpf/progs/sockopt_sk.c
index fe1df4cd206e..cb990a7d3d45 100644
--- a/tools/testing/selftests/bpf/progs/sockopt_sk.c
+++ b/tools/testing/selftests/bpf/progs/sockopt_sk.c
@@ -37,7 +37,7 @@ int _getsockopt(struct bpf_sockopt *ctx)
 	/* Bypass AF_NETLINK. */
 	sk = ctx->sk;
 	if (sk && sk->family == AF_NETLINK)
-		return 1;
+		goto out;
 
 	/* Make sure bpf_get_netns_cookie is callable.
 	 */
@@ -52,8 +52,7 @@ int _getsockopt(struct bpf_sockopt *ctx)
 		 * let next BPF program in the cgroup chain or kernel
 		 * handle it.
 		 */
-		ctx->optlen = 0; /* bypass optval>PAGE_SIZE */
-		return 1;
+		goto out;
 	}
 
 	if (ctx->level == SOL_SOCKET && ctx->optname == SO_SNDBUF) {
@@ -61,7 +60,7 @@ int _getsockopt(struct bpf_sockopt *ctx)
 		 * let next BPF program in the cgroup chain or kernel
 		 * handle it.
 		 */
-		return 1;
+		goto out;
 	}
 
 	if (ctx->level == SOL_TCP && ctx->optname == TCP_CONGESTION) {
@@ -69,7 +68,7 @@ int _getsockopt(struct bpf_sockopt *ctx)
 		 * let next BPF program in the cgroup chain or kernel
 		 * handle it.
 		 */
-		return 1;
+		goto out;
 	}
 
 	if (ctx->level == SOL_TCP && ctx->optname == TCP_ZEROCOPY_RECEIVE) {
@@ -85,7 +84,7 @@ int _getsockopt(struct bpf_sockopt *ctx)
 		if (((struct tcp_zerocopy_receive *)optval)->address != 0)
 			return 0; /* unexpected data */
 
-		return 1;
+		goto out;
 	}
 
 	if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) {
@@ -129,6 +128,12 @@ int _getsockopt(struct bpf_sockopt *ctx)
 	ctx->optlen = 1;
 
 	return 1;
+
+out:
+	/* optval larger than PAGE_SIZE use kernel's buffer. */
+	if (ctx->optlen > page_size)
+		ctx->optlen = 0;
+	return 1;
 }
 
 SEC("cgroup/setsockopt")
@@ -142,7 +147,7 @@ int _setsockopt(struct bpf_sockopt *ctx)
 	/* Bypass AF_NETLINK. */
 	sk = ctx->sk;
 	if (sk && sk->family == AF_NETLINK)
-		return 1;
+		goto out;
 
 	/* Make sure bpf_get_netns_cookie is callable.
 	 */
@@ -224,4 +229,10 @@ int _setsockopt(struct bpf_sockopt *ctx)
 			   */
 
 	return 1;
+
+out:
+	/* optval larger than PAGE_SIZE use kernel's buffer. */
+	if (ctx->optlen > page_size)
+		ctx->optlen = 0;
+	return 1;
 }

From 6b6a23d5d8e857e0dda1bbe5043cf4d5e9c711d3 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Thu, 11 May 2023 10:04:56 -0700
Subject: [PATCH 51/57] bpf: Document EFAULT changes for sockopt

And add examples for how to correctly handle large optlens.
This is less relevant now when we don't EFAULT anymore, but
that's still the correct thing to do.

Signed-off-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/20230511170456.1759459-5-sdf@google.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 Documentation/bpf/prog_cgroup_sockopt.rst | 57 ++++++++++++++++++++++-
 1 file changed, 56 insertions(+), 1 deletion(-)

diff --git a/Documentation/bpf/prog_cgroup_sockopt.rst b/Documentation/bpf/prog_cgroup_sockopt.rst
index 172f957204bf..1226a94af07a 100644
--- a/Documentation/bpf/prog_cgroup_sockopt.rst
+++ b/Documentation/bpf/prog_cgroup_sockopt.rst
@@ -98,10 +98,65 @@ can access only the first ``PAGE_SIZE`` of that data. So it has to options:
   indicates that the kernel should use BPF's trimmed ``optval``.
 
 When the BPF program returns with the ``optlen`` greater than
-``PAGE_SIZE``, the userspace will receive ``EFAULT`` errno.
+``PAGE_SIZE``, the userspace will receive original kernel
+buffers without any modifications that the BPF program might have
+applied.
 
 Example
 =======
 
+Recommended way to handle BPF programs is as follows:
+
+.. code-block:: c
+
+	SEC("cgroup/getsockopt")
+	int getsockopt(struct bpf_sockopt *ctx)
+	{
+		/* Custom socket option. */
+		if (ctx->level == MY_SOL && ctx->optname == MY_OPTNAME) {
+			ctx->retval = 0;
+			optval[0] = ...;
+			ctx->optlen = 1;
+			return 1;
+		}
+
+		/* Modify kernel's socket option. */
+		if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) {
+			ctx->retval = 0;
+			optval[0] = ...;
+			ctx->optlen = 1;
+			return 1;
+		}
+
+		/* optval larger than PAGE_SIZE use kernel's buffer. */
+		if (ctx->optlen > PAGE_SIZE)
+			ctx->optlen = 0;
+
+		return 1;
+	}
+
+	SEC("cgroup/setsockopt")
+	int setsockopt(struct bpf_sockopt *ctx)
+	{
+		/* Custom socket option. */
+		if (ctx->level == MY_SOL && ctx->optname == MY_OPTNAME) {
+			/* do something */
+			ctx->optlen = -1;
+			return 1;
+		}
+
+		/* Modify kernel's socket option. */
+		if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) {
+			optval[0] = ...;
+			return 1;
+		}
+
+		/* optval larger than PAGE_SIZE use kernel's buffer. */
+		if (ctx->optlen > PAGE_SIZE)
+			ctx->optlen = 0;
+
+		return 1;
+	}
+
 See ``tools/testing/selftests/bpf/progs/sockopt_sk.c`` for an example
 of BPF program that handles socket options.

From 4d585f48ee6b38c54c075b151c5efd2ff65f8ffd Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Wed, 10 May 2023 14:30:47 -0700
Subject: [PATCH 52/57] bpf: Remove anonymous union in bpf_kfunc_call_arg_meta

For kfuncs like bpf_obj_drop and bpf_refcount_acquire - which take
user-defined types as input - the verifier needs to track the specific
type passed in when checking a particular kfunc call. This requires
tracking (btf, btf_id) tuple. In commit 7c50b1cb76ac
("bpf: Add bpf_refcount_acquire kfunc") I added an anonymous union with
inner structs named after the specific kfuncs tracking this information,
with the goal of making it more obvious which kfunc this data was being
tracked / expected to be tracked on behalf of.

In a recent series adding a new user of this tuple, Alexei mentioned
that he didn't like this union usage as it doesn't really help with
readability or bug-proofing ([0]). In an offline convo we agreed to
have the tuple be fields (arg_btf, arg_btf_id), with comments in
bpf_kfunc_call_arg_meta definition enumerating the uses of the fields by
kfunc-specific handling logic. Such a pattern is used by struct
bpf_reg_state without trouble.

Accordingly, this patch removes the anonymous union in favor of arg_btf
and arg_btf_id fields and comment enumerating their current uses. The
patch also removes struct btf_and_id, which was only being used by the
removed union's inner structs.

This is a mechanical change, existing linked_list and rbtree tests will
validate that correct (btf, btf_id) are being passed.

  [0]: https://lore.kernel.org/bpf/20230505021707.vlyiwy57vwxglbka@dhcp-172-26-102-232.dhcp.thefacebook.com

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20230510213047.1633612-1-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 41 ++++++++++++++++++++++-------------------
 1 file changed, 22 insertions(+), 19 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 754129d41225..5c636276d907 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -279,11 +279,6 @@ struct bpf_call_arg_meta {
 	struct btf_field *kptr_field;
 };
 
-struct btf_and_id {
-	struct btf *btf;
-	u32 btf_id;
-};
-
 struct bpf_kfunc_call_arg_meta {
 	/* In parameters */
 	struct btf *btf;
@@ -302,10 +297,18 @@ struct bpf_kfunc_call_arg_meta {
 		u64 value;
 		bool found;
 	} arg_constant;
-	union {
-		struct btf_and_id arg_obj_drop;
-		struct btf_and_id arg_refcount_acquire;
-	};
+
+	/* arg_btf and arg_btf_id are used by kfunc-specific handling,
+	 * generally to pass info about user-defined local kptr types to later
+	 * verification logic
+	 *   bpf_obj_drop
+	 *     Record the local kptr type to be drop'd
+	 *   bpf_refcount_acquire (via KF_ARG_PTR_TO_REFCOUNTED_KPTR arg type)
+	 *     Record the local kptr type to be refcount_incr'd
+	 */
+	struct btf *arg_btf;
+	u32 arg_btf_id;
+
 	struct {
 		struct btf_field *field;
 	} arg_list_head;
@@ -10680,8 +10683,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			}
 			if (meta->btf == btf_vmlinux &&
 			    meta->func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
-				meta->arg_obj_drop.btf = reg->btf;
-				meta->arg_obj_drop.btf_id = reg->btf_id;
+				meta->arg_btf = reg->btf;
+				meta->arg_btf_id = reg->btf_id;
 			}
 			break;
 		case KF_ARG_PTR_TO_DYNPTR:
@@ -10892,8 +10895,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 				verbose(env, "bpf_refcount_acquire calls are disabled for now\n");
 				return -EINVAL;
 			}
-			meta->arg_refcount_acquire.btf = reg->btf;
-			meta->arg_refcount_acquire.btf_id = reg->btf_id;
+			meta->arg_btf = reg->btf;
+			meta->arg_btf_id = reg->btf_id;
 			break;
 		}
 	}
@@ -11125,12 +11128,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			} else if (meta.func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
 				mark_reg_known_zero(env, regs, BPF_REG_0);
 				regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
-				regs[BPF_REG_0].btf = meta.arg_refcount_acquire.btf;
-				regs[BPF_REG_0].btf_id = meta.arg_refcount_acquire.btf_id;
+				regs[BPF_REG_0].btf = meta.arg_btf;
+				regs[BPF_REG_0].btf_id = meta.arg_btf_id;
 
 				insn_aux->kptr_struct_meta =
-					btf_find_struct_meta(meta.arg_refcount_acquire.btf,
-							     meta.arg_refcount_acquire.btf_id);
+					btf_find_struct_meta(meta.arg_btf,
+							     meta.arg_btf_id);
 			} else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] ||
 				   meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) {
 				struct btf_field *field = meta.arg_list_head.field;
@@ -11260,8 +11263,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) {
 			if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
 				insn_aux->kptr_struct_meta =
-					btf_find_struct_meta(meta.arg_obj_drop.btf,
-							     meta.arg_obj_drop.btf_id);
+					btf_find_struct_meta(meta.arg_btf,
+							     meta.arg_btf_id);
 			}
 		}
 	}

From d84b1a6708eec06b6cd9d33c5e0177bbd6ba4813 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Mon, 15 May 2023 11:07:10 -0700
Subject: [PATCH 53/57] bpf: fix calculation of subseq_idx during precision
 backtracking

Subsequent instruction index (subseq_idx) is an index of an instruction
that was verified/executed by verifier after the currently processed
instruction. It is maintained during precision backtracking processing
and is used to detect various subprog calling conditions.

This patch fixes the bug with incorrectly resetting subseq_idx to -1
when going from child state to parent state during backtracking. If we
don't maintain correct subseq_idx we can misidentify subprog calls
leading to precision tracking bugs.

One such case was triggered by test_global_funcs/global_func9 test where
global subprog call happened to be the very last instruction in parent
state, leading to subseq_idx==-1, triggering WARN_ONCE:

  [   36.045754] verifier backtracking bug
  [   36.045764] WARNING: CPU: 13 PID: 2073 at kernel/bpf/verifier.c:3503 __mark_chain_precision+0xcc6/0xde0
  [   36.046819] Modules linked in: aesni_intel(E) crypto_simd(E) cryptd(E) kvm_intel(E) kvm(E) irqbypass(E) i2c_piix4(E) serio_raw(E) i2c_core(E) crc32c_intel)
  [   36.048040] CPU: 13 PID: 2073 Comm: test_progs Tainted: G        W  OE      6.3.0-07976-g4d585f48ee6b-dirty #972
  [   36.048783] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
  [   36.049648] RIP: 0010:__mark_chain_precision+0xcc6/0xde0
  [   36.050038] Code: 3d 82 c6 05 bb 35 32 02 01 e8 66 21 ec ff 0f 0b b8 f2 ff ff ff e9 30 f5 ff ff 48 c7 c7 f3 61 3d 82 4c 89 0c 24 e8 4a 21 ec ff <0f> 0b 4c0

With the fix precision tracking across multiple states works correctly now:

mark_precise: frame0: last_idx 45 first_idx 38 subseq_idx -1
mark_precise: frame0: regs=r8 stack= before 44: (61) r7 = *(u32 *)(r10 -4)
mark_precise: frame0: regs=r8 stack= before 43: (85) call pc+41
mark_precise: frame0: regs=r8 stack= before 42: (07) r1 += -48
mark_precise: frame0: regs=r8 stack= before 41: (bf) r1 = r10
mark_precise: frame0: regs=r8 stack= before 40: (63) *(u32 *)(r10 -48) = r1
mark_precise: frame0: regs=r8 stack= before 39: (b4) w1 = 0
mark_precise: frame0: regs=r8 stack= before 38: (85) call pc+38
mark_precise: frame0: parent state regs=r8 stack=:  R0_w=scalar() R1_w=map_value(off=4,ks=4,vs=8,imm=0) R6=1 R7_w=scalar() R8_r=P0 R10=fpm
mark_precise: frame0: last_idx 36 first_idx 28 subseq_idx 38
mark_precise: frame0: regs=r8 stack= before 36: (18) r1 = 0xffff888104f2ed14
mark_precise: frame0: regs=r8 stack= before 35: (85) call pc+33
mark_precise: frame0: regs=r8 stack= before 33: (18) r1 = 0xffff888104f2ed10
mark_precise: frame0: regs=r8 stack= before 32: (85) call pc+36
mark_precise: frame0: regs=r8 stack= before 31: (07) r1 += -4
mark_precise: frame0: regs=r8 stack= before 30: (bf) r1 = r10
mark_precise: frame0: regs=r8 stack= before 29: (63) *(u32 *)(r10 -4) = r7
mark_precise: frame0: regs=r8 stack= before 28: (4c) w7 |= w0
mark_precise: frame0: parent state regs=r8 stack=:  R0_rw=scalar() R6=1 R7_rw=scalar() R8_rw=P0 R10=fp0 fp-48_r=mmmmmmmm
mark_precise: frame0: last_idx 27 first_idx 16 subseq_idx 28
mark_precise: frame0: regs=r8 stack= before 27: (85) call pc+31
mark_precise: frame0: regs=r8 stack= before 26: (b7) r1 = 0
mark_precise: frame0: regs=r8 stack= before 25: (b7) r8 = 0

Note how subseq_idx starts out as -1, then is preserved as 38 and then 28 as we
go up the parent state chain.

Reported-by: Alexei Starovoitov <ast@kernel.org>
Fixes: fde2a3882bd0 ("bpf: support precision propagation in the presence of subprogs")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230515180710.1535018-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5c636276d907..f597491259ab 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3864,10 +3864,11 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
 	struct bpf_verifier_state *st = env->cur_state;
 	int first_idx = st->first_insn_idx;
 	int last_idx = env->insn_idx;
+	int subseq_idx = -1;
 	struct bpf_func_state *func;
 	struct bpf_reg_state *reg;
 	bool skip_first = true;
-	int i, prev_i, fr, err;
+	int i, fr, err;
 
 	if (!env->bpf_capable)
 		return 0;
@@ -3897,8 +3898,8 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
 		u32 history = st->jmp_history_cnt;
 
 		if (env->log.level & BPF_LOG_LEVEL2) {
-			verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d\n",
-				bt->frame, last_idx, first_idx);
+			verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n",
+				bt->frame, last_idx, first_idx, subseq_idx);
 		}
 
 		if (last_idx < 0) {
@@ -3930,12 +3931,12 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
 			return -EFAULT;
 		}
 
-		for (i = last_idx, prev_i = -1;;) {
+		for (i = last_idx;;) {
 			if (skip_first) {
 				err = 0;
 				skip_first = false;
 			} else {
-				err = backtrack_insn(env, i, prev_i, bt);
+				err = backtrack_insn(env, i, subseq_idx, bt);
 			}
 			if (err == -ENOTSUPP) {
 				mark_all_scalars_precise(env, env->cur_state);
@@ -3952,7 +3953,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
 				return 0;
 			if (i == first_idx)
 				break;
-			prev_i = i;
+			subseq_idx = i;
 			i = get_prev_insn_idx(st, i, &history);
 			if (i >= env->prog->len) {
 				/* This can happen if backtracking reached insn 0
@@ -4031,6 +4032,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
 		if (bt_empty(bt))
 			return 0;
 
+		subseq_idx = first_idx;
 		last_idx = st->last_insn_idx;
 		first_idx = st->first_insn_idx;
 	}

From 04cb8453a91c7c22f60ddadb6cef0d19abb33bb5 Mon Sep 17 00:00:00 2001
From: Alan Maguire <alan.maguire@oracle.com>
Date: Fri, 12 May 2023 12:31:34 +0100
Subject: [PATCH 54/57] bpftool: JIT limited misreported as negative value on
 aarch64

On aarch64, "bpftool feature" reports an incorrect BPF JIT limit:

$ sudo /sbin/bpftool feature
Scanning system configuration...
bpf() syscall restricted to privileged users
JIT compiler is enabled
JIT compiler hardening is disabled
JIT compiler kallsyms exports are enabled for root
skipping kernel config, can't open file: No such file or directory
Global memory limit for JIT compiler for unprivileged users is -201326592 bytes

This is because /proc/sys/net/core/bpf_jit_limit reports

$ sudo cat /proc/sys/net/core/bpf_jit_limit
68169519595520

...and an int is assumed in read_procfs().  Change read_procfs()
to return a long to avoid negative value reporting.

Fixes: 7a4522bbef0c ("tools: bpftool: add probes for /proc/ eBPF parameters")
Reported-by: Nicky Veitch <nicky.veitch@oracle.com>
Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: Quentin Monnet <quentin@isovalent.com>
Link: https://lore.kernel.org/bpf/20230512113134.58996-1-alan.maguire@oracle.com
---
 tools/bpf/bpftool/feature.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c
index da16e6a27ccc..0675d6a46413 100644
--- a/tools/bpf/bpftool/feature.c
+++ b/tools/bpf/bpftool/feature.c
@@ -167,12 +167,12 @@ static int get_vendor_id(int ifindex)
 	return strtol(buf, NULL, 0);
 }
 
-static int read_procfs(const char *path)
+static long read_procfs(const char *path)
 {
 	char *endptr, *line = NULL;
 	size_t len = 0;
 	FILE *fd;
-	int res;
+	long res;
 
 	fd = fopen(path, "r");
 	if (!fd)
@@ -194,7 +194,7 @@ static int read_procfs(const char *path)
 
 static void probe_unprivileged_disabled(void)
 {
-	int res;
+	long res;
 
 	/* No support for C-style ouptut */
 
@@ -216,14 +216,14 @@ static void probe_unprivileged_disabled(void)
 			printf("Unable to retrieve required privileges for bpf() syscall\n");
 			break;
 		default:
-			printf("bpf() syscall restriction has unknown value %d\n", res);
+			printf("bpf() syscall restriction has unknown value %ld\n", res);
 		}
 	}
 }
 
 static void probe_jit_enable(void)
 {
-	int res;
+	long res;
 
 	/* No support for C-style ouptut */
 
@@ -245,7 +245,7 @@ static void probe_jit_enable(void)
 			printf("Unable to retrieve JIT-compiler status\n");
 			break;
 		default:
-			printf("JIT-compiler status has unknown value %d\n",
+			printf("JIT-compiler status has unknown value %ld\n",
 			       res);
 		}
 	}
@@ -253,7 +253,7 @@ static void probe_jit_enable(void)
 
 static void probe_jit_harden(void)
 {
-	int res;
+	long res;
 
 	/* No support for C-style ouptut */
 
@@ -275,7 +275,7 @@ static void probe_jit_harden(void)
 			printf("Unable to retrieve JIT hardening status\n");
 			break;
 		default:
-			printf("JIT hardening status has unknown value %d\n",
+			printf("JIT hardening status has unknown value %ld\n",
 			       res);
 		}
 	}
@@ -283,7 +283,7 @@ static void probe_jit_harden(void)
 
 static void probe_jit_kallsyms(void)
 {
-	int res;
+	long res;
 
 	/* No support for C-style ouptut */
 
@@ -302,14 +302,14 @@ static void probe_jit_kallsyms(void)
 			printf("Unable to retrieve JIT kallsyms export status\n");
 			break;
 		default:
-			printf("JIT kallsyms exports status has unknown value %d\n", res);
+			printf("JIT kallsyms exports status has unknown value %ld\n", res);
 		}
 	}
 }
 
 static void probe_jit_limit(void)
 {
-	int res;
+	long res;
 
 	/* No support for C-style ouptut */
 
@@ -322,7 +322,7 @@ static void probe_jit_limit(void)
 			printf("Unable to retrieve global memory limit for JIT compiler for unprivileged users\n");
 			break;
 		default:
-			printf("Global memory limit for JIT compiler for unprivileged users is %d bytes\n", res);
+			printf("Global memory limit for JIT compiler for unprivileged users is %ld bytes\n", res);
 		}
 	}
 }

From 90564f1e3dd66052af29e2c29cba3dc98b6604d0 Mon Sep 17 00:00:00 2001
From: Florent Revest <revest@chromium.org>
Date: Thu, 11 May 2023 16:05:07 +0200
Subject: [PATCH 55/57] bpf, arm64: Support struct arguments in the BPF
 trampoline

This extends the BPF trampoline JIT to support attachment to functions
that take small structures (up to 128bit) as argument. This is trivially
achieved by saving/restoring a number of "argument registers" rather
than a number of arguments.

The AAPCS64 section 6.8.2 describes the parameter passing ABI.
"Composite types" (like C structs) below 16 bytes (as enforced by the
BPF verifier) are provided as part of the 8 argument registers as
explained in the section C.12.

Signed-off-by: Florent Revest <revest@chromium.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Acked-by: Xu Kuohai <xukuohai@huawei.com>
Link: https://lore.kernel.org/bpf/20230511140507.514888-1-revest@chromium.org
---
 arch/arm64/net/bpf_jit_comp.c                | 55 ++++++++++----------
 tools/testing/selftests/bpf/DENYLIST.aarch64 |  1 -
 2 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index b26da8efa616..145b540ec34f 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -1731,21 +1731,21 @@ static void invoke_bpf_mod_ret(struct jit_ctx *ctx, struct bpf_tramp_links *tl,
 	}
 }
 
-static void save_args(struct jit_ctx *ctx, int args_off, int nargs)
+static void save_args(struct jit_ctx *ctx, int args_off, int nregs)
 {
 	int i;
 
-	for (i = 0; i < nargs; i++) {
+	for (i = 0; i < nregs; i++) {
 		emit(A64_STR64I(i, A64_SP, args_off), ctx);
 		args_off += 8;
 	}
 }
 
-static void restore_args(struct jit_ctx *ctx, int args_off, int nargs)
+static void restore_args(struct jit_ctx *ctx, int args_off, int nregs)
 {
 	int i;
 
-	for (i = 0; i < nargs; i++) {
+	for (i = 0; i < nregs; i++) {
 		emit(A64_LDR64I(i, A64_SP, args_off), ctx);
 		args_off += 8;
 	}
@@ -1764,7 +1764,7 @@ static void restore_args(struct jit_ctx *ctx, int args_off, int nargs)
  */
 static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
 			      struct bpf_tramp_links *tlinks, void *orig_call,
-			      int nargs, u32 flags)
+			      int nregs, u32 flags)
 {
 	int i;
 	int stack_size;
@@ -1772,7 +1772,7 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
 	int regs_off;
 	int retval_off;
 	int args_off;
-	int nargs_off;
+	int nregs_off;
 	int ip_off;
 	int run_ctx_off;
 	struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY];
@@ -1795,11 +1795,11 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
 	 * SP + retval_off  [ return value      ] BPF_TRAMP_F_CALL_ORIG or
 	 *                                        BPF_TRAMP_F_RET_FENTRY_RET
 	 *
-	 *                  [ argN              ]
+	 *                  [ arg reg N         ]
 	 *                  [ ...               ]
-	 * SP + args_off    [ arg1              ]
+	 * SP + args_off    [ arg reg 1         ]
 	 *
-	 * SP + nargs_off   [ args count        ]
+	 * SP + nregs_off   [ arg regs count    ]
 	 *
 	 * SP + ip_off      [ traced function   ] BPF_TRAMP_F_IP_ARG flag
 	 *
@@ -1816,13 +1816,13 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
 	if (flags & BPF_TRAMP_F_IP_ARG)
 		stack_size += 8;
 
-	nargs_off = stack_size;
+	nregs_off = stack_size;
 	/* room for args count */
 	stack_size += 8;
 
 	args_off = stack_size;
 	/* room for args */
-	stack_size += nargs * 8;
+	stack_size += nregs * 8;
 
 	/* room for return value */
 	retval_off = stack_size;
@@ -1865,12 +1865,12 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
 		emit(A64_STR64I(A64_R(10), A64_SP, ip_off), ctx);
 	}
 
-	/* save args count*/
-	emit(A64_MOVZ(1, A64_R(10), nargs, 0), ctx);
-	emit(A64_STR64I(A64_R(10), A64_SP, nargs_off), ctx);
+	/* save arg regs count*/
+	emit(A64_MOVZ(1, A64_R(10), nregs, 0), ctx);
+	emit(A64_STR64I(A64_R(10), A64_SP, nregs_off), ctx);
 
-	/* save args */
-	save_args(ctx, args_off, nargs);
+	/* save arg regs */
+	save_args(ctx, args_off, nregs);
 
 	/* save callee saved registers */
 	emit(A64_STR64I(A64_R(19), A64_SP, regs_off), ctx);
@@ -1897,7 +1897,7 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
 	}
 
 	if (flags & BPF_TRAMP_F_CALL_ORIG) {
-		restore_args(ctx, args_off, nargs);
+		restore_args(ctx, args_off, nregs);
 		/* call original func */
 		emit(A64_LDR64I(A64_R(10), A64_SP, retaddr_off), ctx);
 		emit(A64_ADR(A64_LR, AARCH64_INSN_SIZE * 2), ctx);
@@ -1926,7 +1926,7 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
 	}
 
 	if (flags & BPF_TRAMP_F_RESTORE_REGS)
-		restore_args(ctx, args_off, nargs);
+		restore_args(ctx, args_off, nregs);
 
 	/* restore callee saved register x19 and x20 */
 	emit(A64_LDR64I(A64_R(19), A64_SP, regs_off), ctx);
@@ -1967,24 +1967,25 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
 				void *orig_call)
 {
 	int i, ret;
-	int nargs = m->nr_args;
+	int nregs = m->nr_args;
 	int max_insns = ((long)image_end - (long)image) / AARCH64_INSN_SIZE;
 	struct jit_ctx ctx = {
 		.image = NULL,
 		.idx = 0,
 	};
 
-	/* the first 8 arguments are passed by registers */
-	if (nargs > 8)
-		return -ENOTSUPP;
-
-	/* don't support struct argument */
+	/* extra registers needed for struct argument */
 	for (i = 0; i < MAX_BPF_FUNC_ARGS; i++) {
+		/* The arg_size is at most 16 bytes, enforced by the verifier. */
 		if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG)
-			return -ENOTSUPP;
+			nregs += (m->arg_size[i] + 7) / 8 - 1;
 	}
 
-	ret = prepare_trampoline(&ctx, im, tlinks, orig_call, nargs, flags);
+	/* the first 8 registers are used for arguments */
+	if (nregs > 8)
+		return -ENOTSUPP;
+
+	ret = prepare_trampoline(&ctx, im, tlinks, orig_call, nregs, flags);
 	if (ret < 0)
 		return ret;
 
@@ -1995,7 +1996,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
 	ctx.idx = 0;
 
 	jit_fill_hole(image, (unsigned int)(image_end - image));
-	ret = prepare_trampoline(&ctx, im, tlinks, orig_call, nargs, flags);
+	ret = prepare_trampoline(&ctx, im, tlinks, orig_call, nregs, flags);
 
 	if (ret > 0 && validate_code(&ctx) < 0)
 		ret = -EINVAL;
diff --git a/tools/testing/selftests/bpf/DENYLIST.aarch64 b/tools/testing/selftests/bpf/DENYLIST.aarch64
index cd42e2825bd2..08adc805878b 100644
--- a/tools/testing/selftests/bpf/DENYLIST.aarch64
+++ b/tools/testing/selftests/bpf/DENYLIST.aarch64
@@ -10,4 +10,3 @@ kprobe_multi_test/link_api_addrs                 # link_fd unexpected link_fd: a
 kprobe_multi_test/link_api_syms                  # link_fd unexpected link_fd: actual -95 < expected 0
 kprobe_multi_test/skel_api                       # libbpf: failed to load BPF skeleton 'kprobe_multi': -3
 module_attach                                    # prog 'kprobe_multi': failed to auto-attach: -95
-tracing_struct                                   # tracing_struct__attach unexpected error: -524 (errno 524)

From 47e79cbeea4b3891ad476047f4c68543eb51c8e0 Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Mon, 15 May 2023 13:08:48 +0000
Subject: [PATCH 56/57] bpf: Remove bpf trampoline selector

After commit e21aa341785c ("bpf: Fix fexit trampoline."), the selector is only
used to indicate how many times the bpf trampoline image are updated and been
displayed in the trampoline ksym name. After the trampoline is freed, the
selector will start from 0 again. So the selector is a useless value to the
user. We can remove it.

If the user want to check whether the bpf trampoline image has been updated
or not, the user can compare the address. Each time the trampoline image is
updated, the address will change consequently. Jiri also pointed out another
issue that perf is still using the old name "bpf_trampoline_%lu", so this
change can fix the issue in perf.

Fixes: e21aa341785c ("bpf: Fix fexit trampoline.")
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Song Liu <song@kernel.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Link: https://lore.kernel.org/bpf/ZFvOOlrmHiY9AgXE@krava
Link: https://lore.kernel.org/bpf/20230515130849.57502-3-laoar.shao@gmail.com
---
 include/linux/bpf.h     |  1 -
 kernel/bpf/trampoline.c | 11 ++++-------
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 456f33b9d205..36e4b2d8cca2 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1125,7 +1125,6 @@ struct bpf_trampoline {
 	int progs_cnt[BPF_TRAMP_MAX];
 	/* Executable image of trampoline */
 	struct bpf_tramp_image *cur_image;
-	u64 selector;
 	struct module *mod;
 };
 
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index ac021bc43a66..84850e66ce3d 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -344,7 +344,7 @@ static void bpf_tramp_image_put(struct bpf_tramp_image *im)
 	call_rcu_tasks_trace(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
 }
 
-static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
+static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key)
 {
 	struct bpf_tramp_image *im;
 	struct bpf_ksym *ksym;
@@ -371,7 +371,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
 
 	ksym = &im->ksym;
 	INIT_LIST_HEAD_RCU(&ksym->lnode);
-	snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu_%u", key, idx);
+	snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", key);
 	bpf_image_ksym_add(image, ksym);
 	return im;
 
@@ -401,11 +401,10 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut
 		err = unregister_fentry(tr, tr->cur_image->image);
 		bpf_tramp_image_put(tr->cur_image);
 		tr->cur_image = NULL;
-		tr->selector = 0;
 		goto out;
 	}
 
-	im = bpf_tramp_image_alloc(tr->key, tr->selector);
+	im = bpf_tramp_image_alloc(tr->key);
 	if (IS_ERR(im)) {
 		err = PTR_ERR(im);
 		goto out;
@@ -442,8 +441,7 @@ again:
 
 	set_memory_rox((long)im->image, 1);
 
-	WARN_ON(tr->cur_image && tr->selector == 0);
-	WARN_ON(!tr->cur_image && tr->selector);
+	WARN_ON(tr->cur_image && total == 0);
 	if (tr->cur_image)
 		/* progs already running at this address */
 		err = modify_fentry(tr, tr->cur_image->image, im->image, lock_direct_mutex);
@@ -473,7 +471,6 @@ again:
 	if (tr->cur_image)
 		bpf_tramp_image_put(tr->cur_image);
 	tr->cur_image = im;
-	tr->selector++;
 out:
 	/* If any error happens, restore previous flags */
 	if (err)

From 108598c39eefbedc9882273ac0df96127a629220 Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Mon, 15 May 2023 13:08:47 +0000
Subject: [PATCH 57/57] bpf: Fix memleak due to fentry attach failure

If it fails to attach fentry, the allocated bpf trampoline image will be
left in the system. That can be verified by checking /proc/kallsyms.

This meamleak can be verified by a simple bpf program as follows:

  SEC("fentry/trap_init")
  int fentry_run()
  {
      return 0;
  }

It will fail to attach trap_init because this function is freed after
kernel init, and then we can find the trampoline image is left in the
system by checking /proc/kallsyms.

  $ tail /proc/kallsyms
  ffffffffc0613000 t bpf_trampoline_6442453466_1  [bpf]
  ffffffffc06c3000 t bpf_trampoline_6442453466_1  [bpf]

  $ bpftool btf dump file /sys/kernel/btf/vmlinux | grep "FUNC 'trap_init'"
  [2522] FUNC 'trap_init' type_id=119 linkage=static

  $ echo $((6442453466 & 0x7fffffff))
  2522

Note that there are two left bpf trampoline images, that is because the
libbpf will fallback to raw tracepoint if -EINVAL is returned.

Fixes: e21aa341785c ("bpf: Fix fexit trampoline.")
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Song Liu <song@kernel.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Link: https://lore.kernel.org/bpf/20230515130849.57502-2-laoar.shao@gmail.com
---
 kernel/bpf/trampoline.c | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 84850e66ce3d..78acf28d4873 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -251,11 +251,8 @@ bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_a
 	return tlinks;
 }
 
-static void __bpf_tramp_image_put_deferred(struct work_struct *work)
+static void bpf_tramp_image_free(struct bpf_tramp_image *im)
 {
-	struct bpf_tramp_image *im;
-
-	im = container_of(work, struct bpf_tramp_image, work);
 	bpf_image_ksym_del(&im->ksym);
 	bpf_jit_free_exec(im->image);
 	bpf_jit_uncharge_modmem(PAGE_SIZE);
@@ -263,6 +260,14 @@ static void __bpf_tramp_image_put_deferred(struct work_struct *work)
 	kfree_rcu(im, rcu);
 }
 
+static void __bpf_tramp_image_put_deferred(struct work_struct *work)
+{
+	struct bpf_tramp_image *im;
+
+	im = container_of(work, struct bpf_tramp_image, work);
+	bpf_tramp_image_free(im);
+}
+
 /* callback, fexit step 3 or fentry step 2 */
 static void __bpf_tramp_image_put_rcu(struct rcu_head *rcu)
 {
@@ -437,7 +442,7 @@ again:
 					  &tr->func.model, tr->flags, tlinks,
 					  tr->func.addr);
 	if (err < 0)
-		goto out;
+		goto out_free;
 
 	set_memory_rox((long)im->image, 1);
 
@@ -466,7 +471,7 @@ again:
 	}
 #endif
 	if (err)
-		goto out;
+		goto out_free;
 
 	if (tr->cur_image)
 		bpf_tramp_image_put(tr->cur_image);
@@ -477,6 +482,10 @@ out:
 		tr->flags = orig_flags;
 	kfree(tlinks);
 	return err;
+
+out_free:
+	bpf_tramp_image_free(im);
+	goto out;
 }
 
 static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)