| author | Linus Torvalds <torvalds@linux-foundation.org> | 2025-12-03 16:54:54 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2025-12-03 16:54:54 -0800 |
| commit | 015e7b0b0e8e51f7321ec2aafc1d7fc0a8a5536f | |
| tree | 258f719e59946c733dd03198eba404e85f9d0945 | |
| parent | b6d993310a65b994f37e3347419d9ed398ee37a3 | |
| parent | ff34657aa72a4dab9c2fd38e1b31a506951f4b1c | |
Merge tag 'bpf-next-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Pull bpf updates from Alexei Starovoitov:
- Convert selftests/bpf/test_tc_edt and test_tc_tunnel from .sh to
test_progs runner (Alexis Lothoré)
- Convert selftests/bpf/test_xsk to test_progs runner (Bastien
Curutchet)
- Replace bpf memory allocator with kmalloc_nolock() in
bpf_local_storage (Amery Hung), and in bpf streams and range tree
(Puranjay Mohan)
- Introduce support for indirect jumps in BPF verifier and x86 JIT
(Anton Protopopov) and arm64 JIT (Puranjay Mohan); see the sketch
after the diffstat below
- Remove runqslower bpf tool (Hoyeon Lee)
- Fix corner cases in the verifier to close several syzbot reports
(Eduard Zingerman, KaFai Wan)
- Several improvements in deadlock detection in rqspinlock (Kumar
Kartikeya Dwivedi)
- Implement "jmp" mode for BPF trampoline and corresponding
DYNAMIC_FTRACE_WITH_JMP. It improves "fexit" program type performance
from 80 M/s to 136 M/s. With Steven's Ack. (Menglong Dong)
- Add ability to test non-linear skbs in BPF_PROG_TEST_RUN (Paul
Chaignon)
- Do not let BPF_PROG_TEST_RUN emit invalid GSO types to stack (Daniel
Borkmann)
- Generalize buildid reader into bpf_dynptr (Mykyta Yatsenko)
- Optimize bpf_map_update_elem() for map-in-map types (Ritesh
Oedayrajsingh Varma); see the sketch after this list
- Introduce overwrite mode for BPF ring buffer (Xu Kuohai); see the
sketch after this list
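Two of the items above lend themselves to short illustrations. First, the map-in-map update optimization is purely an internal fast path; the sketch below only shows the user-space operation it affects, assuming an outer BPF_MAP_TYPE_ARRAY_OF_MAPS created with an inner-map template matching the map built here (outer_fd, slot index and map dimensions are placeholders, not taken from the series):

```c
/*
 * Sketch only: the operation that "optimize bpf_map_update_elem() for
 * map-in-map types" speeds up, i.e. storing an inner map into an outer
 * array-of-maps. Assumes outer_fd was created as
 * BPF_MAP_TYPE_ARRAY_OF_MAPS with a matching inner-map template.
 */
#include <unistd.h>
#include <linux/bpf.h>
#include <bpf/bpf.h>

static int install_inner_map(int outer_fd, __u32 slot)
{
	int inner_fd, err;

	inner_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "inner",
				  sizeof(__u32), sizeof(__u64), 64, NULL);
	if (inner_fd < 0)
		return inner_fd;

	/* The outer element's value is an inner map fd; the kernel
	 * resolves it to a map reference at update time. */
	err = bpf_map_update_elem(outer_fd, &slot, &inner_fd, BPF_ANY);

	close(inner_fd);	/* the outer map now holds its own reference */
	return err;
}
```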
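Second, the ring-buffer overwrite mode is exposed through a new map-creation flag, BPF_F_RB_OVERWRITE, plus a BPF_RB_OVERWRITE_POS query constant (both visible in the include/uapi/linux/bpf.h hunk further down). A minimal producer sketch follows; the BTF-defined map syntax is standard libbpf, but the consumer-side behaviour of overwrite mode is not shown in this excerpt, so treat the example as an assumption-laden illustration rather than the series' documented usage:

```c
// SPDX-License-Identifier: GPL-2.0
/*
 * Sketch only: a producer opting into ring-buffer overwrite mode.
 * BPF_F_RB_OVERWRITE comes from the uapi header updated in this merge;
 * the tracepoint, record layout and sizes are illustrative.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_RINGBUF);
	__uint(max_entries, 256 * 1024);	/* must be a power of two */
	__uint(map_flags, BPF_F_RB_OVERWRITE);	/* overwrite oldest records when full */
} events SEC(".maps");

SEC("tracepoint/syscalls/sys_enter_execve")
int log_execve(void *ctx)
{
	__u64 *ts = bpf_ringbuf_reserve(&events, sizeof(*ts), 0);

	if (!ts)
		return 0;
	*ts = bpf_ktime_get_ns();
	bpf_ringbuf_submit(ts, 0);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";
```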
* tag 'bpf-next-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (169 commits)
bpf: optimize bpf_map_update_elem() for map-in-map types
bpf: make kprobe_multi_link_prog_run always_inline
selftests/bpf: do not hardcode target rate in test_tc_edt BPF program
selftests/bpf: remove test_tc_edt.sh
selftests/bpf: integrate test_tc_edt into test_progs
selftests/bpf: rename test_tc_edt.bpf.c section to expose program type
selftests/bpf: Add success stats to rqspinlock stress test
rqspinlock: Precede non-head waiter queueing with AA check
rqspinlock: Disable spinning for trylock fallback
rqspinlock: Use trylock fallback when per-CPU rqnode is busy
rqspinlock: Perform AA checks immediately
rqspinlock: Enclose lock/unlock within lock entry acquisitions
bpf: Remove runqslower tool
selftests/bpf: Remove usage of lsm/file_alloc_security in selftest
bpf: Disable file_alloc_security hook
bpf: check for insn arrays in check_ptr_alignment
bpf: force BPF_F_RDONLY_PROG on insn array creation
bpf: Fix exclusive map memory leak
selftests/bpf: Make CS length configurable for rqspinlock stress test
selftests/bpf: Add lock wait time stats to rqspinlock stress test
...
157 files changed, 10852 insertions, 4998 deletions
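The indirect-jump support is built around a new BPF_MAP_TYPE_INSN_ARRAY map whose UAPI value type (struct bpf_insn_array_value) and implementation (kernel/bpf/bpf_insn_array.c) appear in the diff below. A rough user-space sketch of the setup contract: only orig_off may be populated before load, BPF_NOEXIST updates are rejected, and the map must be frozen before the program that references it is verified. The instruction offsets and error handling here are placeholders, and how the program side consumes the table (the new BPF_JMP | BPF_JA | BPF_X instruction) is left to the compiler and is not shown.

```c
/*
 * Sketch only: create and populate an instruction-array map, following
 * the constraints in kernel/bpf/bpf_insn_array.c below. Requires a
 * linux/bpf.h carrying BPF_MAP_TYPE_INSN_ARRAY and struct
 * bpf_insn_array_value; the orig_off values are caller-supplied.
 */
#include <unistd.h>
#include <linux/bpf.h>
#include <bpf/bpf.h>

static int make_insn_array(const __u32 *insn_offs, __u32 n)
{
	struct bpf_insn_array_value val = {};
	__u32 key;
	int map_fd;

	map_fd = bpf_map_create(BPF_MAP_TYPE_INSN_ARRAY, "jump_table",
				sizeof(__u32), sizeof(val), n, NULL);
	if (map_fd < 0)
		return map_fd;

	for (key = 0; key < n; key++) {
		/* xlated_off and jitted_off must stay zero before load */
		val.orig_off = insn_offs[key];
		if (bpf_map_update_elem(map_fd, &key, &val, BPF_ANY))
			goto err;
	}

	/* The verifier only accepts a frozen instruction array */
	if (bpf_map_freeze(map_fd))
		goto err;

	return map_fd;
err:
	close(map_fd);
	return -1;
}
```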
diff --git a/Documentation/bpf/libbpf/program_types.rst b/Documentation/bpf/libbpf/program_types.rst index 218b020a2f81ff..3b837522834bf9 100644 --- a/Documentation/bpf/libbpf/program_types.rst +++ b/Documentation/bpf/libbpf/program_types.rst @@ -100,10 +100,26 @@ described in more detail in the footnotes. | | | ``uretprobe.s+`` [#uprobe]_ | Yes | + + +----------------------------------+-----------+ | | | ``usdt+`` [#usdt]_ | | ++ + +----------------------------------+-----------+ +| | | ``usdt.s+`` [#usdt]_ | Yes | + +----------------------------------------+----------------------------------+-----------+ | | ``BPF_TRACE_KPROBE_MULTI`` | ``kprobe.multi+`` [#kpmulti]_ | | + + +----------------------------------+-----------+ | | | ``kretprobe.multi+`` [#kpmulti]_ | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_TRACE_KPROBE_SESSION`` | ``kprobe.session+`` [#kpmulti]_ | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_TRACE_UPROBE_MULTI`` | ``uprobe.multi+`` [#upmul]_ | | ++ + +----------------------------------+-----------+ +| | | ``uprobe.multi.s+`` [#upmul]_ | Yes | ++ + +----------------------------------+-----------+ +| | | ``uretprobe.multi+`` [#upmul]_ | | ++ + +----------------------------------+-----------+ +| | | ``uretprobe.multi.s+`` [#upmul]_ | Yes | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_TRACE_UPROBE_SESSION`` | ``uprobe.session+`` [#upmul]_ | | ++ + +----------------------------------+-----------+ +| | | ``uprobe.session.s+`` [#upmul]_ | Yes | +-------------------------------------------+----------------------------------------+----------------------------------+-----------+ | ``BPF_PROG_TYPE_LIRC_MODE2`` | ``BPF_LIRC_MODE2`` | ``lirc_mode2`` | | +-------------------------------------------+----------------------------------------+----------------------------------+-----------+ @@ -219,6 +235,8 @@ described in more detail in the footnotes. non-negative integer. .. [#ksyscall] The ``ksyscall`` attach format is ``ksyscall/<syscall>``. .. [#uprobe] The ``uprobe`` attach format is ``uprobe[.s]/<path>:<function>[+<offset>]``. +.. [#upmul] The ``uprobe.multi`` attach format is ``uprobe.multi[.s]/<path>:<function-pattern>`` + where ``function-pattern`` supports ``*`` and ``?`` wildcards. .. [#usdt] The ``usdt`` attach format is ``usdt/<path>:<provider>:<name>``. .. [#kpmulti] The ``kprobe.multi`` attach format is ``kprobe.multi/<pattern>`` where ``pattern`` supports ``*`` and ``?`` wildcards. Valid characters for pattern are diff --git a/Documentation/bpf/map_array.rst b/Documentation/bpf/map_array.rst index f2f51a53e8aed5..fa56ff75190c8f 100644 --- a/Documentation/bpf/map_array.rst +++ b/Documentation/bpf/map_array.rst @@ -15,8 +15,9 @@ of constant size. The size of the array is defined in ``max_entries`` at creation time. All array elements are pre-allocated and zero initialized when created. ``BPF_MAP_TYPE_PERCPU_ARRAY`` uses a different memory region for each CPU whereas ``BPF_MAP_TYPE_ARRAY`` uses the same memory region. The value -stored can be of any size, however, all array elements are aligned to 8 -bytes. +stored can be of any size for ``BPF_MAP_TYPE_ARRAY`` and not more than +``PCPU_MIN_UNIT_SIZE`` (32 kB) for ``BPF_MAP_TYPE_PERCPU_ARRAY``. All +array elements are aligned to 8 bytes. 
Since kernel 5.5, memory mapping may be enabled for ``BPF_MAP_TYPE_ARRAY`` by setting the flag ``BPF_F_MMAPABLE``. The map definition is page-aligned and diff --git a/MAINTAINERS b/MAINTAINERS index 9ac254f4ec41ca..3b1d3af83f135a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4654,6 +4654,7 @@ F: Documentation/userspace-api/ebpf/ F: arch/*/net/* F: include/linux/bpf* F: include/linux/btf* +F: include/linux/buildid.h F: include/linux/filter.h F: include/trace/events/xdp.h F: include/uapi/linux/bpf* diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index afd05b41ea9e66..74dd29816f36a4 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -1452,6 +1452,10 @@ emit_bswap_uxt: emit(A64_ASR(is64, dst, dst, imm), ctx); break; + /* JUMP reg */ + case BPF_JMP | BPF_JA | BPF_X: + emit(A64_BR(dst), ctx); + break; /* JUMP off */ case BPF_JMP | BPF_JA: case BPF_JMP32 | BPF_JA: @@ -2231,6 +2235,13 @@ skip_init_ctx: for (i = 0; i <= prog->len; i++) ctx.offset[i] *= AARCH64_INSN_SIZE; bpf_prog_fill_jited_linfo(prog, ctx.offset + 1); + /* + * The bpf_prog_update_insn_ptrs function expects offsets to + * point to the first byte of the jitted instruction (unlike + * the bpf_prog_fill_jited_linfo above, which, for historical + * reasons, expects to point to the next instruction) + */ + bpf_prog_update_insn_ptrs(prog, ctx.offset, ctx.ro_image); out_off: if (!ro_header && priv_stack_ptr) { free_percpu(priv_stack_ptr); @@ -2923,8 +2934,9 @@ static int gen_branch_or_nop(enum aarch64_insn_branch_type type, void *ip, * The dummy_tramp is used to prevent another CPU from jumping to unknown * locations during the patching process, making the patching process easier. */ -int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type, - void *old_addr, void *new_addr) +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t, + enum bpf_text_poke_type new_t, void *old_addr, + void *new_addr) { int ret; u32 old_insn; @@ -2968,14 +2980,13 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type, !poking_bpf_entry)) return -EINVAL; - if (poke_type == BPF_MOD_CALL) - branch_type = AARCH64_INSN_BRANCH_LINK; - else - branch_type = AARCH64_INSN_BRANCH_NOLINK; - + branch_type = old_t == BPF_MOD_CALL ? AARCH64_INSN_BRANCH_LINK : + AARCH64_INSN_BRANCH_NOLINK; if (gen_branch_or_nop(branch_type, ip, old_addr, plt, &old_insn) < 0) return -EFAULT; + branch_type = new_t == BPF_MOD_CALL ? AARCH64_INSN_BRANCH_LINK : + AARCH64_INSN_BRANCH_NOLINK; if (gen_branch_or_nop(branch_type, ip, new_addr, plt, &new_insn) < 0) return -EFAULT; diff --git a/arch/loongarch/net/bpf_jit.c b/arch/loongarch/net/bpf_jit.c index f97dc993640174..8dc58781b8eb7f 100644 --- a/arch/loongarch/net/bpf_jit.c +++ b/arch/loongarch/net/bpf_jit.c @@ -1284,11 +1284,12 @@ void *bpf_arch_text_copy(void *dst, void *src, size_t len) return ret ? ERR_PTR(-EINVAL) : dst; } -int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type, - void *old_addr, void *new_addr) +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t, + enum bpf_text_poke_type new_t, void *old_addr, + void *new_addr) { int ret; - bool is_call = (poke_type == BPF_MOD_CALL); + bool is_call; u32 old_insns[LOONGARCH_LONG_JUMP_NINSNS] = {[0 ... 4] = INSN_NOP}; u32 new_insns[LOONGARCH_LONG_JUMP_NINSNS] = {[0 ... 
4] = INSN_NOP}; @@ -1298,6 +1299,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type, if (!is_bpf_text_address((unsigned long)ip)) return -ENOTSUPP; + is_call = old_t == BPF_MOD_CALL; ret = emit_jump_or_nops(old_addr, ip, old_insns, is_call); if (ret) return ret; @@ -1305,6 +1307,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type, if (memcmp(ip, old_insns, LOONGARCH_LONG_JUMP_NBYTES)) return -EFAULT; + is_call = new_t == BPF_MOD_CALL; ret = emit_jump_or_nops(new_addr, ip, new_insns, is_call); if (ret) return ret; diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 88ad5ba7b87fd0..5e976730b2f5f5 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -1107,8 +1107,9 @@ static void do_isync(void *info __maybe_unused) * execute isync (or some CSI) so that they don't go back into the * trampoline again. */ -int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type, - void *old_addr, void *new_addr) +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t, + enum bpf_text_poke_type new_t, void *old_addr, + void *new_addr) { unsigned long bpf_func, bpf_func_end, size, offset; ppc_inst_t old_inst, new_inst; @@ -1119,7 +1120,6 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type, return -EOPNOTSUPP; bpf_func = (unsigned long)ip; - branch_flags = poke_type == BPF_MOD_CALL ? BRANCH_SET_LINK : 0; /* We currently only support poking bpf programs */ if (!__bpf_address_lookup(bpf_func, &size, &offset, name)) { @@ -1132,7 +1132,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type, * an unconditional branch instruction at im->ip_after_call */ if (offset) { - if (poke_type != BPF_MOD_JUMP) { + if (old_t == BPF_MOD_CALL || new_t == BPF_MOD_CALL) { pr_err("%s (0x%lx): calls are not supported in bpf prog body\n", __func__, bpf_func); return -EOPNOTSUPP; @@ -1166,6 +1166,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type, } old_inst = ppc_inst(PPC_RAW_NOP()); + branch_flags = old_t == BPF_MOD_CALL ? BRANCH_SET_LINK : 0; if (old_addr) { if (is_offset_in_branch_range(ip - old_addr)) create_branch(&old_inst, ip, (unsigned long)old_addr, branch_flags); @@ -1174,6 +1175,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type, branch_flags); } new_inst = ppc_inst(PPC_RAW_NOP()); + branch_flags = new_t == BPF_MOD_CALL ? BRANCH_SET_LINK : 0; if (new_addr) { if (is_offset_in_branch_range(ip - new_addr)) create_branch(&new_inst, ip, (unsigned long)new_addr, branch_flags); diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 45cbc7c6fe490c..5f9457e910e871 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -852,17 +852,19 @@ static int gen_jump_or_nops(void *target, void *ip, u32 *insns, bool is_call) return emit_jump_and_link(is_call ? 
RV_REG_T0 : RV_REG_ZERO, rvoff, false, &ctx); } -int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type, - void *old_addr, void *new_addr) +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t, + enum bpf_text_poke_type new_t, void *old_addr, + void *new_addr) { u32 old_insns[RV_FENTRY_NINSNS], new_insns[RV_FENTRY_NINSNS]; - bool is_call = poke_type == BPF_MOD_CALL; + bool is_call; int ret; if (!is_kernel_text((unsigned long)ip) && !is_bpf_text_address((unsigned long)ip)) return -ENOTSUPP; + is_call = old_t == BPF_MOD_CALL; ret = gen_jump_or_nops(old_addr, ip, old_insns, is_call); if (ret) return ret; @@ -870,6 +872,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type, if (memcmp(ip, old_insns, RV_FENTRY_NBYTES)) return -EFAULT; + is_call = new_t == BPF_MOD_CALL; ret = gen_jump_or_nops(new_addr, ip, new_insns, is_call); if (ret) return ret; @@ -1131,7 +1134,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, store_args(nr_arg_slots, args_off, ctx); /* skip to actual body of traced function */ - if (flags & BPF_TRAMP_F_SKIP_FRAME) + if (flags & BPF_TRAMP_F_ORIG_STACK) orig_call += RV_FENTRY_NINSNS * 4; if (flags & BPF_TRAMP_F_CALL_ORIG) { diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 3238c178bed8b6..579461d471bbac 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -2412,8 +2412,9 @@ bool bpf_jit_supports_far_kfunc_call(void) return true; } -int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, - void *old_addr, void *new_addr) +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t, + enum bpf_text_poke_type new_t, void *old_addr, + void *new_addr) { struct bpf_plt expected_plt, current_plt, new_plt, *plt; struct { @@ -2430,7 +2431,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, if (insn.opc != (0xc004 | (old_addr ? 0xf0 : 0))) return -EINVAL; - if (t == BPF_MOD_JUMP && + if ((new_t == BPF_MOD_JUMP || old_t == BPF_MOD_JUMP) && insn.disp == ((char *)new_addr - (char *)ip) >> 1) { /* * The branch already points to the destination, diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 34fb46d5341b6e..17a107cc5244bb 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -230,6 +230,7 @@ config X86 select HAVE_DYNAMIC_FTRACE_WITH_ARGS if X86_64 select HAVE_FTRACE_REGS_HAVING_PT_REGS if X86_64 select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + select HAVE_DYNAMIC_FTRACE_WITH_JMP if X86_64 select HAVE_SAMPLE_FTRACE_DIRECT if X86_64 select HAVE_SAMPLE_FTRACE_DIRECT_MULTI if X86_64 select HAVE_EBPF_JIT diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 4450acec93903d..0543b57f54ee4d 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -74,7 +74,12 @@ static const char *ftrace_call_replace(unsigned long ip, unsigned long addr) * No need to translate into a callthunk. The trampoline does * the depth accounting itself. 
*/ - return text_gen_insn(CALL_INSN_OPCODE, (void *)ip, (void *)addr); + if (ftrace_is_jmp(addr)) { + addr = ftrace_jmp_get(addr); + return text_gen_insn(JMP32_INSN_OPCODE, (void *)ip, (void *)addr); + } else { + return text_gen_insn(CALL_INSN_OPCODE, (void *)ip, (void *)addr); + } } static int ftrace_verify_code(unsigned long ip, const char *old_code) diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 823dbdd0eb4109..a132608265f6c0 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -285,8 +285,18 @@ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) ANNOTATE_NOENDBR RET +1: + testb $1, %al + jz 2f + andq $0xfffffffffffffffe, %rax + movq %rax, MCOUNT_REG_SIZE+8(%rsp) + restore_mcount_regs + /* Restore flags */ + popfq + RET + /* Swap the flags with orig_rax */ -1: movq MCOUNT_REG_SIZE(%rsp), %rdi +2: movq MCOUNT_REG_SIZE(%rsp), %rdi movq %rdi, MCOUNT_REG_SIZE-8(%rsp) movq %rax, MCOUNT_REG_SIZE(%rsp) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index de5083cb1d3747..b69dc7194e2c07 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -597,7 +597,8 @@ static int emit_jump(u8 **pprog, void *func, void *ip) return emit_patch(pprog, func, ip, 0xE9); } -static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, +static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t, + enum bpf_text_poke_type new_t, void *old_addr, void *new_addr) { const u8 *nop_insn = x86_nops[5]; @@ -607,9 +608,9 @@ static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, int ret; memcpy(old_insn, nop_insn, X86_PATCH_SIZE); - if (old_addr) { + if (old_t != BPF_MOD_NOP && old_addr) { prog = old_insn; - ret = t == BPF_MOD_CALL ? + ret = old_t == BPF_MOD_CALL ? emit_call(&prog, old_addr, ip) : emit_jump(&prog, old_addr, ip); if (ret) @@ -617,9 +618,9 @@ static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, } memcpy(new_insn, nop_insn, X86_PATCH_SIZE); - if (new_addr) { + if (new_t != BPF_MOD_NOP && new_addr) { prog = new_insn; - ret = t == BPF_MOD_CALL ? + ret = new_t == BPF_MOD_CALL ? 
emit_call(&prog, new_addr, ip) : emit_jump(&prog, new_addr, ip); if (ret) @@ -640,8 +641,9 @@ out: return ret; } -int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, - void *old_addr, void *new_addr) +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t, + enum bpf_text_poke_type new_t, void *old_addr, + void *new_addr) { if (!is_kernel_text((long)ip) && !is_bpf_text_address((long)ip)) @@ -655,29 +657,43 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, if (is_endbr(ip)) ip += ENDBR_INSN_SIZE; - return __bpf_arch_text_poke(ip, t, old_addr, new_addr); + return __bpf_arch_text_poke(ip, old_t, new_t, old_addr, new_addr); } #define EMIT_LFENCE() EMIT3(0x0F, 0xAE, 0xE8) -static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip) +static void __emit_indirect_jump(u8 **pprog, int reg, bool ereg) { u8 *prog = *pprog; + if (ereg) + EMIT1(0x41); + + EMIT2(0xFF, 0xE0 + reg); + + *pprog = prog; +} + +static void emit_indirect_jump(u8 **pprog, int bpf_reg, u8 *ip) +{ + u8 *prog = *pprog; + int reg = reg2hex[bpf_reg]; + bool ereg = is_ereg(bpf_reg); + if (cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) { OPTIMIZER_HIDE_VAR(reg); - emit_jump(&prog, its_static_thunk(reg), ip); + emit_jump(&prog, its_static_thunk(reg + 8*ereg), ip); } else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { EMIT_LFENCE(); - EMIT2(0xFF, 0xE0 + reg); + __emit_indirect_jump(&prog, reg, ereg); } else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) { OPTIMIZER_HIDE_VAR(reg); if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH)) - emit_jump(&prog, &__x86_indirect_jump_thunk_array[reg], ip); + emit_jump(&prog, &__x86_indirect_jump_thunk_array[reg + 8*ereg], ip); else - emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip); + emit_jump(&prog, &__x86_indirect_thunk_array[reg + 8*ereg], ip); } else { - EMIT2(0xFF, 0xE0 + reg); /* jmp *%\reg */ + __emit_indirect_jump(&prog, reg, ereg); if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) || IS_ENABLED(CONFIG_MITIGATION_SLS)) EMIT1(0xCC); /* int3 */ } @@ -797,7 +813,7 @@ static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog, * rdi == ctx (1st arg) * rcx == prog->bpf_func + X86_TAIL_CALL_OFFSET */ - emit_indirect_jump(&prog, 1 /* rcx */, ip + (prog - start)); + emit_indirect_jump(&prog, BPF_REG_4 /* R4 -> rcx */, ip + (prog - start)); /* out: */ ctx->tail_call_indirect_label = prog - start; @@ -883,12 +899,13 @@ static void bpf_tail_call_direct_fixup(struct bpf_prog *prog) target = array->ptrs[poke->tail_call.key]; if (target) { ret = __bpf_arch_text_poke(poke->tailcall_target, - BPF_MOD_JUMP, NULL, + BPF_MOD_NOP, BPF_MOD_JUMP, + NULL, (u8 *)target->bpf_func + poke->adj_off); BUG_ON(ret < 0); ret = __bpf_arch_text_poke(poke->tailcall_bypass, - BPF_MOD_JUMP, + BPF_MOD_JUMP, BPF_MOD_NOP, (u8 *)poke->tailcall_target + X86_PATCH_SIZE, NULL); BUG_ON(ret < 0); @@ -2614,6 +2631,9 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */ break; + case BPF_JMP | BPF_JA | BPF_X: + emit_indirect_jump(&prog, insn->dst_reg, image + addrs[i - 1]); + break; case BPF_JMP | BPF_JA: case BPF_JMP32 | BPF_JA: if (BPF_CLASS(insn->code) == BPF_JMP) { @@ -2830,9 +2850,10 @@ static int get_nr_used_regs(const struct btf_func_model *m) } static void save_args(const struct btf_func_model *m, u8 **prog, - int stack_size, bool for_call_origin) + int stack_size, bool for_call_origin, u32 flags) { int arg_regs, first_off = 0, nr_regs = 0, nr_stack_slots = 0; + bool use_jmp = bpf_trampoline_use_jmp(flags); int i, j; /* Store function arguments to stack. 
@@ -2873,7 +2894,7 @@ static void save_args(const struct btf_func_model *m, u8 **prog, */ for (j = 0; j < arg_regs; j++) { emit_ldx(prog, BPF_DW, BPF_REG_0, BPF_REG_FP, - nr_stack_slots * 8 + 0x18); + nr_stack_slots * 8 + 16 + (!use_jmp) * 8); emit_stx(prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -stack_size); @@ -3267,12 +3288,17 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im * should be 16-byte aligned. Following code depend on * that stack_size is already 8-byte aligned. */ - stack_size += (stack_size % 16) ? 0 : 8; + if (bpf_trampoline_use_jmp(flags)) { + /* no rip in the "jmp" case */ + stack_size += (stack_size % 16) ? 8 : 0; + } else { + stack_size += (stack_size % 16) ? 0 : 8; + } } arg_stack_off = stack_size; - if (flags & BPF_TRAMP_F_SKIP_FRAME) { + if (flags & BPF_TRAMP_F_CALL_ORIG) { /* skip patched call instruction and point orig_call to actual * body of the kernel function. */ @@ -3327,7 +3353,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -ip_off); } - save_args(m, &prog, regs_off, false); + save_args(m, &prog, regs_off, false, flags); if (flags & BPF_TRAMP_F_CALL_ORIG) { /* arg1: mov rdi, im */ @@ -3360,7 +3386,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im if (flags & BPF_TRAMP_F_CALL_ORIG) { restore_regs(m, &prog, regs_off); - save_args(m, &prog, arg_stack_off, true); + save_args(m, &prog, arg_stack_off, true, flags); if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) { /* Before calling the original function, load the @@ -3543,7 +3569,7 @@ static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs, u8 *image, if (err) return err; - emit_indirect_jump(&prog, 2 /* rdx */, image + (prog - buf)); + emit_indirect_jump(&prog, BPF_REG_3 /* R3 -> rdx */, image + (prog - buf)); *pprog = prog; return 0; @@ -3827,6 +3853,15 @@ out_image: jit_data->header = header; jit_data->rw_header = rw_header; } + + /* + * The bpf_prog_update_insn_ptrs function expects addrs to + * point to the first byte of the jitted instruction (unlike + * the bpf_prog_fill_jited_linfo below, which, for historical + * reasons, expects to point to the next instruction) + */ + bpf_prog_update_insn_ptrs(prog, addrs, image); + /* * ctx.prog_offset is used when CFI preambles put code *before* * the function. See emit_cfi(). For FineIBT specifically this code @@ -3953,6 +3988,7 @@ void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, struct bpf_prog *new, struct bpf_prog *old) { u8 *old_addr, *new_addr, *old_bypass_addr; + enum bpf_text_poke_type t; int ret; old_bypass_addr = old ? NULL : poke->bypass_addr; @@ -3965,21 +4001,22 @@ void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, * the kallsyms check. */ if (new) { + t = old_addr ? BPF_MOD_JUMP : BPF_MOD_NOP; ret = __bpf_arch_text_poke(poke->tailcall_target, - BPF_MOD_JUMP, + t, BPF_MOD_JUMP, old_addr, new_addr); BUG_ON(ret < 0); if (!old) { ret = __bpf_arch_text_poke(poke->tailcall_bypass, - BPF_MOD_JUMP, + BPF_MOD_JUMP, BPF_MOD_NOP, poke->bypass_addr, NULL); BUG_ON(ret < 0); } } else { + t = old_bypass_addr ? BPF_MOD_JUMP : BPF_MOD_NOP; ret = __bpf_arch_text_poke(poke->tailcall_bypass, - BPF_MOD_JUMP, - old_bypass_addr, + t, BPF_MOD_JUMP, old_bypass_addr, poke->bypass_addr); BUG_ON(ret < 0); /* let other CPUs finish the execution of program @@ -3988,9 +4025,9 @@ void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, */ if (!ret) synchronize_rcu(); + t = old_addr ? 
BPF_MOD_JUMP : BPF_MOD_NOP; ret = __bpf_arch_text_poke(poke->tailcall_target, - BPF_MOD_JUMP, - old_addr, NULL); + t, BPF_MOD_NOP, old_addr, NULL); BUG_ON(ret < 0); } } diff --git a/include/asm-generic/rqspinlock.h b/include/asm-generic/rqspinlock.h index 6d4244d643df32..0f2dcbbfee2f0a 100644 --- a/include/asm-generic/rqspinlock.h +++ b/include/asm-generic/rqspinlock.h @@ -129,8 +129,8 @@ dec: * <error> for lock B * release_held_lock_entry * - * try_cmpxchg_acquire for lock A * grab_held_lock_entry + * try_cmpxchg_acquire for lock A * * Lack of any ordering means reordering may occur such that dec, inc * are done before entry is overwritten. This permits a remote lock @@ -139,13 +139,8 @@ dec: * CPU holds a lock it is attempting to acquire, leading to false ABBA * diagnosis). * - * In case of unlock, we will always do a release on the lock word after - * releasing the entry, ensuring that other CPUs cannot hold the lock - * (and make conclusions about deadlocks) until the entry has been - * cleared on the local CPU, preventing any anomalies. Reordering is - * still possible there, but a remote CPU cannot observe a lock in our - * table which it is already holding, since visibility entails our - * release store for the said lock has not retired. + * The case of unlock is treated differently due to NMI reentrancy, see + * comments in res_spin_unlock. * * In theory we don't have a problem if the dec and WRITE_ONCE above get * reordered with each other, we either notice an empty NULL entry on @@ -175,10 +170,22 @@ static __always_inline int res_spin_lock(rqspinlock_t *lock) { int val = 0; - if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) { - grab_held_lock_entry(lock); + /* + * Grab the deadlock detection entry before doing the cmpxchg, so that + * reentrancy due to NMIs between the succeeding cmpxchg and creation of + * held lock entry can correctly detect an acquisition attempt in the + * interrupted context. + * + * cmpxchg lock A + * <NMI> + * res_spin_lock(A) --> missed AA, leads to timeout + * </NMI> + * grab_held_lock_entry(A) + */ + grab_held_lock_entry(lock); + + if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) return 0; - } return resilient_queued_spin_lock_slowpath(lock, val); } @@ -192,28 +199,25 @@ static __always_inline void res_spin_unlock(rqspinlock_t *lock) { struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); - if (unlikely(rqh->cnt > RES_NR_HELD)) - goto unlock; - WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL); -unlock: /* - * Release barrier, ensures correct ordering. See release_held_lock_entry - * for details. Perform release store instead of queued_spin_unlock, - * since we use this function for test-and-set fallback as well. When we - * have CONFIG_QUEUED_SPINLOCKS=n, we clear the full 4-byte lockword. + * Release barrier, ensures correct ordering. Perform release store + * instead of queued_spin_unlock, since we use this function for the TAS + * fallback as well. When we have CONFIG_QUEUED_SPINLOCKS=n, we clear + * the full 4-byte lockword. * - * Like release_held_lock_entry, we can do the release before the dec. - * We simply care about not seeing the 'lock' in our table from a remote - * CPU once the lock has been released, which doesn't rely on the dec. + * Perform the smp_store_release before clearing the lock entry so that + * NMIs landing in the unlock path can correctly detect AA issues. 
The + * opposite order shown below may lead to missed AA checks: * - * Unlike smp_wmb(), release is not a two way fence, hence it is - * possible for a inc to move up and reorder with our clearing of the - * entry. This isn't a problem however, as for a misdiagnosis of ABBA, - * the remote CPU needs to hold this lock, which won't be released until - * the store below is done, which would ensure the entry is overwritten - * to NULL, etc. + * WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL) + * <NMI> + * res_spin_lock(A) --> missed AA, leads to timeout + * </NMI> + * smp_store_release(A->locked, 0) */ smp_store_release(&lock->locked, 0); + if (likely(rqh->cnt <= RES_NR_HELD)) + WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL); this_cpu_dec(rqspinlock_held_locks.cnt); } diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d808253f2e945d..6498be4c44f8c2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -663,6 +663,16 @@ int map_check_no_btf(const struct bpf_map *map, bool bpf_map_meta_equal(const struct bpf_map *meta0, const struct bpf_map *meta1); +static inline bool bpf_map_has_internal_structs(struct bpf_map *map) +{ + return btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK); +} + +void bpf_map_free_internal_structs(struct bpf_map *map, void *obj); + +int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags, + struct bpf_dynptr *ptr__uninit); + extern const struct bpf_map_ops bpf_map_offload_ops; /* bpf_type_flag contains a set of flags that are applicable to the values of @@ -785,12 +795,15 @@ enum bpf_type_flag { /* DYNPTR points to skb_metadata_end()-skb_metadata_len() */ DYNPTR_TYPE_SKB_META = BIT(19 + BPF_BASE_TYPE_BITS), + /* DYNPTR points to file */ + DYNPTR_TYPE_FILE = BIT(20 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; #define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB \ - | DYNPTR_TYPE_XDP | DYNPTR_TYPE_SKB_META) + | DYNPTR_TYPE_XDP | DYNPTR_TYPE_SKB_META | DYNPTR_TYPE_FILE) /* Max number of base types. 
*/ #define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) @@ -988,6 +1001,7 @@ enum bpf_reg_type { PTR_TO_ARENA, PTR_TO_BUF, /* reg points to a read/write buffer */ PTR_TO_FUNC, /* reg points to a bpf program function */ + PTR_TO_INSN, /* reg points to a bpf program instruction */ CONST_PTR_TO_DYNPTR, /* reg points to a const struct bpf_dynptr */ __BPF_REG_TYPE_MAX, @@ -1250,6 +1264,18 @@ typedef void (*bpf_trampoline_exit_t)(struct bpf_prog *prog, u64 start, bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog); bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog); +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP +static inline bool bpf_trampoline_use_jmp(u64 flags) +{ + return flags & BPF_TRAMP_F_CALL_ORIG && !(flags & BPF_TRAMP_F_SKIP_FRAME); +} +#else +static inline bool bpf_trampoline_use_jmp(u64 flags) +{ + return false; +} +#endif + struct bpf_ksym { unsigned long start; unsigned long end; @@ -1378,21 +1404,23 @@ enum bpf_dynptr_type { BPF_DYNPTR_TYPE_XDP, /* Points to skb_metadata_end()-skb_metadata_len() */ BPF_DYNPTR_TYPE_SKB_META, + /* Underlying data is a file */ + BPF_DYNPTR_TYPE_FILE, }; -int bpf_dynptr_check_size(u32 size); -u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr); -const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len); -void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len); +int bpf_dynptr_check_size(u64 size); +u64 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr); +const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u64 len); +void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u64 len); bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr); -int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, - void *src, u32 len, u64 flags); -void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset, - void *buffer__opt, u32 buffer__szk); +int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u64 offset, + void *src, u64 len, u64 flags); +void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset, + void *buffer__opt, u64 buffer__szk); -static inline int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len) +static inline int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u64 offset, u64 len) { - u32 size = __bpf_dynptr_size(ptr); + u64 size = __bpf_dynptr_size(ptr); if (len > size || offset > size - len) return -E2BIG; @@ -1616,6 +1644,7 @@ struct bpf_prog_aux { u32 ctx_arg_info_size; u32 max_rdonly_access; u32 max_rdwr_access; + u32 subprog_start; struct btf *attach_btf; struct bpf_ctx_arg_aux *ctx_arg_info; void __percpu *priv_stack_ptr; @@ -1905,12 +1934,14 @@ struct btf_member; * reason, if this callback is not defined, the check is skipped as * the struct_ops map will have final verification performed in * @reg. - * @type: BTF type. - * @value_type: Value type. + * @cfi_stubs: Pointer to a structure of stub functions for CFI. These stubs + * provide the correct Control Flow Integrity hashes for the + * trampolines generated by BPF struct_ops. + * @owner: The module that owns this struct_ops. Used for module reference + * counting to ensure the module providing the struct_ops cannot be + * unloaded while in use. * @name: The name of the struct bpf_struct_ops object. * @func_models: Func models - * @type_id: BTF type id. - * @value_id: BTF value id. 
*/ struct bpf_struct_ops { const struct bpf_verifier_ops *verifier_ops; @@ -2099,6 +2130,12 @@ struct bpf_array { }; }; +/* + * The bpf_array_get_next_key() function may be used for all array-like + * maps, i.e., maps with u32 keys with range [0 ,..., max_entries) + */ +int bpf_array_get_next_key(struct bpf_map *map, void *key, void *next_key); + #define BPF_COMPLEXITY_LIMIT_INSNS 1000000 /* yes. 1M insns */ #define MAX_TAIL_CALL_CNT 33 @@ -2374,6 +2411,9 @@ bpf_prog_run_array_uprobe(const struct bpf_prog_array *array, bool bpf_jit_bypass_spec_v1(void); bool bpf_jit_bypass_spec_v4(void); +#define bpf_rcu_lock_held() \ + (rcu_read_lock_held() || rcu_read_lock_trace_held() || rcu_read_lock_bh_held()) + #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); extern struct mutex bpf_stats_enabled_mutex; @@ -3670,12 +3710,14 @@ static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, #endif /* CONFIG_INET */ enum bpf_text_poke_type { + BPF_MOD_NOP, BPF_MOD_CALL, BPF_MOD_JUMP, }; -int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, - void *addr1, void *addr2); +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t, + enum bpf_text_poke_type new_t, void *old_addr, + void *new_addr); void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, struct bpf_prog *new, struct bpf_prog *old); @@ -3772,4 +3814,30 @@ int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char * const char **linep, int *nump); struct bpf_prog *bpf_prog_find_from_stack(void); +int bpf_insn_array_init(struct bpf_map *map, const struct bpf_prog *prog); +int bpf_insn_array_ready(struct bpf_map *map); +void bpf_insn_array_release(struct bpf_map *map); +void bpf_insn_array_adjust(struct bpf_map *map, u32 off, u32 len); +void bpf_insn_array_adjust_after_remove(struct bpf_map *map, u32 off, u32 len); + +#ifdef CONFIG_BPF_SYSCALL +void bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image); +#else +static inline void +bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image) +{ +} +#endif + +static inline int bpf_map_check_op_flags(struct bpf_map *map, u64 flags, u64 allowed_flags) +{ + if (flags & ~allowed_flags) + return -EINVAL; + + if ((flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)) + return -EINVAL; + + return 0; +} + #endif /* _LINUX_BPF_H */ diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index ab7244d8108f60..66432248cd810e 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -18,9 +18,6 @@ #define BPF_LOCAL_STORAGE_CACHE_SIZE 16 -#define bpf_rcu_lock_held() \ - (rcu_read_lock_held() || rcu_read_lock_trace_held() || \ - rcu_read_lock_bh_held()) struct bpf_local_storage_map_bucket { struct hlist_head list; raw_spinlock_t lock; @@ -56,9 +53,7 @@ struct bpf_local_storage_map { u32 bucket_log; u16 elem_size; u16 cache_idx; - struct bpf_mem_alloc selem_ma; - struct bpf_mem_alloc storage_ma; - bool bpf_ma; + bool use_kmalloc_nolock; }; struct bpf_local_storage_data { @@ -100,6 +95,7 @@ struct bpf_local_storage { */ struct rcu_head rcu; raw_spinlock_t lock; /* Protect adding/removing from the "list" */ + bool use_kmalloc_nolock; }; /* U16_MAX is much more than enough for sk local storage @@ -133,7 +129,7 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr); struct bpf_map * bpf_local_storage_map_alloc(union bpf_attr *attr, struct bpf_local_storage_cache *cache, - bool bpf_ma); + bool use_kmalloc_nolock); void 
__bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *smap, @@ -187,10 +183,9 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, - bool charge_mem, bool swap_uptrs, gfp_t gfp_flags); + bool swap_uptrs, gfp_t gfp_flags); void bpf_selem_free(struct bpf_local_storage_elem *selem, - struct bpf_local_storage_map *smap, bool reuse_now); int diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index fa78f49d4a9a64..b13de31e163f84 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -133,6 +133,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_ARENA, arena_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_INSN_ARRAY, insn_array_map_ops) BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint) BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 4c497e839526a4..130bcbd66f6060 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -416,7 +416,7 @@ struct bpf_verifier_state { u32 active_irq_id; u32 active_lock_id; void *active_lock_ptr; - bool active_rcu_lock; + u32 active_rcu_locks; bool speculative; bool in_sleepable; @@ -509,6 +509,15 @@ struct bpf_map_ptr_state { #define BPF_ALU_SANITIZE (BPF_ALU_SANITIZE_SRC | \ BPF_ALU_SANITIZE_DST) +/* + * An array of BPF instructions. + * Primary usage: return value of bpf_insn_successors. + */ +struct bpf_iarray { + int cnt; + u32 items[]; +}; + struct bpf_insn_aux_data { union { enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ @@ -540,6 +549,7 @@ struct bpf_insn_aux_data { /* remember the offset of node field within type to rewrite */ u64 insert_off; }; + struct bpf_iarray *jt; /* jump table for gotox or bpf_tailcall call instruction */ struct btf_struct_meta *kptr_struct_meta; u64 map_key_state; /* constant (32 bit) key tracking for maps */ int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ @@ -548,7 +558,7 @@ struct bpf_insn_aux_data { bool nospec_result; /* result is unsafe under speculation, nospec must follow */ bool zext_dst; /* this insn zero extends dst reg */ bool needs_zext; /* alu op needs to clear upper bits */ - bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */ + bool non_sleepable; /* helper/kfunc may be called from non-sleepable context */ bool is_iter_next; /* bpf_iter_<type>_next() kfunc call */ bool call_with_percpu_alloc_ptr; /* {this,per}_cpu_ptr() with prog percpu alloc */ u8 alu_state; /* used in combination with alu_limit */ @@ -642,6 +652,7 @@ struct bpf_subprog_info { u32 start; /* insn idx of function entry point */ u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ u32 postorder_start; /* The idx to the env->cfg.insn_postorder */ + u32 exit_idx; /* Index of one of the BPF_EXIT instructions in this subprogram */ u16 stack_depth; /* max. stack depth used by this function */ u16 stack_extra; /* offsets in range [stack_depth .. 
fastcall_stack_off) @@ -659,9 +670,9 @@ struct bpf_subprog_info { bool keep_fastcall_stack: 1; bool changes_pkt_data: 1; bool might_sleep: 1; + u8 arg_cnt:3; enum priv_stack_mode priv_stack_mode; - u8 arg_cnt; struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS]; }; @@ -745,8 +756,10 @@ struct bpf_verifier_env { struct list_head free_list; /* list of struct bpf_verifier_state_list */ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ struct btf_mod_pair used_btfs[MAX_USED_BTFS]; /* array of BTF's used by BPF program */ + struct bpf_map *insn_array_maps[MAX_USED_MAPS]; /* array of INSN_ARRAY map's to be relocated */ u32 used_map_cnt; /* number of used maps */ u32 used_btf_cnt; /* number of used BTF objects */ + u32 insn_array_map_cnt; /* number of used maps of type BPF_MAP_TYPE_INSN_ARRAY */ u32 id_gen; /* used to generate unique reg IDs */ u32 hidden_subprog_cnt; /* number of hidden subprogs */ int exception_callback_subprog; @@ -828,6 +841,8 @@ struct bpf_verifier_env { /* array of pointers to bpf_scc_info indexed by SCC id */ struct bpf_scc_info **scc_info; u32 scc_cnt; + struct bpf_iarray *succ; + struct bpf_iarray *gotox_tmp_buf; }; static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog) @@ -1038,6 +1053,13 @@ static inline bool bpf_stack_narrow_access_ok(int off, int fill_size, int spill_ return !(off % BPF_REG_SIZE); } +static inline bool insn_is_gotox(struct bpf_insn *insn) +{ + return BPF_CLASS(insn->code) == BPF_JMP && + BPF_OP(insn->code) == BPF_JA && + BPF_SRC(insn->code) == BPF_X; +} + const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type); const char *dynptr_type_str(enum bpf_dynptr_type type); const char *iter_type_str(const struct btf *btf, u32 btf_id); @@ -1050,7 +1072,7 @@ void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_st struct bpf_subprog_info *bpf_find_containing_subprog(struct bpf_verifier_env *env, int off); int bpf_jmp_offset(struct bpf_insn *insn); -int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]); +struct bpf_iarray *bpf_insn_successors(struct bpf_verifier_env *env, u32 idx); void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask); bool bpf_calls_callback(struct bpf_verifier_env *env, int insn_idx); diff --git a/include/linux/buildid.h b/include/linux/buildid.h index 014a88c410739e..831c1b4b626c87 100644 --- a/include/linux/buildid.h +++ b/include/linux/buildid.h @@ -18,4 +18,29 @@ void init_vmlinux_build_id(void); static inline void init_vmlinux_build_id(void) { } #endif +struct freader { + void *buf; + u32 buf_sz; + int err; + union { + struct { + struct file *file; + struct folio *folio; + void *addr; + loff_t folio_off; + bool may_fault; + }; + struct { + const char *data; + u64 data_sz; + }; + }; +}; + +void freader_init_from_file(struct freader *r, void *buf, u32 buf_sz, + struct file *file, bool may_fault); +void freader_init_from_mem(struct freader *r, const char *data, u64 data_sz); +const void *freader_fetch(struct freader *r, loff_t file_off, size_t sz); +void freader_cleanup(struct freader *r); + #endif diff --git a/include/linux/filter.h b/include/linux/filter.h index 973233b82dc1fd..569de3b14279af 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -712,11 +712,13 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog, ret = dfunc(ctx, prog->insnsi, prog->bpf_func); duration = sched_clock() - start; - stats = this_cpu_ptr(prog->stats); - flags = 
u64_stats_update_begin_irqsave(&stats->syncp); - u64_stats_inc(&stats->cnt); - u64_stats_add(&stats->nsecs, duration); - u64_stats_update_end_irqrestore(&stats->syncp, flags); + if (likely(prog->stats)) { + stats = this_cpu_ptr(prog->stats); + flags = u64_stats_update_begin_irqsave(&stats->syncp); + u64_stats_inc(&stats->cnt); + u64_stats_add(&stats->nsecs, duration); + u64_stats_update_end_irqrestore(&stats->syncp, flags); + } } else { ret = dfunc(ctx, prog->insnsi, prog->bpf_func); } diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 07f8c309e43273..015dd1049bea05 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -359,6 +359,7 @@ enum { FTRACE_OPS_FL_DIRECT = BIT(17), FTRACE_OPS_FL_SUBOP = BIT(18), FTRACE_OPS_FL_GRAPH = BIT(19), + FTRACE_OPS_FL_JMP = BIT(20), }; #ifndef CONFIG_DYNAMIC_FTRACE_WITH_ARGS @@ -577,6 +578,38 @@ static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs, unsigned long addr) { } #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP +static inline bool ftrace_is_jmp(unsigned long addr) +{ + return addr & 1; +} + +static inline unsigned long ftrace_jmp_set(unsigned long addr) +{ + return addr | 1UL; +} + +static inline unsigned long ftrace_jmp_get(unsigned long addr) +{ + return addr & ~1UL; +} +#else +static inline bool ftrace_is_jmp(unsigned long addr) +{ + return false; +} + +static inline unsigned long ftrace_jmp_set(unsigned long addr) +{ + return addr; +} + +static inline unsigned long ftrace_jmp_get(unsigned long addr) +{ + return addr; +} +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_JMP */ + #ifdef CONFIG_STACK_TRACER int stack_trace_sysctl(const struct ctl_table *table, int write, void *buffer, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6829936d33f58e..f5713f59ac10a0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1026,6 +1026,7 @@ enum bpf_map_type { BPF_MAP_TYPE_USER_RINGBUF, BPF_MAP_TYPE_CGRP_STORAGE, BPF_MAP_TYPE_ARENA, + BPF_MAP_TYPE_INSN_ARRAY, __MAX_BPF_MAP_TYPE }; @@ -1430,6 +1431,9 @@ enum { /* Do not translate kernel bpf_arena pointers to user pointers */ BPF_F_NO_USER_CONV = (1U << 18), + +/* Enable BPF ringbuf overwrite mode */ + BPF_F_RB_OVERWRITE = (1U << 19), }; /* Flags for BPF_PROG_QUERY. */ @@ -5618,7 +5622,7 @@ union bpf_attr { * Return * *sk* if casting is valid, or **NULL** otherwise. * - * long bpf_dynptr_from_mem(void *data, u32 size, u64 flags, struct bpf_dynptr *ptr) + * long bpf_dynptr_from_mem(void *data, u64 size, u64 flags, struct bpf_dynptr *ptr) * Description * Get a dynptr to local memory *data*. * @@ -5661,7 +5665,7 @@ union bpf_attr { * Return * Nothing. Always succeeds. * - * long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags) + * long bpf_dynptr_read(void *dst, u64 len, const struct bpf_dynptr *src, u64 offset, u64 flags) * Description * Read *len* bytes from *src* into *dst*, starting from *offset* * into *src*. @@ -5671,7 +5675,7 @@ union bpf_attr { * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if * *flags* is not 0. * - * long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) + * long bpf_dynptr_write(const struct bpf_dynptr *dst, u64 offset, void *src, u64 len, u64 flags) * Description * Write *len* bytes from *src* into *dst*, starting from *offset* * into *dst*. @@ -5692,7 +5696,7 @@ union bpf_attr { * is a read-only dynptr or if *flags* is not correct. 
For skb-type dynptrs, * other errors correspond to errors returned by **bpf_skb_store_bytes**\ (). * - * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len) + * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u64 offset, u64 len) * Description * Get a pointer to the underlying dynptr data. * @@ -6231,6 +6235,7 @@ enum { BPF_RB_RING_SIZE = 1, BPF_RB_CONS_POS = 2, BPF_RB_PROD_POS = 3, + BPF_RB_OVERWRITE_POS = 4, }; /* BPF ring buffer constants */ @@ -7645,4 +7650,24 @@ enum bpf_kfunc_flags { BPF_F_PAD_ZEROS = (1ULL << 0), }; +/* + * Values of a BPF_MAP_TYPE_INSN_ARRAY entry must be of this type. + * + * Before the map is used the orig_off field should point to an + * instruction inside the program being loaded. The other fields + * must be set to 0. + * + * After the program is loaded, the xlated_off will be adjusted + * by the verifier to point to the index of the original instruction + * in the xlated program. If the instruction is deleted, it will + * be set to (u32)-1. The jitted_off will be set to the corresponding + * offset in the jitted image of the program. + */ +struct bpf_insn_array_value { + __u32 orig_off; + __u32 xlated_off; + __u32 jitted_off; + __u32 :32; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 7fd0badfacb12f..232cbc97434db9 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -9,7 +9,7 @@ CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o liveness.o obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o -obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o +obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o bpf_insn_array.o obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 80b1765a315969..1eeb31c5b317d6 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -335,18 +335,17 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) } /* Called from syscall */ -static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +int bpf_array_get_next_key(struct bpf_map *map, void *key, void *next_key) { - struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = key ? *(u32 *)key : U32_MAX; u32 *next = (u32 *)next_key; - if (index >= array->map.max_entries) { + if (index >= map->max_entries) { *next = 0; return 0; } - if (index == array->map.max_entries - 1) + if (index == map->max_entries - 1) return -ENOENT; *next = index + 1; @@ -448,19 +447,12 @@ static void array_map_free_internal_structs(struct bpf_map *map) struct bpf_array *array = container_of(map, struct bpf_array, map); int i; - /* We don't reset or free fields other than timer and workqueue - * on uref dropping to zero. 
- */ - if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) { - for (i = 0; i < array->map.max_entries; i++) { - if (btf_record_has_field(map->record, BPF_TIMER)) - bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); - if (btf_record_has_field(map->record, BPF_WORKQUEUE)) - bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i)); - if (btf_record_has_field(map->record, BPF_TASK_WORK)) - bpf_obj_free_task_work(map->record, array_map_elem_ptr(array, i)); - } - } + /* We only free internal structs on uref dropping to zero */ + if (!bpf_map_has_internal_structs(map)) + return; + + for (i = 0; i < array->map.max_entries; i++) + bpf_map_free_internal_structs(map, array_map_elem_ptr(array, i)); } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ @@ -796,7 +788,7 @@ const struct bpf_map_ops array_map_ops = { .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, - .map_get_next_key = array_map_get_next_key, + .map_get_next_key = bpf_array_get_next_key, .map_release_uref = array_map_free_internal_structs, .map_lookup_elem = array_map_lookup_elem, .map_update_elem = array_map_update_elem, @@ -822,7 +814,7 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, - .map_get_next_key = array_map_get_next_key, + .map_get_next_key = bpf_array_get_next_key, .map_lookup_elem = percpu_array_map_lookup_elem, .map_gen_lookup = percpu_array_map_gen_lookup, .map_update_elem = array_map_update_elem, @@ -1211,7 +1203,7 @@ const struct bpf_map_ops prog_array_map_ops = { .map_poke_track = prog_array_map_poke_track, .map_poke_untrack = prog_array_map_poke_untrack, .map_poke_run = prog_array_map_poke_run, - .map_get_next_key = array_map_get_next_key, + .map_get_next_key = bpf_array_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = prog_fd_array_get_ptr, @@ -1315,7 +1307,7 @@ const struct bpf_map_ops perf_event_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = perf_event_fd_array_map_free, - .map_get_next_key = array_map_get_next_key, + .map_get_next_key = bpf_array_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = perf_event_fd_array_get_ptr, @@ -1351,7 +1343,7 @@ const struct bpf_map_ops cgroup_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = cgroup_fd_array_free, - .map_get_next_key = array_map_get_next_key, + .map_get_next_key = bpf_array_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = cgroup_fd_array_get_ptr, @@ -1436,7 +1428,7 @@ const struct bpf_map_ops array_of_maps_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_of_map_alloc, .map_free = array_of_map_free, - .map_get_next_key = array_map_get_next_key, + .map_get_next_key = bpf_array_get_next_key, .map_lookup_elem = array_of_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = bpf_map_fd_get_ptr, diff --git a/kernel/bpf/bpf_insn_array.c b/kernel/bpf/bpf_insn_array.c new file mode 100644 index 00000000000000..c96630cb75bf7a --- /dev/null +++ b/kernel/bpf/bpf_insn_array.c @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Isovalent */ 
+ +#include <linux/bpf.h> + +struct bpf_insn_array { + struct bpf_map map; + atomic_t used; + long *ips; + DECLARE_FLEX_ARRAY(struct bpf_insn_array_value, values); +}; + +#define cast_insn_array(MAP_PTR) \ + container_of((MAP_PTR), struct bpf_insn_array, map) + +#define INSN_DELETED ((u32)-1) + +static inline u64 insn_array_alloc_size(u32 max_entries) +{ + const u64 base_size = sizeof(struct bpf_insn_array); + const u64 entry_size = sizeof(struct bpf_insn_array_value); + + return base_size + max_entries * (entry_size + sizeof(long)); +} + +static int insn_array_alloc_check(union bpf_attr *attr) +{ + u32 value_size = sizeof(struct bpf_insn_array_value); + + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size != value_size || attr->map_flags != 0) + return -EINVAL; + + return 0; +} + +static void insn_array_free(struct bpf_map *map) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + + bpf_map_area_free(insn_array); +} + +static struct bpf_map *insn_array_alloc(union bpf_attr *attr) +{ + u64 size = insn_array_alloc_size(attr->max_entries); + struct bpf_insn_array *insn_array; + + insn_array = bpf_map_area_alloc(size, NUMA_NO_NODE); + if (!insn_array) + return ERR_PTR(-ENOMEM); + + /* ips are allocated right after the insn_array->values[] array */ + insn_array->ips = (void *)&insn_array->values[attr->max_entries]; + + bpf_map_init_from_attr(&insn_array->map, attr); + + /* BPF programs aren't allowed to write to the map */ + insn_array->map.map_flags |= BPF_F_RDONLY_PROG; + + return &insn_array->map; +} + +static void *insn_array_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + u32 index = *(u32 *)key; + + if (unlikely(index >= insn_array->map.max_entries)) + return NULL; + + return &insn_array->values[index]; +} + +static long insn_array_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + u32 index = *(u32 *)key; + struct bpf_insn_array_value val = {}; + + if (unlikely(index >= insn_array->map.max_entries)) + return -E2BIG; + + if (unlikely(map_flags & BPF_NOEXIST)) + return -EEXIST; + + copy_map_value(map, &val, value); + if (val.jitted_off || val.xlated_off) + return -EINVAL; + + insn_array->values[index].orig_off = val.orig_off; + + return 0; +} + +static long insn_array_delete_elem(struct bpf_map *map, void *key) +{ + return -EINVAL; +} + +static int insn_array_check_btf(const struct bpf_map *map, + const struct btf *btf, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + if (!btf_type_is_i32(key_type)) + return -EINVAL; + + if (!btf_type_is_i64(value_type)) + return -EINVAL; + + return 0; +} + +static u64 insn_array_mem_usage(const struct bpf_map *map) +{ + return insn_array_alloc_size(map->max_entries); +} + +static int insn_array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + + if ((off % sizeof(long)) != 0 || + (off / sizeof(long)) >= map->max_entries) + return -EINVAL; + + /* from BPF's point of view, this map is a jump table */ + *imm = (unsigned long)insn_array->ips + off; + + return 0; +} + +BTF_ID_LIST_SINGLE(insn_array_btf_ids, struct, bpf_insn_array) + +const struct bpf_map_ops insn_array_map_ops = { + .map_alloc_check = insn_array_alloc_check, + .map_alloc = insn_array_alloc, + .map_free = insn_array_free, + .map_get_next_key = bpf_array_get_next_key, + .map_lookup_elem = 
insn_array_lookup_elem, + .map_update_elem = insn_array_update_elem, + .map_delete_elem = insn_array_delete_elem, + .map_check_btf = insn_array_check_btf, + .map_mem_usage = insn_array_mem_usage, + .map_direct_value_addr = insn_array_map_direct_value_addr, + .map_btf_id = &insn_array_btf_ids[0], +}; + +static inline bool is_frozen(struct bpf_map *map) +{ + guard(mutex)(&map->freeze_mutex); + + return map->frozen; +} + +static bool is_insn_array(const struct bpf_map *map) +{ + return map->map_type == BPF_MAP_TYPE_INSN_ARRAY; +} + +static inline bool valid_offsets(const struct bpf_insn_array *insn_array, + const struct bpf_prog *prog) +{ + u32 off; + int i; + + for (i = 0; i < insn_array->map.max_entries; i++) { + off = insn_array->values[i].orig_off; + + if (off >= prog->len) + return false; + + if (off > 0) { + if (prog->insnsi[off-1].code == (BPF_LD | BPF_DW | BPF_IMM)) + return false; + } + } + + return true; +} + +int bpf_insn_array_init(struct bpf_map *map, const struct bpf_prog *prog) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + struct bpf_insn_array_value *values = insn_array->values; + int i; + + if (!is_frozen(map)) + return -EINVAL; + + if (!valid_offsets(insn_array, prog)) + return -EINVAL; + + /* + * There can be only one program using the map + */ + if (atomic_xchg(&insn_array->used, 1)) + return -EBUSY; + + /* + * Reset all the map indexes to the original values. This is needed, + * e.g., when a replay of verification with different log level should + * be performed. + */ + for (i = 0; i < map->max_entries; i++) + values[i].xlated_off = values[i].orig_off; + + return 0; +} + +int bpf_insn_array_ready(struct bpf_map *map) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + int i; + + for (i = 0; i < map->max_entries; i++) { + if (insn_array->values[i].xlated_off == INSN_DELETED) + continue; + if (!insn_array->ips[i]) + return -EFAULT; + } + + return 0; +} + +void bpf_insn_array_release(struct bpf_map *map) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + + atomic_set(&insn_array->used, 0); +} + +void bpf_insn_array_adjust(struct bpf_map *map, u32 off, u32 len) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + int i; + + if (len <= 1) + return; + + for (i = 0; i < map->max_entries; i++) { + if (insn_array->values[i].xlated_off <= off) + continue; + if (insn_array->values[i].xlated_off == INSN_DELETED) + continue; + insn_array->values[i].xlated_off += len - 1; + } +} + +void bpf_insn_array_adjust_after_remove(struct bpf_map *map, u32 off, u32 len) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + int i; + + for (i = 0; i < map->max_entries; i++) { + if (insn_array->values[i].xlated_off < off) + continue; + if (insn_array->values[i].xlated_off == INSN_DELETED) + continue; + if (insn_array->values[i].xlated_off < off + len) + insn_array->values[i].xlated_off = INSN_DELETED; + else + insn_array->values[i].xlated_off -= len; + } +} + +/* + * This function is called by JITs. The image is the real program + * image, the offsets array set up the xlated -> jitted mapping. + * The offsets[xlated] offset should point to the beginning of + * the jitted instruction. 
+ */ +void bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image) +{ + struct bpf_insn_array *insn_array; + struct bpf_map *map; + u32 xlated_off; + int i, j; + + if (!offsets || !image) + return; + + for (i = 0; i < prog->aux->used_map_cnt; i++) { + map = prog->aux->used_maps[i]; + if (!is_insn_array(map)) + continue; + + insn_array = cast_insn_array(map); + for (j = 0; j < map->max_entries; j++) { + xlated_off = insn_array->values[j].xlated_off; + if (xlated_off == INSN_DELETED) + continue; + if (xlated_off < prog->aux->subprog_start) + continue; + xlated_off -= prog->aux->subprog_start; + if (xlated_off >= prog->len) + continue; + + insn_array->values[j].jitted_off = offsets[xlated_off]; + insn_array->ips[j] = (long)(image + offsets[xlated_off]); + } + } +} diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index b931fbceb54da0..e2fe6c32822b89 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -73,30 +73,24 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem) struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, - void *value, bool charge_mem, bool swap_uptrs, gfp_t gfp_flags) + void *value, bool swap_uptrs, gfp_t gfp_flags) { struct bpf_local_storage_elem *selem; - if (charge_mem && mem_charge(smap, owner, smap->elem_size)) + if (mem_charge(smap, owner, smap->elem_size)) return NULL; - if (smap->bpf_ma) { - selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags); - if (selem) - /* Keep the original bpf_map_kzalloc behavior - * before started using the bpf_mem_cache_alloc. - * - * No need to use zero_map_value. The bpf_selem_free() - * only does bpf_mem_cache_free when there is - * no other bpf prog is using the selem. 
- */ - memset(SDATA(selem)->data, 0, smap->map.value_size); + if (smap->use_kmalloc_nolock) { + selem = bpf_map_kmalloc_nolock(&smap->map, smap->elem_size, + __GFP_ZERO, NUMA_NO_NODE); } else { selem = bpf_map_kzalloc(&smap->map, smap->elem_size, gfp_flags | __GFP_NOWARN); } if (selem) { + RCU_INIT_POINTER(SDATA(selem)->smap, smap); + if (value) { /* No need to call check_and_init_map_value as memory is zero init */ copy_map_value(&smap->map, SDATA(selem)->data, value); @@ -106,13 +100,12 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, return selem; } - if (charge_mem) - mem_uncharge(smap, owner, smap->elem_size); + mem_uncharge(smap, owner, smap->elem_size); return NULL; } -/* rcu tasks trace callback for bpf_ma == false */ +/* rcu tasks trace callback for use_kmalloc_nolock == false */ static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) { struct bpf_local_storage *local_storage; @@ -127,12 +120,23 @@ static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) kfree_rcu(local_storage, rcu); } +/* Handle use_kmalloc_nolock == false */ +static void __bpf_local_storage_free(struct bpf_local_storage *local_storage, + bool vanilla_rcu) +{ + if (vanilla_rcu) + kfree_rcu(local_storage, rcu); + else + call_rcu_tasks_trace(&local_storage->rcu, + __bpf_local_storage_free_trace_rcu); +} + static void bpf_local_storage_free_rcu(struct rcu_head *rcu) { struct bpf_local_storage *local_storage; local_storage = container_of(rcu, struct bpf_local_storage, rcu); - bpf_mem_cache_raw_free(local_storage); + kfree_nolock(local_storage); } static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) @@ -143,46 +147,27 @@ static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) call_rcu(rcu, bpf_local_storage_free_rcu); } -/* Handle bpf_ma == false */ -static void __bpf_local_storage_free(struct bpf_local_storage *local_storage, - bool vanilla_rcu) -{ - if (vanilla_rcu) - kfree_rcu(local_storage, rcu); - else - call_rcu_tasks_trace(&local_storage->rcu, - __bpf_local_storage_free_trace_rcu); -} - static void bpf_local_storage_free(struct bpf_local_storage *local_storage, - struct bpf_local_storage_map *smap, - bool bpf_ma, bool reuse_now) + bool reuse_now) { if (!local_storage) return; - if (!bpf_ma) { + if (!local_storage->use_kmalloc_nolock) { __bpf_local_storage_free(local_storage, reuse_now); return; } - if (!reuse_now) { - call_rcu_tasks_trace(&local_storage->rcu, - bpf_local_storage_free_trace_rcu); + if (reuse_now) { + call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu); return; } - if (smap) - bpf_mem_cache_free(&smap->storage_ma, local_storage); - else - /* smap could be NULL if the selem that triggered - * this 'local_storage' creation had been long gone. - * In this case, directly do call_rcu(). 
- */ - call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu); + call_rcu_tasks_trace(&local_storage->rcu, + bpf_local_storage_free_trace_rcu); } -/* rcu tasks trace callback for bpf_ma == false */ +/* rcu tasks trace callback for use_kmalloc_nolock == false */ static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu) { struct bpf_local_storage_elem *selem; @@ -194,7 +179,7 @@ static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu) kfree_rcu(selem, rcu); } -/* Handle bpf_ma == false */ +/* Handle use_kmalloc_nolock == false */ static void __bpf_selem_free(struct bpf_local_storage_elem *selem, bool vanilla_rcu) { @@ -216,7 +201,7 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu) migrate_disable(); bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); migrate_enable(); - bpf_mem_cache_raw_free(selem); + kfree_nolock(selem); } static void bpf_selem_free_trace_rcu(struct rcu_head *rcu) @@ -228,14 +213,17 @@ static void bpf_selem_free_trace_rcu(struct rcu_head *rcu) } void bpf_selem_free(struct bpf_local_storage_elem *selem, - struct bpf_local_storage_map *smap, bool reuse_now) { - if (!smap->bpf_ma) { - /* Only task storage has uptrs and task storage - * has moved to bpf_mem_alloc. Meaning smap->bpf_ma == true - * for task storage, so this bpf_obj_free_fields() won't unpin - * any uptr. + struct bpf_local_storage_map *smap; + + smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); + + if (!smap->use_kmalloc_nolock) { + /* + * No uptr will be unpin even when reuse_now == false since uptr + * is only supported in task local storage, where + * smap->use_kmalloc_nolock == true. */ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); __bpf_selem_free(selem, reuse_now); @@ -243,18 +231,11 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, } if (reuse_now) { - /* reuse_now == true only happens when the storage owner - * (e.g. task_struct) is being destructed or the map itself - * is being destructed (ie map_free). In both cases, - * no bpf prog can have a hold on the selem. It is - * safe to unpin the uptrs and free the selem now. - */ - bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); - /* Instead of using the vanilla call_rcu(), - * bpf_mem_cache_free will be able to reuse selem - * immediately. + /* + * While it is okay to call bpf_obj_free_fields() that unpins uptr when + * reuse_now == true, keep it in bpf_selem_free_rcu() for simplicity. */ - bpf_mem_cache_free(&smap->selem_ma, selem); + call_rcu(&selem->rcu, bpf_selem_free_rcu); return; } @@ -264,7 +245,6 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now) { struct bpf_local_storage_elem *selem; - struct bpf_local_storage_map *smap; struct hlist_node *n; /* The "_safe" iteration is needed. @@ -272,10 +252,8 @@ static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now) * but bpf_selem_free will use the selem->rcu_head * which is union-ized with the selem->free_node. */ - hlist_for_each_entry_safe(selem, n, list, free_node) { - smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); - bpf_selem_free(selem, smap, reuse_now); - } + hlist_for_each_entry_safe(selem, n, list, free_node) + bpf_selem_free(selem, reuse_now); } /* local_storage->lock must be held and selem->local_storage == local_storage. 
@@ -284,7 +262,7 @@ static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now) */ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem, - bool uncharge_mem, struct hlist_head *free_selem_list) + struct hlist_head *free_selem_list) { struct bpf_local_storage_map *smap; bool free_local_storage; @@ -297,8 +275,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor * The owner may be freed once the last selem is unlinked * from local_storage. */ - if (uncharge_mem) - mem_uncharge(smap, owner, smap->elem_size); + mem_uncharge(smap, owner, smap->elem_size); free_local_storage = hlist_is_singular_node(&selem->snode, &local_storage->list); @@ -336,47 +313,11 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor return free_local_storage; } -static bool check_storage_bpf_ma(struct bpf_local_storage *local_storage, - struct bpf_local_storage_map *storage_smap, - struct bpf_local_storage_elem *selem) -{ - - struct bpf_local_storage_map *selem_smap; - - /* local_storage->smap may be NULL. If it is, get the bpf_ma - * from any selem in the local_storage->list. The bpf_ma of all - * local_storage and selem should have the same value - * for the same map type. - * - * If the local_storage->list is already empty, the caller will not - * care about the bpf_ma value also because the caller is not - * responsible to free the local_storage. - */ - - if (storage_smap) - return storage_smap->bpf_ma; - - if (!selem) { - struct hlist_node *n; - - n = rcu_dereference_check(hlist_first_rcu(&local_storage->list), - bpf_rcu_lock_held()); - if (!n) - return false; - - selem = hlist_entry(n, struct bpf_local_storage_elem, snode); - } - selem_smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); - - return selem_smap->bpf_ma; -} - static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, bool reuse_now) { - struct bpf_local_storage_map *storage_smap; struct bpf_local_storage *local_storage; - bool bpf_ma, free_local_storage = false; + bool free_local_storage = false; HLIST_HEAD(selem_free_list); unsigned long flags; @@ -386,20 +327,17 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held()); - storage_smap = rcu_dereference_check(local_storage->smap, - bpf_rcu_lock_held()); - bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, selem); raw_spin_lock_irqsave(&local_storage->lock, flags); if (likely(selem_linked_to_storage(selem))) free_local_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, true, &selem_free_list); + local_storage, selem, &selem_free_list); raw_spin_unlock_irqrestore(&local_storage->lock, flags); bpf_selem_free_list(&selem_free_list, reuse_now); if (free_local_storage) - bpf_local_storage_free(local_storage, storage_smap, bpf_ma, reuse_now); + bpf_local_storage_free(local_storage, reuse_now); } void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, @@ -434,7 +372,6 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap, unsigned long flags; raw_spin_lock_irqsave(&b->lock, flags); - RCU_INIT_POINTER(SDATA(selem)->smap, smap); hlist_add_head_rcu(&selem->map_node, &b->list); raw_spin_unlock_irqrestore(&b->lock, flags); } @@ -493,8 +430,9 @@ int bpf_local_storage_alloc(void *owner, if (err) return err; - if (smap->bpf_ma) - storage = bpf_mem_cache_alloc_flags(&smap->storage_ma, 
gfp_flags); + if (smap->use_kmalloc_nolock) + storage = bpf_map_kmalloc_nolock(&smap->map, sizeof(*storage), + __GFP_ZERO, NUMA_NO_NODE); else storage = bpf_map_kzalloc(&smap->map, sizeof(*storage), gfp_flags | __GFP_NOWARN); @@ -507,6 +445,7 @@ int bpf_local_storage_alloc(void *owner, INIT_HLIST_HEAD(&storage->list); raw_spin_lock_init(&storage->lock); storage->owner = owner; + storage->use_kmalloc_nolock = smap->use_kmalloc_nolock; bpf_selem_link_storage_nolock(storage, first_selem); bpf_selem_link_map(smap, first_selem); @@ -528,22 +467,12 @@ int bpf_local_storage_alloc(void *owner, bpf_selem_unlink_map(first_selem); err = -EAGAIN; goto uncharge; - - /* Note that even first_selem was linked to smap's - * bucket->list, first_selem can be freed immediately - * (instead of kfree_rcu) because - * bpf_local_storage_map_free() does a - * synchronize_rcu_mult (waiting for both sleepable and - * normal programs) before walking the bucket->list. - * Hence, no one is accessing selem from the - * bucket->list under rcu_read_lock(). - */ } return 0; uncharge: - bpf_local_storage_free(storage, smap, smap->bpf_ma, true); + bpf_local_storage_free(storage, true); mem_uncharge(smap, owner, sizeof(*storage)); return err; } @@ -582,13 +511,13 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (err) return ERR_PTR(err); - selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags); + selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags); if (!selem) return ERR_PTR(-ENOMEM); err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags); if (err) { - bpf_selem_free(selem, smap, true); + bpf_selem_free(selem, true); mem_uncharge(smap, owner, smap->elem_size); return ERR_PTR(err); } @@ -616,7 +545,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, /* A lookup has just been done before and concluded a new selem is * needed. The chance of an unnecessary alloc is unlikely. */ - alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags); + alloc_selem = selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags); if (!alloc_selem) return ERR_PTR(-ENOMEM); @@ -656,7 +585,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (old_sdata) { bpf_selem_unlink_map(SELEM(old_sdata)); bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata), - true, &old_selem_free_list); + &old_selem_free_list); } unlock: @@ -664,7 +593,7 @@ unlock: bpf_selem_free_list(&old_selem_free_list, false); if (alloc_selem) { mem_uncharge(smap, owner, smap->elem_size); - bpf_selem_free(alloc_selem, smap, true); + bpf_selem_free(alloc_selem, true); } return err ? ERR_PTR(err) : SDATA(selem); } @@ -730,16 +659,12 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) { - struct bpf_local_storage_map *storage_smap; struct bpf_local_storage_elem *selem; - bool bpf_ma, free_storage = false; + bool free_storage = false; HLIST_HEAD(free_selem_list); struct hlist_node *n; unsigned long flags; - storage_smap = rcu_dereference_check(local_storage->smap, bpf_rcu_lock_held()); - bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, NULL); - /* Neither the bpf_prog nor the bpf_map's syscall * could be modifying the local_storage->list now. 
* Thus, no elem can be added to or deleted from the @@ -762,14 +687,14 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) * of the loop will set the free_cgroup_storage to true. */ free_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, true, &free_selem_list); + local_storage, selem, &free_selem_list); } raw_spin_unlock_irqrestore(&local_storage->lock, flags); bpf_selem_free_list(&free_selem_list, true); if (free_storage) - bpf_local_storage_free(local_storage, storage_smap, bpf_ma, true); + bpf_local_storage_free(local_storage, true); } u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map) @@ -782,20 +707,10 @@ u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map) return usage; } -/* When bpf_ma == true, the bpf_mem_alloc is used to allocate and free memory. - * A deadlock free allocator is useful for storage that the bpf prog can easily - * get a hold of the owner PTR_TO_BTF_ID in any context. eg. bpf_get_current_task_btf. - * The task and cgroup storage fall into this case. The bpf_mem_alloc reuses - * memory immediately. To be reuse-immediate safe, the owner destruction - * code path needs to go through a rcu grace period before calling - * bpf_local_storage_destroy(). - * - * When bpf_ma == false, the kmalloc and kfree are used. - */ struct bpf_map * bpf_local_storage_map_alloc(union bpf_attr *attr, struct bpf_local_storage_cache *cache, - bool bpf_ma) + bool use_kmalloc_nolock) { struct bpf_local_storage_map *smap; unsigned int i; @@ -829,20 +744,9 @@ bpf_local_storage_map_alloc(union bpf_attr *attr, /* In PREEMPT_RT, kmalloc(GFP_ATOMIC) is still not safe in non * preemptible context. Thus, enforce all storages to use - * bpf_mem_alloc when CONFIG_PREEMPT_RT is enabled. + * kmalloc_nolock() when CONFIG_PREEMPT_RT is enabled. */ - smap->bpf_ma = IS_ENABLED(CONFIG_PREEMPT_RT) ? true : bpf_ma; - if (smap->bpf_ma) { - err = bpf_mem_alloc_init(&smap->selem_ma, smap->elem_size, false); - if (err) - goto free_smap; - - err = bpf_mem_alloc_init(&smap->storage_ma, sizeof(struct bpf_local_storage), false); - if (err) { - bpf_mem_alloc_destroy(&smap->selem_ma); - goto free_smap; - } - } + smap->use_kmalloc_nolock = IS_ENABLED(CONFIG_PREEMPT_RT) ? 
true : use_kmalloc_nolock; smap->cache_idx = bpf_local_storage_cache_idx_get(cache); return &smap->map; @@ -912,12 +816,9 @@ void bpf_local_storage_map_free(struct bpf_map *map, */ synchronize_rcu(); - if (smap->bpf_ma) { + if (smap->use_kmalloc_nolock) { rcu_barrier_tasks_trace(); - if (!rcu_trace_implies_rcu_gp()) - rcu_barrier(); - bpf_mem_alloc_destroy(&smap->selem_ma); - bpf_mem_alloc_destroy(&smap->storage_ma); + rcu_barrier(); } kvfree(smap->buckets); bpf_map_area_free(smap); diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 0a59df1c550a08..7cb6e8d4282cb1 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -51,6 +51,7 @@ BTF_ID(func, bpf_lsm_key_getsecurity) BTF_ID(func, bpf_lsm_audit_rule_match) #endif BTF_ID(func, bpf_lsm_ismaclabel) +BTF_ID(func, bpf_lsm_file_alloc_security) BTF_SET_END(bpf_lsm_disabled_hooks) /* List of LSM hooks that should operate on 'current' cgroup regardless diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d595fe512498cc..c8ae6ab3165100 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1450,6 +1450,23 @@ void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other) bpf_prog_clone_free(fp_other); } +static void adjust_insn_arrays(struct bpf_prog *prog, u32 off, u32 len) +{ +#ifdef CONFIG_BPF_SYSCALL + struct bpf_map *map; + int i; + + if (len <= 1) + return; + + for (i = 0; i < prog->aux->used_map_cnt; i++) { + map = prog->aux->used_maps[i]; + if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) + bpf_insn_array_adjust(map, off, len); + } +#endif +} + struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) { struct bpf_insn insn_buff[16], aux[2]; @@ -1505,6 +1522,9 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) clone = tmp; insn_delta = rewritten - 1; + /* Instructions arrays must be updated using absolute xlated offsets */ + adjust_insn_arrays(clone, prog->aux->subprog_start + i, rewritten); + /* Walk new program and skip insns we just inserted. 
*/ insn = clone->insnsi + i + insn_delta; insn_cnt += insn_delta; @@ -1688,6 +1708,7 @@ bool bpf_opcode_in_insntable(u8 code) [BPF_LD | BPF_IND | BPF_B] = true, [BPF_LD | BPF_IND | BPF_H] = true, [BPF_LD | BPF_IND | BPF_W] = true, + [BPF_JMP | BPF_JA | BPF_X] = true, [BPF_JMP | BPF_JCOND] = true, }; #undef BPF_INSN_3_TBL @@ -3129,8 +3150,9 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, return -EFAULT; } -int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, - void *addr1, void *addr2) +int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t, + enum bpf_text_poke_type new_t, void *old_addr, + void *new_addr) { return -ENOTSUPP; } diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 20883c6b1546c7..f8a3c7eb451e4a 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -358,6 +358,9 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, } else if (insn->code == (BPF_JMP | BPF_JA)) { verbose(cbs->private_data, "(%02x) goto pc%+d\n", insn->code, insn->off); + } else if (insn->code == (BPF_JMP | BPF_JA | BPF_X)) { + verbose(cbs->private_data, "(%02x) gotox r%d\n", + insn->code, insn->dst_reg); } else if (insn->code == (BPF_JMP | BPF_JCOND) && insn->src_reg == BPF_MAY_GOTO) { verbose(cbs->private_data, "(%02x) may_goto pc%+d\n", diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index c2fcd0cd51e51b..c8a9b27f8663bb 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -215,19 +215,6 @@ static bool htab_has_extra_elems(struct bpf_htab *htab) return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab); } -static void htab_free_internal_structs(struct bpf_htab *htab, struct htab_elem *elem) -{ - if (btf_record_has_field(htab->map.record, BPF_TIMER)) - bpf_obj_free_timer(htab->map.record, - htab_elem_value(elem, htab->map.key_size)); - if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) - bpf_obj_free_workqueue(htab->map.record, - htab_elem_value(elem, htab->map.key_size)); - if (btf_record_has_field(htab->map.record, BPF_TASK_WORK)) - bpf_obj_free_task_work(htab->map.record, - htab_elem_value(elem, htab->map.key_size)); -} - static void htab_free_prealloced_internal_structs(struct bpf_htab *htab) { u32 num_entries = htab->map.max_entries; @@ -240,7 +227,8 @@ static void htab_free_prealloced_internal_structs(struct bpf_htab *htab) struct htab_elem *elem; elem = get_htab_elem(htab, i); - htab_free_internal_structs(htab, elem); + bpf_map_free_internal_structs(&htab->map, + htab_elem_value(elem, htab->map.key_size)); cond_resched(); } } @@ -669,8 +657,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) struct htab_elem *l; u32 hash, key_size; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); key_size = map->key_size; @@ -947,15 +934,21 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, void *value, bool onallcpus) { + void *ptr; + if (!onallcpus) { /* copy true value_size bytes */ - copy_map_value(&htab->map, this_cpu_ptr(pptr), value); + ptr = this_cpu_ptr(pptr); + copy_map_value(&htab->map, ptr, value); + bpf_obj_free_fields(htab->map.record, ptr); } else { u32 size = round_up(htab->map.value_size, 8); int off = 0, cpu; for_each_possible_cpu(cpu) { - copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value + off); + ptr = per_cpu_ptr(pptr, cpu); + copy_map_value_long(&htab->map, ptr, value + 
off); + bpf_obj_free_fields(htab->map.record, ptr); off += size; } } @@ -1098,8 +1091,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); key_size = map->key_size; @@ -1206,8 +1198,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); key_size = map->key_size; @@ -1275,8 +1266,7 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); key_size = map->key_size; @@ -1338,8 +1328,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); key_size = map->key_size; @@ -1416,8 +1405,7 @@ static long htab_map_delete_elem(struct bpf_map *map, void *key) u32 hash, key_size; int ret; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); key_size = map->key_size; @@ -1452,8 +1440,7 @@ static long htab_lru_map_delete_elem(struct bpf_map *map, void *key) u32 hash, key_size; int ret; - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); key_size = map->key_size; @@ -1509,8 +1496,9 @@ static void htab_free_malloced_internal_structs(struct bpf_htab *htab) struct htab_elem *l; hlist_nulls_for_each_entry(l, n, head, hash_node) { - /* We only free timer on uref dropping to zero */ - htab_free_internal_structs(htab, l); + /* We only free internal structs on uref dropping to zero */ + bpf_map_free_internal_structs(&htab->map, + htab_elem_value(l, htab->map.key_size)); } cond_resched_rcu(); } @@ -1521,13 +1509,14 @@ static void htab_map_free_internal_structs(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - /* We only free timer and workqueue on uref dropping to zero */ - if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) { - if (!htab_is_prealloc(htab)) - htab_free_malloced_internal_structs(htab); - else - htab_free_prealloced_internal_structs(htab); - } + /* We only free internal structs on uref dropping to zero */ + if (!bpf_map_has_internal_structs(map)) + return; + + if (htab_is_prealloc(htab)) + htab_free_prealloced_internal_structs(htab); + else + htab_free_malloced_internal_structs(htab); } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index e4007fea49091c..07bfa72f964935 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -28,6 +28,7 @@ #include <linux/verification.h> #include <linux/task_work.h> #include <linux/irq_work.h> +#include <linux/buildid.h> #include "../../lib/kstrtox.h" @@ -42,8 +43,7 @@ */ BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + 
WARN_ON_ONCE(!bpf_rcu_lock_held()); return (unsigned long) map->ops->map_lookup_elem(map, key); } @@ -59,8 +59,7 @@ const struct bpf_func_proto bpf_map_lookup_elem_proto = { BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, void *, value, u64, flags) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); return map->ops->map_update_elem(map, key, value, flags); } @@ -77,8 +76,7 @@ const struct bpf_func_proto bpf_map_update_elem_proto = { BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); return map->ops->map_delete_elem(map, key); } @@ -134,8 +132,7 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = { BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && - !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!bpf_rcu_lock_held()); return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu); } @@ -777,9 +774,11 @@ int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs) { int nest_level; + preempt_disable(); nest_level = this_cpu_inc_return(bpf_bprintf_nest_level); if (WARN_ON_ONCE(nest_level > MAX_BPRINTF_NEST_LEVEL)) { this_cpu_dec(bpf_bprintf_nest_level); + preempt_enable(); return -EBUSY; } *bufs = this_cpu_ptr(&bpf_bprintf_bufs[nest_level - 1]); @@ -792,6 +791,7 @@ void bpf_put_buffers(void) if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0)) return; this_cpu_dec(bpf_bprintf_nest_level); + preempt_enable(); } void bpf_bprintf_cleanup(struct bpf_bprintf_data *data) @@ -1660,6 +1660,13 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = { .arg2_btf_id = BPF_PTR_POISON, }; +struct bpf_dynptr_file_impl { + struct freader freader; + /* 64 bit offset and size overriding 32 bit ones in bpf_dynptr_kern */ + u64 offset; + u64 size; +}; + /* Since the upper 8 bits of dynptr->size is reserved, the * maximum supported size is 2^24 - 1. */ @@ -1688,23 +1695,65 @@ static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *pt return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT; } -u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr) +u64 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr) { + if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) { + struct bpf_dynptr_file_impl *df = ptr->data; + + return df->size; + } + return ptr->size & DYNPTR_SIZE_MASK; } -static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u32 new_size) +static void bpf_dynptr_advance_offset(struct bpf_dynptr_kern *ptr, u64 off) +{ + if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) { + struct bpf_dynptr_file_impl *df = ptr->data; + + df->offset += off; + return; + } + ptr->offset += off; +} + +static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u64 new_size) { u32 metadata = ptr->size & ~DYNPTR_SIZE_MASK; - ptr->size = new_size | metadata; + if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) { + struct bpf_dynptr_file_impl *df = ptr->data; + + df->size = new_size; + return; + } + ptr->size = (u32)new_size | metadata; } -int bpf_dynptr_check_size(u32 size) +int bpf_dynptr_check_size(u64 size) { return size > DYNPTR_MAX_SIZE ? 
-E2BIG : 0; } +static int bpf_file_fetch_bytes(struct bpf_dynptr_file_impl *df, u64 offset, void *buf, u64 len) +{ + const void *ptr; + + if (!buf) + return -EINVAL; + + df->freader.buf = buf; + df->freader.buf_sz = len; + ptr = freader_fetch(&df->freader, offset + df->offset, len); + if (!ptr) + return df->freader.err; + + if (ptr != buf) /* Force copying into the buffer */ + memcpy(buf, ptr, len); + + return 0; +} + void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size) { @@ -1719,7 +1768,7 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) memset(ptr, 0, sizeof(*ptr)); } -BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr) +BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u64, size, u64, flags, struct bpf_dynptr_kern *, ptr) { int err; @@ -1754,8 +1803,8 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE, }; -static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *src, - u32 offset, u64 flags) +static int __bpf_dynptr_read(void *dst, u64 len, const struct bpf_dynptr_kern *src, + u64 offset, u64 flags) { enum bpf_dynptr_type type; int err; @@ -1785,14 +1834,16 @@ static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *s case BPF_DYNPTR_TYPE_SKB_META: memmove(dst, bpf_skb_meta_pointer(src->data, src->offset + offset), len); return 0; + case BPF_DYNPTR_TYPE_FILE: + return bpf_file_fetch_bytes(src->data, offset, dst, len); default: WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type); return -EFAULT; } } -BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, - u32, offset, u64, flags) +BPF_CALL_5(bpf_dynptr_read, void *, dst, u64, len, const struct bpf_dynptr_kern *, src, + u64, offset, u64, flags) { return __bpf_dynptr_read(dst, len, src, offset, flags); } @@ -1808,8 +1859,8 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = { .arg5_type = ARG_ANYTHING, }; -int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src, - u32 len, u64 flags) +int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u64 offset, void *src, + u64 len, u64 flags) { enum bpf_dynptr_type type; int err; @@ -1852,8 +1903,8 @@ int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src, } } -BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src, - u32, len, u64, flags) +BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u64, offset, void *, src, + u64, len, u64, flags) { return __bpf_dynptr_write(dst, offset, src, len, flags); } @@ -1869,7 +1920,7 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = { .arg5_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) +BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u64, offset, u64, len) { enum bpf_dynptr_type type; int err; @@ -2684,12 +2735,12 @@ __bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid) * provided buffer, with its contents containing the data, if unable to obtain * direct pointer) */ -__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset, - void *buffer__opt, u32 buffer__szk) +__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset, + void *buffer__opt, u64 buffer__szk) { const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; enum 
bpf_dynptr_type type; - u32 len = buffer__szk; + u64 len = buffer__szk; int err; if (!ptr->data) @@ -2723,6 +2774,9 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset, } case BPF_DYNPTR_TYPE_SKB_META: return bpf_skb_meta_pointer(ptr->data, ptr->offset + offset); + case BPF_DYNPTR_TYPE_FILE: + err = bpf_file_fetch_bytes(ptr->data, offset, buffer__opt, buffer__szk); + return err ? NULL : buffer__opt; default: WARN_ONCE(true, "unknown dynptr type %d\n", type); return NULL; @@ -2771,8 +2825,8 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset, * provided buffer, with its contents containing the data, if unable to obtain * direct pointer) */ -__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset, - void *buffer__opt, u32 buffer__szk) +__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset, + void *buffer__opt, u64 buffer__szk) { const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; @@ -2804,10 +2858,10 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset, return bpf_dynptr_slice(p, offset, buffer__opt, buffer__szk); } -__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u32 start, u32 end) +__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u64 start, u64 end) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; - u32 size; + u64 size; if (!ptr->data || start > end) return -EINVAL; @@ -2817,7 +2871,7 @@ __bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u32 start, u32 end if (start > size || end > size) return -ERANGE; - ptr->offset += start; + bpf_dynptr_advance_offset(ptr, start); bpf_dynptr_set_size(ptr, end - start); return 0; @@ -2840,7 +2894,7 @@ __bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p) return __bpf_dynptr_is_rdonly(ptr); } -__bpf_kfunc __u32 bpf_dynptr_size(const struct bpf_dynptr *p) +__bpf_kfunc u64 bpf_dynptr_size(const struct bpf_dynptr *p) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; @@ -2877,14 +2931,14 @@ __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p, * Copies data from source dynptr to destination dynptr. * Returns 0 on success; negative error, otherwise. */ -__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off, - struct bpf_dynptr *src_ptr, u32 src_off, u32 size) +__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u64 dst_off, + struct bpf_dynptr *src_ptr, u64 src_off, u64 size) { struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr; struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr; void *src_slice, *dst_slice; char buf[256]; - u32 off; + u64 off; src_slice = bpf_dynptr_slice(src_ptr, src_off, NULL, size); dst_slice = bpf_dynptr_slice_rdwr(dst_ptr, dst_off, NULL, size); @@ -2906,7 +2960,7 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off, off = 0; while (off < size) { - u32 chunk_sz = min_t(u32, sizeof(buf), size - off); + u64 chunk_sz = min_t(u64, sizeof(buf), size - off); int err; err = __bpf_dynptr_read(buf, chunk_sz, src, src_off + off, 0); @@ -2932,10 +2986,10 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off, * at @offset with the constant byte @val. * Returns 0 on success; negative error, otherwise. 
*/ - __bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u32 offset, u32 size, u8 val) - { +__bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u64 offset, u64 size, u8 val) +{ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; - u32 chunk_sz, write_off; + u64 chunk_sz, write_off; char buf[256]; void* slice; int err; @@ -2954,11 +3008,11 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off, return err; /* Non-linear data under the dynptr, write from a local buffer */ - chunk_sz = min_t(u32, sizeof(buf), size); + chunk_sz = min_t(u64, sizeof(buf), size); memset(buf, val, chunk_sz); for (write_off = 0; write_off < size; write_off += chunk_sz) { - chunk_sz = min_t(u32, sizeof(buf), size - write_off); + chunk_sz = min_t(u64, sizeof(buf), size - write_off); err = __bpf_dynptr_write(ptr, offset + write_off, buf, chunk_sz, 0); if (err) return err; @@ -3678,34 +3732,21 @@ err_out: return -EFAULT; } -/** - * bpf_strnstr - Find the first substring in a length-limited string - * @s1__ign: The string to be searched - * @s2__ign: The string to search for - * @len: the maximum number of characters to search - * - * Return: - * * >=0 - Index of the first character of the first occurrence of @s2__ign - * within the first @len characters of @s1__ign - * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign - * * %-EFAULT - Cannot read one of the strings - * * %-E2BIG - One of the strings is too large - * * %-ERANGE - One of the strings is outside of kernel address space - */ -__bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len) +static int __bpf_strnstr(const char *s1, const char *s2, size_t len, + bool ignore_case) { char c1, c2; int i, j; - if (!copy_from_kernel_nofault_allowed(s1__ign, 1) || - !copy_from_kernel_nofault_allowed(s2__ign, 1)) { + if (!copy_from_kernel_nofault_allowed(s1, 1) || + !copy_from_kernel_nofault_allowed(s2, 1)) { return -ERANGE; } guard(pagefault)(); for (i = 0; i < XATTR_SIZE_MAX; i++) { for (j = 0; i + j <= len && j < XATTR_SIZE_MAX; j++) { - __get_kernel_nofault(&c2, s2__ign + j, char, err_out); + __get_kernel_nofault(&c2, s2 + j, char, err_out); if (c2 == '\0') return i; /* @@ -3715,7 +3756,13 @@ __bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len */ if (i + j == len) break; - __get_kernel_nofault(&c1, s1__ign + j, char, err_out); + __get_kernel_nofault(&c1, s1 + j, char, err_out); + + if (ignore_case) { + c1 = tolower(c1); + c2 = tolower(c2); + } + if (c1 == '\0') return -ENOENT; if (c1 != c2) @@ -3725,7 +3772,7 @@ __bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len return -E2BIG; if (i + j == len) return -ENOENT; - s1__ign++; + s1++; } return -E2BIG; err_out: @@ -3747,8 +3794,69 @@ err_out: */ __bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign) { - return bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX); + return __bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX, false); +} + +/** + * bpf_strcasestr - Find the first substring in a string, ignoring the case of + * the characters + * @s1__ign: The string to be searched + * @s2__ign: The string to search for + * + * Return: + * * >=0 - Index of the first character of the first occurrence of @s2__ign + * within @s1__ign + * * %-ENOENT - @s2__ign is not a substring of @s1__ign + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of the strings is too large + * * %-ERANGE - One of the strings is outside of kernel address space + */ +__bpf_kfunc int 
bpf_strcasestr(const char *s1__ign, const char *s2__ign) +{ + return __bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX, true); +} + +/** + * bpf_strnstr - Find the first substring in a length-limited string + * @s1__ign: The string to be searched + * @s2__ign: The string to search for + * @len: the maximum number of characters to search + * + * Return: + * * >=0 - Index of the first character of the first occurrence of @s2__ign + * within the first @len characters of @s1__ign + * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of the strings is too large + * * %-ERANGE - One of the strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, + size_t len) +{ + return __bpf_strnstr(s1__ign, s2__ign, len, false); +} + +/** + * bpf_strncasestr - Find the first substring in a length-limited string, + * ignoring the case of the characters + * @s1__ign: The string to be searched + * @s2__ign: The string to search for + * @len: the maximum number of characters to search + * + * Return: + * * >=0 - Index of the first character of the first occurrence of @s2__ign + * within the first @len characters of @s1__ign + * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of the strings is too large + * * %-ERANGE - One of the strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strncasestr(const char *s1__ign, const char *s2__ign, + size_t len) +{ + return __bpf_strnstr(s1__ign, s2__ign, len, true); } + #ifdef CONFIG_KEYS /** * bpf_lookup_user_key - lookup a key by its serial @@ -4206,6 +4314,54 @@ __bpf_kfunc int bpf_task_work_schedule_resume_impl(struct task_struct *task, return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME); } +static int make_file_dynptr(struct file *file, u32 flags, bool may_sleep, + struct bpf_dynptr_kern *ptr) +{ + struct bpf_dynptr_file_impl *state; + + /* flags is currently unsupported */ + if (flags) { + bpf_dynptr_set_null(ptr); + return -EINVAL; + } + + state = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_dynptr_file_impl)); + if (!state) { + bpf_dynptr_set_null(ptr); + return -ENOMEM; + } + state->offset = 0; + state->size = U64_MAX; /* Don't restrict size, as file may change anyways */ + freader_init_from_file(&state->freader, NULL, 0, file, may_sleep); + bpf_dynptr_init(ptr, state, BPF_DYNPTR_TYPE_FILE, 0, 0); + bpf_dynptr_set_rdonly(ptr); + return 0; +} + +__bpf_kfunc int bpf_dynptr_from_file(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit) +{ + return make_file_dynptr(file, flags, false, (struct bpf_dynptr_kern *)ptr__uninit); +} + +int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit) +{ + return make_file_dynptr(file, flags, true, (struct bpf_dynptr_kern *)ptr__uninit); +} + +__bpf_kfunc int bpf_dynptr_file_discard(struct bpf_dynptr *dynptr) +{ + struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)dynptr; + struct bpf_dynptr_file_impl *df = ptr->data; + + if (!df) + return 0; + + freader_cleanup(&df->freader); + bpf_mem_free(&bpf_global_ma, df); + bpf_dynptr_set_null(ptr); + return 0; +} + __bpf_kfunc_end_defs(); static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work) @@ -4376,13 +4532,17 @@ BTF_ID_FLAGS(func, bpf_strnlen); BTF_ID_FLAGS(func, bpf_strspn); BTF_ID_FLAGS(func, bpf_strcspn); BTF_ID_FLAGS(func, 
bpf_strstr); +BTF_ID_FLAGS(func, bpf_strcasestr); BTF_ID_FLAGS(func, bpf_strnstr); +BTF_ID_FLAGS(func, bpf_strncasestr); #if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS) BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif BTF_ID_FLAGS(func, bpf_stream_vprintk_impl, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_dynptr_from_file, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_dynptr_file_discard) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { @@ -4423,7 +4583,7 @@ late_initcall(kfunc_init); /* Get a pointer to dynptr data up to len bytes for read only access. If * the dynptr doesn't have continuous data up to len bytes, return NULL. */ -const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len) +const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u64 len) { const struct bpf_dynptr *p = (struct bpf_dynptr *)ptr; @@ -4434,9 +4594,19 @@ const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len) * the dynptr doesn't have continuous data up to len bytes, or the dynptr * is read only, return NULL. */ -void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len) +void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u64 len) { if (__bpf_dynptr_is_rdonly(ptr)) return NULL; return (void *)__bpf_dynptr_data(ptr, len); } + +void bpf_map_free_internal_structs(struct bpf_map *map, void *val) +{ + if (btf_record_has_field(map->record, BPF_TIMER)) + bpf_obj_free_timer(map->record, val); + if (btf_record_has_field(map->record, BPF_WORKQUEUE)) + bpf_obj_free_workqueue(map->record, val); + if (btf_record_has_field(map->record, BPF_TASK_WORK)) + bpf_obj_free_task_work(map->record, val); +} diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c index 1e6538f59a7860..60db5d655495b5 100644 --- a/kernel/bpf/liveness.c +++ b/kernel/bpf/liveness.c @@ -34,7 +34,7 @@ * - read and write marks propagation. 
* - The propagation phase is a textbook live variable data flow analysis: * - * state[cc, i].live_after = U [state[cc, s].live_before for s in insn_successors(i)] + * state[cc, i].live_after = U [state[cc, s].live_before for s in bpf_insn_successors(i)] * state[cc, i].live_before = * (state[cc, i].live_after / state[cc, i].must_write) U state[i].may_read * @@ -54,7 +54,7 @@ * The equation for "must_write_acc" propagation looks as follows: * * state[cc, i].must_write_acc = - * ∩ [state[cc, s].must_write_acc for s in insn_successors(i)] + * ∩ [state[cc, s].must_write_acc for s in bpf_insn_successors(i)] * U state[cc, i].must_write * * (An intersection of all "must_write_acc" for instruction successors @@ -447,7 +447,12 @@ int bpf_jmp_offset(struct bpf_insn *insn) __diag_push(); __diag_ignore_all("-Woverride-init", "Allow field initialization overrides for opcode_info_tbl"); -inline int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]) +/* + * Returns an array of instructions succ, with succ->items[0], ..., + * succ->items[n-1] with successor instructions, where n=succ->cnt + */ +inline struct bpf_iarray * +bpf_insn_successors(struct bpf_verifier_env *env, u32 idx) { static const struct opcode_info { bool can_jump; @@ -474,19 +479,29 @@ inline int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]) _J(BPF_JSET, {.can_jump = true, .can_fallthrough = true}), #undef _J }; + struct bpf_prog *prog = env->prog; struct bpf_insn *insn = &prog->insnsi[idx]; const struct opcode_info *opcode_info; - int i = 0, insn_sz; + struct bpf_iarray *succ, *jt; + int insn_sz; + + jt = env->insn_aux_data[idx].jt; + if (unlikely(jt)) + return jt; + + /* pre-allocated array of size up to 2; reset cnt, as it may have been used already */ + succ = env->succ; + succ->cnt = 0; opcode_info = &opcode_info_tbl[BPF_CLASS(insn->code) | BPF_OP(insn->code)]; insn_sz = bpf_is_ldimm64(insn) ? 2 : 1; if (opcode_info->can_fallthrough) - succ[i++] = idx + insn_sz; + succ->items[succ->cnt++] = idx + insn_sz; if (opcode_info->can_jump) - succ[i++] = idx + bpf_jmp_offset(insn) + 1; + succ->items[succ->cnt++] = idx + bpf_jmp_offset(insn) + 1; - return i; + return succ; } __diag_pop(); @@ -524,6 +539,8 @@ static int propagate_to_outer_instance(struct bpf_verifier_env *env, this_subprog_start = callchain_subprog_start(callchain); outer_instance = get_outer_instance(env, instance); + if (IS_ERR(outer_instance)) + return PTR_ERR(outer_instance); callsite = callchain->callsites[callchain->curframe - 1]; reset_stack_write_marks(env, outer_instance, callsite); @@ -546,11 +563,12 @@ static inline bool update_insn(struct bpf_verifier_env *env, struct bpf_insn_aux_data *aux = env->insn_aux_data; u64 new_before, new_after, must_write_acc; struct per_frame_masks *insn, *succ_insn; - u32 succ_num, s, succ[2]; + struct bpf_iarray *succ; + u32 s; bool changed; - succ_num = bpf_insn_successors(env->prog, insn_idx, succ); - if (unlikely(succ_num == 0)) + succ = bpf_insn_successors(env, insn_idx); + if (succ->cnt == 0) return false; changed = false; @@ -562,8 +580,8 @@ static inline bool update_insn(struct bpf_verifier_env *env, * of successors plus all "must_write" slots of instruction itself. 
*/ must_write_acc = U64_MAX; - for (s = 0; s < succ_num; ++s) { - succ_insn = get_frame_masks(instance, frame, succ[s]); + for (s = 0; s < succ->cnt; ++s) { + succ_insn = get_frame_masks(instance, frame, succ->items[s]); new_after |= succ_insn->live_before; must_write_acc &= succ_insn->must_write_acc; } diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index f50533169cc34e..a0c3b35de2ce65 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -461,6 +461,7 @@ const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type) [PTR_TO_ARENA] = "arena", [PTR_TO_BUF] = "buf", [PTR_TO_FUNC] = "func", + [PTR_TO_INSN] = "insn", [PTR_TO_MAP_KEY] = "map_key", [CONST_PTR_TO_DYNPTR] = "dynptr_ptr", }; @@ -500,6 +501,8 @@ const char *dynptr_type_str(enum bpf_dynptr_type type) return "xdp"; case BPF_DYNPTR_TYPE_SKB_META: return "skb_meta"; + case BPF_DYNPTR_TYPE_FILE: + return "file"; case BPF_DYNPTR_TYPE_INVALID: return "<invalid>"; default: diff --git a/kernel/bpf/range_tree.c b/kernel/bpf/range_tree.c index 37b80a23ae1aed..99c63d982c5d67 100644 --- a/kernel/bpf/range_tree.c +++ b/kernel/bpf/range_tree.c @@ -2,7 +2,6 @@ /* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ #include <linux/interval_tree_generic.h> #include <linux/slab.h> -#include <linux/bpf_mem_alloc.h> #include <linux/bpf.h> #include "range_tree.h" @@ -21,7 +20,7 @@ * in commit 6772fcc8890a ("xfs: convert xbitmap to interval tree"). * * The implementation relies on external lock to protect rbtree-s. - * The alloc/free of range_node-s is done via bpf_mem_alloc. + * The alloc/free of range_node-s is done via kmalloc_nolock(). * * bpf arena is using range_tree to represent unallocated slots. * At init time: @@ -150,9 +149,7 @@ int range_tree_clear(struct range_tree *rt, u32 start, u32 len) range_it_insert(rn, rt); /* Add a range */ - migrate_disable(); - new_rn = bpf_mem_alloc(&bpf_global_ma, sizeof(struct range_node)); - migrate_enable(); + new_rn = kmalloc_nolock(sizeof(struct range_node), 0, NUMA_NO_NODE); if (!new_rn) return -ENOMEM; new_rn->rn_start = last + 1; @@ -172,9 +169,7 @@ int range_tree_clear(struct range_tree *rt, u32 start, u32 len) } else { /* in the middle of the clearing range */ range_it_remove(rn, rt); - migrate_disable(); - bpf_mem_free(&bpf_global_ma, rn); - migrate_enable(); + kfree_nolock(rn); } } return 0; @@ -227,9 +222,7 @@ int range_tree_set(struct range_tree *rt, u32 start, u32 len) range_it_remove(right, rt); left->rn_last = right->rn_last; range_it_insert(left, rt); - migrate_disable(); - bpf_mem_free(&bpf_global_ma, right); - migrate_enable(); + kfree_nolock(right); } else if (left) { /* Combine with the left range */ range_it_remove(left, rt); @@ -241,9 +234,7 @@ int range_tree_set(struct range_tree *rt, u32 start, u32 len) right->rn_start = start; range_it_insert(right, rt); } else { - migrate_disable(); - left = bpf_mem_alloc(&bpf_global_ma, sizeof(struct range_node)); - migrate_enable(); + left = kmalloc_nolock(sizeof(struct range_node), 0, NUMA_NO_NODE); if (!left) return -ENOMEM; left->rn_start = start; @@ -259,7 +250,7 @@ void range_tree_destroy(struct range_tree *rt) while ((rn = range_it_iter_first(rt, 0, -1U))) { range_it_remove(rn, rt); - bpf_mem_free(&bpf_global_ma, rn); + kfree_nolock(rn); } } diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index d706c4b7f532d4..f6a075ffac6372 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -13,7 +13,7 @@ #include <linux/btf_ids.h> #include <asm/rqspinlock.h> -#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE) 
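The ringbuf.c changes that start here introduce an overwrite mode for the BPF ring buffer, selected with the new BPF_F_RB_OVERWRITE map flag added just below. A hedged BPF-side sketch of opting in, assuming the flag is visible through the UAPI headers; the reserve/submit flow itself is the usual libbpf one:

#include "vmlinux.h"		/* hypothetical; any header providing the kernel types */
#include <bpf/bpf_helpers.h>

struct event {
	__u32 pid;
};

/* Ring buffer in overwrite mode: once full, older committed records are
 * reclaimed (overwrite_pos advances) instead of the reservation failing.
 * Reservations can still fail while uncommitted records span the buffer.
 */
struct {
	__uint(type, BPF_MAP_TYPE_RINGBUF);
	__uint(max_entries, 256 * 1024);
	__uint(map_flags, BPF_F_RB_OVERWRITE);
} events SEC(".maps");

SEC("tracepoint/syscalls/sys_enter_execve")
int trace_exec(void *ctx)
{
	struct event *e;

	e = bpf_ringbuf_reserve(&events, sizeof(*e), 0);
	if (!e)
		return 0;
	e->pid = bpf_get_current_pid_tgid() >> 32;
	bpf_ringbuf_submit(e, 0);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";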
+#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE | BPF_F_RB_OVERWRITE) /* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */ #define RINGBUF_PGOFF \ @@ -30,6 +30,7 @@ struct bpf_ringbuf { u64 mask; struct page **pages; int nr_pages; + bool overwrite_mode; rqspinlock_t spinlock ____cacheline_aligned_in_smp; /* For user-space producer ring buffers, an atomic_t busy bit is used * to synchronize access to the ring buffers in the kernel, rather than @@ -73,6 +74,7 @@ struct bpf_ringbuf { unsigned long consumer_pos __aligned(PAGE_SIZE); unsigned long producer_pos __aligned(PAGE_SIZE); unsigned long pending_pos; + unsigned long overwrite_pos; /* position after the last overwritten record */ char data[] __aligned(PAGE_SIZE); }; @@ -166,7 +168,7 @@ static void bpf_ringbuf_notify(struct irq_work *work) * considering that the maximum value of data_sz is (4GB - 1), there * will be no overflow, so just note the size limit in the comments. */ -static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) +static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node, bool overwrite_mode) { struct bpf_ringbuf *rb; @@ -183,17 +185,25 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) rb->consumer_pos = 0; rb->producer_pos = 0; rb->pending_pos = 0; + rb->overwrite_mode = overwrite_mode; return rb; } static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) { + bool overwrite_mode = false; struct bpf_ringbuf_map *rb_map; if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); + if (attr->map_flags & BPF_F_RB_OVERWRITE) { + if (attr->map_type != BPF_MAP_TYPE_RINGBUF) + return ERR_PTR(-EINVAL); + overwrite_mode = true; + } + if (attr->key_size || attr->value_size || !is_power_of_2(attr->max_entries) || !PAGE_ALIGNED(attr->max_entries)) @@ -205,7 +215,7 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) bpf_map_init_from_attr(&rb_map->map, attr); - rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node); + rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node, overwrite_mode); if (!rb_map->rb) { bpf_map_area_free(rb_map); return ERR_PTR(-ENOMEM); @@ -295,13 +305,26 @@ static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF); } +/* + * Return an estimate of the available data in the ring buffer. + * Note: the returned value can exceed the actual ring buffer size because the + * function is not synchronized with the producer. The producer acquires the + * ring buffer's spinlock, but this function does not. 
+ */ static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb) { - unsigned long cons_pos, prod_pos; + unsigned long cons_pos, prod_pos, over_pos; cons_pos = smp_load_acquire(&rb->consumer_pos); - prod_pos = smp_load_acquire(&rb->producer_pos); - return prod_pos - cons_pos; + + if (unlikely(rb->overwrite_mode)) { + over_pos = smp_load_acquire(&rb->overwrite_pos); + prod_pos = smp_load_acquire(&rb->producer_pos); + return prod_pos - max(cons_pos, over_pos); + } else { + prod_pos = smp_load_acquire(&rb->producer_pos); + return prod_pos - cons_pos; + } } static u32 ringbuf_total_data_sz(const struct bpf_ringbuf *rb) @@ -404,11 +427,43 @@ bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr) return (void*)((addr & PAGE_MASK) - off); } +static bool bpf_ringbuf_has_space(const struct bpf_ringbuf *rb, + unsigned long new_prod_pos, + unsigned long cons_pos, + unsigned long pend_pos) +{ + /* + * No space if oldest not yet committed record until the newest + * record span more than (ringbuf_size - 1). + */ + if (new_prod_pos - pend_pos > rb->mask) + return false; + + /* Ok, we have space in overwrite mode */ + if (unlikely(rb->overwrite_mode)) + return true; + + /* + * No space if producer position advances more than (ringbuf_size - 1) + * ahead of consumer position when not in overwrite mode. + */ + if (new_prod_pos - cons_pos > rb->mask) + return false; + + return true; +} + +static u32 bpf_ringbuf_round_up_hdr_len(u32 hdr_len) +{ + hdr_len &= ~BPF_RINGBUF_DISCARD_BIT; + return round_up(hdr_len + BPF_RINGBUF_HDR_SZ, 8); +} + static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) { - unsigned long cons_pos, prod_pos, new_prod_pos, pend_pos, flags; + unsigned long cons_pos, prod_pos, new_prod_pos, pend_pos, over_pos, flags; struct bpf_ringbuf_hdr *hdr; - u32 len, pg_off, tmp_size, hdr_len; + u32 len, pg_off, hdr_len; if (unlikely(size > RINGBUF_MAX_RECORD_SZ)) return NULL; @@ -431,24 +486,43 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) hdr_len = READ_ONCE(hdr->len); if (hdr_len & BPF_RINGBUF_BUSY_BIT) break; - tmp_size = hdr_len & ~BPF_RINGBUF_DISCARD_BIT; - tmp_size = round_up(tmp_size + BPF_RINGBUF_HDR_SZ, 8); - pend_pos += tmp_size; + pend_pos += bpf_ringbuf_round_up_hdr_len(hdr_len); } rb->pending_pos = pend_pos; - /* check for out of ringbuf space: - * - by ensuring producer position doesn't advance more than - * (ringbuf_size - 1) ahead - * - by ensuring oldest not yet committed record until newest - * record does not span more than (ringbuf_size - 1) - */ - if (new_prod_pos - cons_pos > rb->mask || - new_prod_pos - pend_pos > rb->mask) { + if (!bpf_ringbuf_has_space(rb, new_prod_pos, cons_pos, pend_pos)) { raw_res_spin_unlock_irqrestore(&rb->spinlock, flags); return NULL; } + /* + * In overwrite mode, advance overwrite_pos when the ring buffer is full. + * The key points are to stay on record boundaries and consume enough records + * to fit the new one. + */ + if (unlikely(rb->overwrite_mode)) { + over_pos = rb->overwrite_pos; + while (new_prod_pos - over_pos > rb->mask) { + hdr = (void *)rb->data + (over_pos & rb->mask); + hdr_len = READ_ONCE(hdr->len); + /* + * The bpf_ringbuf_has_space() check above ensures we won’t + * step over a record currently being worked on by another + * producer. 
+ */ + over_pos += bpf_ringbuf_round_up_hdr_len(hdr_len); + } + /* + * smp_store_release(&rb->producer_pos, new_prod_pos) at + * the end of the function ensures that when consumer sees + * the updated rb->producer_pos, it always sees the updated + * rb->overwrite_pos, so when consumer reads overwrite_pos + * after smp_load_acquire(r->producer_pos), the overwrite_pos + * will always be valid. + */ + WRITE_ONCE(rb->overwrite_pos, over_pos); + } + hdr = (void *)rb->data + (prod_pos & rb->mask); pg_off = bpf_ringbuf_rec_pg_off(rb, hdr); hdr->len = size | BPF_RINGBUF_BUSY_BIT; @@ -578,6 +652,8 @@ BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags) return smp_load_acquire(&rb->consumer_pos); case BPF_RB_PROD_POS: return smp_load_acquire(&rb->producer_pos); + case BPF_RB_OVERWRITE_POS: + return smp_load_acquire(&rb->overwrite_pos); default: return 0; } diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c index a00561b1d3e515..f7d0c8d4644edb 100644 --- a/kernel/bpf/rqspinlock.c +++ b/kernel/bpf/rqspinlock.c @@ -89,15 +89,14 @@ struct rqspinlock_timeout { DEFINE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks); EXPORT_SYMBOL_GPL(rqspinlock_held_locks); -static bool is_lock_released(rqspinlock_t *lock, u32 mask, struct rqspinlock_timeout *ts) +static bool is_lock_released(rqspinlock_t *lock, u32 mask) { if (!(atomic_read_acquire(&lock->val) & (mask))) return true; return false; } -static noinline int check_deadlock_AA(rqspinlock_t *lock, u32 mask, - struct rqspinlock_timeout *ts) +static noinline int check_deadlock_AA(rqspinlock_t *lock) { struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); int cnt = min(RES_NR_HELD, rqh->cnt); @@ -118,8 +117,7 @@ static noinline int check_deadlock_AA(rqspinlock_t *lock, u32 mask, * more locks, which reduce to ABBA). This is not exhaustive, and we rely on * timeouts as the final line of defense. */ -static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask, - struct rqspinlock_timeout *ts) +static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask) { struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); int rqh_cnt = min(RES_NR_HELD, rqh->cnt); @@ -142,7 +140,7 @@ static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask, * Let's ensure to break out of this loop if the lock is available for * us to potentially acquire. 
*/ - if (is_lock_released(lock, mask, ts)) + if (is_lock_released(lock, mask)) return 0; /* @@ -198,33 +196,21 @@ static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask, return 0; } -static noinline int check_deadlock(rqspinlock_t *lock, u32 mask, - struct rqspinlock_timeout *ts) -{ - int ret; - - ret = check_deadlock_AA(lock, mask, ts); - if (ret) - return ret; - ret = check_deadlock_ABBA(lock, mask, ts); - if (ret) - return ret; - - return 0; -} - static noinline int check_timeout(rqspinlock_t *lock, u32 mask, struct rqspinlock_timeout *ts) { - u64 time = ktime_get_mono_fast_ns(); u64 prev = ts->cur; + u64 time; if (!ts->timeout_end) { - ts->cur = time; - ts->timeout_end = time + ts->duration; + if (check_deadlock_AA(lock)) + return -EDEADLK; + ts->cur = ktime_get_mono_fast_ns(); + ts->timeout_end = ts->cur + ts->duration; return 0; } + time = ktime_get_mono_fast_ns(); if (time > ts->timeout_end) return -ETIMEDOUT; @@ -234,7 +220,7 @@ static noinline int check_timeout(rqspinlock_t *lock, u32 mask, */ if (prev + NSEC_PER_MSEC < time) { ts->cur = time; - return check_deadlock(lock, mask, ts); + return check_deadlock_ABBA(lock, mask); } return 0; @@ -278,6 +264,10 @@ int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock) int val, ret = 0; RES_INIT_TIMEOUT(ts); + /* + * The fast path is not invoked for the TAS fallback, so we must grab + * the deadlock detection entry here. + */ grab_held_lock_entry(lock); /* @@ -400,10 +390,7 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val) goto queue; } - /* - * Grab an entry in the held locks array, to enable deadlock detection. - */ - grab_held_lock_entry(lock); + /* Deadlock detection entry already held after failing fast path. */ /* * We're pending, wait for the owner to go away. @@ -450,12 +437,21 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val) * queuing. */ queue: - lockevent_inc(lock_slowpath); /* - * Grab deadlock detection entry for the queue path. + * Do not queue if we're a waiter and someone is attempting this lock on + * the same CPU. In case of NMIs, this prevents long timeouts where we + * interrupt the pending waiter, and the owner, that will eventually + * signal the head of our queue, both of which are logically but not + * physically part of the queue, hence outside the scope of the idx > 0 + * check above for the trylock fallback. */ - grab_held_lock_entry(lock); + if (check_deadlock_AA(lock)) { + ret = -EDEADLK; + goto err_release_entry; + } + lockevent_inc(lock_slowpath); + /* Deadlock detection entry already held after failing fast path. */ node = this_cpu_ptr(&rqnodes[0].mcs); idx = node->count++; tail = encode_tail(smp_processor_id(), idx); @@ -467,19 +463,17 @@ queue: * not be nested NMIs taking spinlocks. That may not be true in * some architectures even though the chance of needing more than * 4 nodes will still be extremely unlikely. When that happens, - * we fall back to spinning on the lock directly without using - * any MCS node. This is not the most elegant solution, but is - * simple enough. + * we fall back to attempting a trylock operation without using + * any MCS node. Unlike qspinlock which cannot fail, we have the + * option of failing the slow path, and under contention, such a + * trylock spinning will likely be treated unfairly due to lack of + * queueing, hence do not spin. 
*/ - if (unlikely(idx >= _Q_MAX_NODES || in_nmi())) { + if (unlikely(idx >= _Q_MAX_NODES || (in_nmi() && idx > 0))) { lockevent_inc(lock_no_node); - RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT); - while (!queued_spin_trylock(lock)) { - if (RES_CHECK_TIMEOUT(ts, ret, ~0u)) { - lockevent_inc(rqspinlock_lock_timeout); - goto err_release_node; - } - cpu_relax(); + if (!queued_spin_trylock(lock)) { + ret = -EDEADLK; + goto err_release_node; } goto release; } @@ -540,7 +534,7 @@ queue: val = arch_mcs_spin_lock_contended(&node->locked); if (val == RES_TIMEOUT_VAL) { - ret = -EDEADLK; + ret = -ETIMEDOUT; goto waitq_timeout; } @@ -575,6 +569,14 @@ queue: val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) || RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_PENDING_MASK)); + /* Disable queue destruction when we detect deadlocks. */ + if (ret == -EDEADLK) { + if (!next) + next = smp_cond_load_relaxed(&node->next, (VAL)); + arch_mcs_spin_unlock_contended(&next->locked); + goto err_release_node; + } + waitq_timeout: if (ret) { /* diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 8f1dacaf01fe28..da3d328f5c15a1 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -42,6 +42,28 @@ static inline int stack_map_data_size(struct bpf_map *map) sizeof(struct bpf_stack_build_id) : sizeof(u64); } +/** + * stack_map_calculate_max_depth - Calculate maximum allowed stack trace depth + * @size: Size of the buffer/map value in bytes + * @elem_size: Size of each stack trace element + * @flags: BPF stack trace flags (BPF_F_USER_STACK, BPF_F_USER_BUILD_ID, ...) + * + * Return: Maximum number of stack trace entries that can be safely stored + */ +static u32 stack_map_calculate_max_depth(u32 size, u32 elem_size, u64 flags) +{ + u32 skip = flags & BPF_F_SKIP_FIELD_MASK; + u32 max_depth; + u32 curr_sysctl_max_stack = READ_ONCE(sysctl_perf_event_max_stack); + + max_depth = size / elem_size; + max_depth += skip; + if (max_depth > curr_sysctl_max_stack) + return curr_sysctl_max_stack; + + return max_depth; +} + static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) { u64 elem_size = sizeof(struct stack_map_bucket) + @@ -229,8 +251,8 @@ static long __bpf_get_stackid(struct bpf_map *map, { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct stack_map_bucket *bucket, *new_bucket, *old_bucket; + u32 hash, id, trace_nr, trace_len, i, max_depth; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; - u32 hash, id, trace_nr, trace_len, i; bool user = flags & BPF_F_USER_STACK; u64 *ips; bool hash_matches; @@ -239,7 +261,8 @@ static long __bpf_get_stackid(struct bpf_map *map, /* skipping more than usable stack trace */ return -EFAULT; - trace_nr = trace->nr - skip; + max_depth = stack_map_calculate_max_depth(map->value_size, stack_map_data_size(map), flags); + trace_nr = min_t(u32, trace->nr - skip, max_depth - skip); trace_len = trace_nr * sizeof(u64); ips = trace->ip + skip; hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0); @@ -300,20 +323,17 @@ static long __bpf_get_stackid(struct bpf_map *map, BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, u64, flags) { - u32 max_depth = map->value_size / stack_map_data_size(map); - u32 skip = flags & BPF_F_SKIP_FIELD_MASK; + u32 elem_size = stack_map_data_size(map); bool user = flags & BPF_F_USER_STACK; struct perf_callchain_entry *trace; bool kernel = !user; + u32 max_depth; if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) return -EINVAL; 
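/*
 * Worked example (not from the patch) of the clamping performed by
 * stack_map_calculate_max_depth() above, assuming the default
 * sysctl_perf_event_max_stack of 127 and a map storing plain u64 ips:
 *
 *	size      = 1024	map value_size in bytes
 *	elem_size = 8		sizeof(u64)
 *	skip      = 3		low bits of flags (BPF_F_SKIP_FIELD_MASK)
 *
 *	max_depth = 1024 / 8 + 3 = 131  ->  clamped to 127
 *
 * The unwinder is asked for at most 127 frames, and __bpf_get_stackid()
 * then stores at most max_depth - skip = 124 of them, which fits in the
 * 128-entry bucket implied by value_size.
 */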
- max_depth += skip; - if (max_depth > sysctl_perf_event_max_stack) - max_depth = sysctl_perf_event_max_stack; - + max_depth = stack_map_calculate_max_depth(map->value_size, elem_size, flags); trace = get_perf_callchain(regs, kernel, user, max_depth, false, false, 0); @@ -371,15 +391,11 @@ BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx, return -EFAULT; nr_kernel = count_kernel_ip(trace); + __u64 nr = trace->nr; /* save original */ if (kernel) { - __u64 nr = trace->nr; - trace->nr = nr_kernel; ret = __bpf_get_stackid(map, trace, flags); - - /* restore nr */ - trace->nr = nr; } else { /* user */ u64 skip = flags & BPF_F_SKIP_FIELD_MASK; @@ -390,6 +406,10 @@ BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx, flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip; ret = __bpf_get_stackid(map, trace, flags); } + + /* restore nr */ + trace->nr = nr; + return ret; } @@ -406,7 +426,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, struct perf_callchain_entry *trace_in, void *buf, u32 size, u64 flags, bool may_fault) { - u32 trace_nr, copy_len, elem_size, num_elem, max_depth; + u32 trace_nr, copy_len, elem_size, max_depth; bool user_build_id = flags & BPF_F_USER_BUILD_ID; bool crosstask = task && task != current; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; @@ -438,21 +458,20 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, goto clear; } - num_elem = size / elem_size; - max_depth = num_elem + skip; - if (sysctl_perf_event_max_stack < max_depth) - max_depth = sysctl_perf_event_max_stack; + max_depth = stack_map_calculate_max_depth(size, elem_size, flags); if (may_fault) rcu_read_lock(); /* need RCU for perf's callchain below */ - if (trace_in) + if (trace_in) { trace = trace_in; - else if (kernel && task) + trace->nr = min_t(u32, trace->nr, max_depth); + } else if (kernel && task) { trace = get_callchain_entry_for_task(task, max_depth); - else + } else { trace = get_perf_callchain(regs, kernel, user, max_depth, crosstask, false, 0); + } if (unlikely(!trace) || trace->nr < skip) { if (may_fault) @@ -461,7 +480,6 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, } trace_nr = trace->nr - skip; - trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem; copy_len = trace_nr * elem_size; ips = trace->ip + skip; diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c index ff16c631951bb6..0b6bc3f303350c 100644 --- a/kernel/bpf/stream.c +++ b/kernel/bpf/stream.c @@ -4,111 +4,10 @@ #include <linux/bpf.h> #include <linux/filter.h> #include <linux/bpf_mem_alloc.h> -#include <linux/percpu.h> -#include <linux/refcount.h> #include <linux/gfp.h> #include <linux/memory.h> -#include <linux/local_lock.h> #include <linux/mutex.h> -/* - * Simple per-CPU NMI-safe bump allocation mechanism, backed by the NMI-safe - * try_alloc_pages()/free_pages_nolock() primitives. We allocate a page and - * stash it in a local per-CPU variable, and bump allocate from the page - * whenever items need to be printed to a stream. Each page holds a global - * atomic refcount in its first 4 bytes, and then records of variable length - * that describe the printed messages. Once the global refcount has dropped to - * zero, it is a signal to free the page back to the kernel's page allocator, - * given all the individual records in it have been consumed. 
- * - * It is possible the same page is used to serve allocations across different - * programs, which may be consumed at different times individually, hence - * maintaining a reference count per-page is critical for correct lifetime - * tracking. - * - * The bpf_stream_page code will be replaced to use kmalloc_nolock() once it - * lands. - */ -struct bpf_stream_page { - refcount_t ref; - u32 consumed; - char buf[]; -}; - -/* Available room to add data to a refcounted page. */ -#define BPF_STREAM_PAGE_SZ (PAGE_SIZE - offsetofend(struct bpf_stream_page, consumed)) - -static DEFINE_PER_CPU(local_trylock_t, stream_local_lock) = INIT_LOCAL_TRYLOCK(stream_local_lock); -static DEFINE_PER_CPU(struct bpf_stream_page *, stream_pcpu_page); - -static bool bpf_stream_page_local_lock(unsigned long *flags) -{ - return local_trylock_irqsave(&stream_local_lock, *flags); -} - -static void bpf_stream_page_local_unlock(unsigned long *flags) -{ - local_unlock_irqrestore(&stream_local_lock, *flags); -} - -static void bpf_stream_page_free(struct bpf_stream_page *stream_page) -{ - struct page *p; - - if (!stream_page) - return; - p = virt_to_page(stream_page); - free_pages_nolock(p, 0); -} - -static void bpf_stream_page_get(struct bpf_stream_page *stream_page) -{ - refcount_inc(&stream_page->ref); -} - -static void bpf_stream_page_put(struct bpf_stream_page *stream_page) -{ - if (refcount_dec_and_test(&stream_page->ref)) - bpf_stream_page_free(stream_page); -} - -static void bpf_stream_page_init(struct bpf_stream_page *stream_page) -{ - refcount_set(&stream_page->ref, 1); - stream_page->consumed = 0; -} - -static struct bpf_stream_page *bpf_stream_page_replace(void) -{ - struct bpf_stream_page *stream_page, *old_stream_page; - struct page *page; - - page = alloc_pages_nolock(/* Don't account */ 0, NUMA_NO_NODE, 0); - if (!page) - return NULL; - stream_page = page_address(page); - bpf_stream_page_init(stream_page); - - old_stream_page = this_cpu_read(stream_pcpu_page); - if (old_stream_page) - bpf_stream_page_put(old_stream_page); - this_cpu_write(stream_pcpu_page, stream_page); - return stream_page; -} - -static int bpf_stream_page_check_room(struct bpf_stream_page *stream_page, int len) -{ - int min = offsetof(struct bpf_stream_elem, str[0]); - int consumed = stream_page->consumed; - int total = BPF_STREAM_PAGE_SZ; - int rem = max(0, total - consumed - min); - - /* Let's give room of at least 8 bytes. */ - WARN_ON_ONCE(rem % 8 != 0); - rem = rem < 8 ? 
0 : rem; - return min(len, rem); -} - static void bpf_stream_elem_init(struct bpf_stream_elem *elem, int len) { init_llist_node(&elem->node); @@ -116,54 +15,12 @@ static void bpf_stream_elem_init(struct bpf_stream_elem *elem, int len) elem->consumed_len = 0; } -static struct bpf_stream_page *bpf_stream_page_from_elem(struct bpf_stream_elem *elem) -{ - unsigned long addr = (unsigned long)elem; - - return (struct bpf_stream_page *)PAGE_ALIGN_DOWN(addr); -} - -static struct bpf_stream_elem *bpf_stream_page_push_elem(struct bpf_stream_page *stream_page, int len) -{ - u32 consumed = stream_page->consumed; - - stream_page->consumed += round_up(offsetof(struct bpf_stream_elem, str[len]), 8); - return (struct bpf_stream_elem *)&stream_page->buf[consumed]; -} - -static struct bpf_stream_elem *bpf_stream_page_reserve_elem(int len) -{ - struct bpf_stream_elem *elem = NULL; - struct bpf_stream_page *page; - int room = 0; - - page = this_cpu_read(stream_pcpu_page); - if (!page) - page = bpf_stream_page_replace(); - if (!page) - return NULL; - - room = bpf_stream_page_check_room(page, len); - if (room != len) - page = bpf_stream_page_replace(); - if (!page) - return NULL; - bpf_stream_page_get(page); - room = bpf_stream_page_check_room(page, len); - WARN_ON_ONCE(room != len); - - elem = bpf_stream_page_push_elem(page, room); - bpf_stream_elem_init(elem, room); - return elem; -} - static struct bpf_stream_elem *bpf_stream_elem_alloc(int len) { const int max_len = ARRAY_SIZE((struct bpf_bprintf_buffers){}.buf); struct bpf_stream_elem *elem; - unsigned long flags; + size_t alloc_size; - BUILD_BUG_ON(max_len > BPF_STREAM_PAGE_SZ); /* * Length denotes the amount of data to be written as part of stream element, * thus includes '\0' byte. We're capped by how much bpf_bprintf_buffers can @@ -172,10 +29,13 @@ static struct bpf_stream_elem *bpf_stream_elem_alloc(int len) if (len < 0 || len > max_len) return NULL; - if (!bpf_stream_page_local_lock(&flags)) + alloc_size = offsetof(struct bpf_stream_elem, str[len]); + elem = kmalloc_nolock(alloc_size, __GFP_ZERO, -1); + if (!elem) return NULL; - elem = bpf_stream_page_reserve_elem(len); - bpf_stream_page_local_unlock(&flags); + + bpf_stream_elem_init(elem, len); + return elem; } @@ -231,10 +91,7 @@ static struct bpf_stream *bpf_stream_get(enum bpf_stream_id stream_id, struct bp static void bpf_stream_free_elem(struct bpf_stream_elem *elem) { - struct bpf_stream_page *p; - - p = bpf_stream_page_from_elem(elem); - bpf_stream_page_put(p); + kfree_nolock(elem); } static void bpf_stream_free_list(struct llist_node *list) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6cde6a46babf0b..7561f66e3331e3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -158,7 +158,7 @@ static void maybe_wait_bpf_programs(struct bpf_map *map) */ if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS || map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) - synchronize_rcu(); + synchronize_rcu_expedited(); } static void unpin_uptr_kaddr(void *kaddr) @@ -1493,6 +1493,7 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_MAP_TYPE_STRUCT_OPS: case BPF_MAP_TYPE_CPUMAP: case BPF_MAP_TYPE_ARENA: + case BPF_MAP_TYPE_INSN_ARRAY: if (!bpf_token_capable(token, CAP_BPF)) goto put_token; break; @@ -1585,7 +1586,8 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) goto free_map; } } else if (attr->excl_prog_hash_size) { - return -EINVAL; + err = -EINVAL; + goto free_map; } err = security_bpf_map_create(map, attr, token, uattr.is_kernel); @@ -1724,9 +1726,6 @@ 
static int map_lookup_elem(union bpf_attr *attr) if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) return -EINVAL; - if (attr->flags & ~BPF_F_LOCK) - return -EINVAL; - CLASS(fd, f)(attr->map_fd); map = __bpf_map_get(f); if (IS_ERR(map)) @@ -1734,9 +1733,9 @@ static int map_lookup_elem(union bpf_attr *attr) if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) return -EPERM; - if ((attr->flags & BPF_F_LOCK) && - !btf_record_has_field(map->record, BPF_SPIN_LOCK)) - return -EINVAL; + err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK); + if (err) + return err; key = __bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) @@ -1799,11 +1798,9 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) goto err_put; } - if ((attr->flags & BPF_F_LOCK) && - !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { - err = -EINVAL; + err = bpf_map_check_op_flags(map, attr->flags, ~0); + if (err) goto err_put; - } key = ___bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { @@ -2007,13 +2004,9 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file, void *key, *value; int err = 0; - if (attr->batch.elem_flags & ~BPF_F_LOCK) - return -EINVAL; - - if ((attr->batch.elem_flags & BPF_F_LOCK) && - !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { - return -EINVAL; - } + err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK); + if (err) + return err; value_size = bpf_map_value_size(map); @@ -2070,12 +2063,9 @@ int generic_map_lookup_batch(struct bpf_map *map, u32 value_size, cp, max_count; int err; - if (attr->batch.elem_flags & ~BPF_F_LOCK) - return -EINVAL; - - if ((attr->batch.elem_flags & BPF_F_LOCK) && - !btf_record_has_field(map->record, BPF_SPIN_LOCK)) - return -EINVAL; + err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK); + if (err) + return err; value_size = bpf_map_value_size(map); @@ -2462,6 +2452,9 @@ void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog) struct bpf_prog_stats *stats; unsigned int flags; + if (unlikely(!prog->stats)) + return; + stats = this_cpu_ptr(prog->stats); flags = u64_stats_update_begin_irqsave(&stats->syncp); u64_stats_inc(&stats->misses); @@ -2853,6 +2846,23 @@ static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr return err; } +static int bpf_prog_mark_insn_arrays_ready(struct bpf_prog *prog) +{ + int err; + int i; + + for (i = 0; i < prog->aux->used_map_cnt; i++) { + if (prog->aux->used_maps[i]->map_type != BPF_MAP_TYPE_INSN_ARRAY) + continue; + + err = bpf_insn_array_ready(prog->aux->used_maps[i]); + if (err) + return err; + } + + return 0; +} + /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD keyring_id @@ -3082,6 +3092,10 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) if (err < 0) goto free_used_maps; + err = bpf_prog_mark_insn_arrays_ready(prog); + if (err < 0) + goto free_used_maps; + err = bpf_prog_alloc_id(prog); if (err) goto free_used_maps; @@ -5034,19 +5048,19 @@ static int bpf_prog_get_info_by_fd(struct file *file, struct bpf_insn *insns_sanitized; bool fault; - if (prog->blinded && !bpf_dump_raw_ok(file->f_cred)) { + if (!prog->blinded || bpf_dump_raw_ok(file->f_cred)) { + insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred); + if (!insns_sanitized) + return -ENOMEM; + uinsns = u64_to_user_ptr(info.xlated_prog_insns); + ulen = min_t(u32, info.xlated_prog_len, ulen); + fault = copy_to_user(uinsns, insns_sanitized, ulen); + kfree(insns_sanitized); + if (fault) + return -EFAULT; + } else { 
info.xlated_prog_insns = 0; - goto done; } - insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred); - if (!insns_sanitized) - return -ENOMEM; - uinsns = u64_to_user_ptr(info.xlated_prog_insns); - ulen = min_t(u32, info.xlated_prog_len, ulen); - fault = copy_to_user(uinsns, insns_sanitized, ulen); - kfree(insns_sanitized); - if (fault) - return -EFAULT; } if (bpf_prog_is_offloaded(prog->aux)) { diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index f2cb0b09709330..976d89011b1570 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -175,23 +175,42 @@ out: return tr; } -static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr) +static int bpf_trampoline_update_fentry(struct bpf_trampoline *tr, u32 orig_flags, + void *old_addr, void *new_addr) { + enum bpf_text_poke_type new_t = BPF_MOD_CALL, old_t = BPF_MOD_CALL; void *ip = tr->func.addr; + + if (!new_addr) + new_t = BPF_MOD_NOP; + else if (bpf_trampoline_use_jmp(tr->flags)) + new_t = BPF_MOD_JUMP; + + if (!old_addr) + old_t = BPF_MOD_NOP; + else if (bpf_trampoline_use_jmp(orig_flags)) + old_t = BPF_MOD_JUMP; + + return bpf_arch_text_poke(ip, old_t, new_t, old_addr, new_addr); +} + +static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags, + void *old_addr) +{ int ret; if (tr->func.ftrace_managed) ret = unregister_ftrace_direct(tr->fops, (long)old_addr, false); else - ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL); + ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, NULL); return ret; } -static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr, +static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags, + void *old_addr, void *new_addr, bool lock_direct_mutex) { - void *ip = tr->func.addr; int ret; if (tr->func.ftrace_managed) { @@ -200,7 +219,8 @@ static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_ad else ret = modify_ftrace_direct_nolock(tr->fops, (long)new_addr); } else { - ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr); + ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, + new_addr); } return ret; } @@ -220,10 +240,12 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) } if (tr->func.ftrace_managed) { - ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1); + ret = ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1); + if (ret) + return ret; ret = register_ftrace_direct(tr->fops, (long)new_addr); } else { - ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr); + ret = bpf_trampoline_update_fentry(tr, 0, NULL, new_addr); } return ret; @@ -334,8 +356,9 @@ static void bpf_tramp_image_put(struct bpf_tramp_image *im) * call_rcu_tasks() is not necessary. 
*/ if (im->ip_after_call) { - int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP, - NULL, im->ip_epilogue); + int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_NOP, + BPF_MOD_JUMP, NULL, + im->ip_epilogue); WARN_ON(err); if (IS_ENABLED(CONFIG_TASKS_RCU)) call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks); @@ -408,7 +431,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut return PTR_ERR(tlinks); if (total == 0) { - err = unregister_fentry(tr, tr->cur_image->image); + err = unregister_fentry(tr, orig_flags, tr->cur_image->image); bpf_tramp_image_put(tr->cur_image); tr->cur_image = NULL; goto out; @@ -432,9 +455,20 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS again: - if ((tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY) && - (tr->flags & BPF_TRAMP_F_CALL_ORIG)) - tr->flags |= BPF_TRAMP_F_ORIG_STACK; + if (tr->flags & BPF_TRAMP_F_CALL_ORIG) { + if (tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY) { + /* The BPF_TRAMP_F_SKIP_FRAME can be cleared in the + * first try, reset it in the second try. + */ + tr->flags |= BPF_TRAMP_F_ORIG_STACK | BPF_TRAMP_F_SKIP_FRAME; + } else if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_JMP)) { + /* Use "jmp" instead of "call" for the trampoline + * in the origin call case, and we don't need to + * skip the frame. + */ + tr->flags &= ~BPF_TRAMP_F_SKIP_FRAME; + } + } #endif size = arch_bpf_trampoline_size(&tr->func.model, tr->flags, @@ -465,10 +499,18 @@ again: if (err) goto out_free; +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP + if (bpf_trampoline_use_jmp(tr->flags)) + tr->fops->flags |= FTRACE_OPS_FL_JMP; + else + tr->fops->flags &= ~FTRACE_OPS_FL_JMP; +#endif + WARN_ON(tr->cur_image && total == 0); if (tr->cur_image) /* progs already running at this address */ - err = modify_fentry(tr, tr->cur_image->image, im->image, lock_direct_mutex); + err = modify_fentry(tr, orig_flags, tr->cur_image->image, + im->image, lock_direct_mutex); else /* first time registering */ err = register_fentry(tr, im->image); @@ -491,8 +533,15 @@ again: tr->cur_image = im; out: /* If any error happens, restore previous flags */ - if (err) + if (err) { tr->flags = orig_flags; +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP + if (bpf_trampoline_use_jmp(tr->flags)) + tr->fops->flags |= FTRACE_OPS_FL_JMP; + else + tr->fops->flags &= ~FTRACE_OPS_FL_JMP; +#endif + } kfree(tlinks); return err; @@ -568,7 +617,8 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, if (err) return err; tr->extension_prog = link->link.prog; - return bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL, + return bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP, + BPF_MOD_JUMP, NULL, link->link.prog->bpf_func); } if (cnt >= BPF_MAX_TRAMP_LINKS) @@ -616,6 +666,7 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, if (kind == BPF_TRAMP_REPLACE) { WARN_ON_ONCE(!tr->extension_prog); err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, + BPF_MOD_NOP, tr->extension_prog->bpf_func, NULL); tr->extension_prog = NULL; guard(mutex)(&tgt_prog->aux->ext_mutex); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fbe4bb91c564ae..f0ca69f888fac2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -209,8 +209,6 @@ static void invalidate_non_owning_refs(struct bpf_verifier_env *env); static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env); static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg); -static void 
specialize_kfunc(struct bpf_verifier_env *env, - u32 func_id, u16 offset, unsigned long *addr); static bool is_trusted_reg(const struct bpf_reg_state *reg); static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux) @@ -515,6 +513,7 @@ static bool is_callback_calling_kfunc(u32 btf_id); static bool is_bpf_throw_kfunc(struct bpf_insn *insn); static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id); +static bool is_task_work_add_kfunc(u32 func_id); static bool is_sync_callback_calling_function(enum bpf_func_id func_id) { @@ -547,6 +546,21 @@ static bool is_async_callback_calling_insn(struct bpf_insn *insn) (bpf_pseudo_kfunc_call(insn) && is_async_callback_calling_kfunc(insn->imm)); } +static bool is_async_cb_sleepable(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + /* bpf_timer callbacks are never sleepable. */ + if (bpf_helper_call(insn) && insn->imm == BPF_FUNC_timer_set_callback) + return false; + + /* bpf_wq and bpf_task_work callbacks are always sleepable. */ + if (bpf_pseudo_kfunc_call(insn) && insn->off == 0 && + (is_bpf_wq_set_callback_impl_kfunc(insn->imm) || is_task_work_add_kfunc(insn->imm))) + return true; + + verifier_bug(env, "unhandled async callback in is_async_cb_sleepable"); + return false; +} + static bool is_may_goto_insn(struct bpf_insn *insn) { return insn->code == (BPF_JMP | BPF_JCOND) && insn->src_reg == BPF_MAY_GOTO; @@ -676,6 +690,8 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type) return BPF_DYNPTR_TYPE_XDP; case DYNPTR_TYPE_SKB_META: return BPF_DYNPTR_TYPE_SKB_META; + case DYNPTR_TYPE_FILE: + return BPF_DYNPTR_TYPE_FILE; default: return BPF_DYNPTR_TYPE_INVALID; } @@ -694,6 +710,8 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type) return DYNPTR_TYPE_XDP; case BPF_DYNPTR_TYPE_SKB_META: return DYNPTR_TYPE_SKB_META; + case BPF_DYNPTR_TYPE_FILE: + return DYNPTR_TYPE_FILE; default: return 0; } @@ -701,7 +719,7 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type) static bool dynptr_type_refcounted(enum bpf_dynptr_type type) { - return type == BPF_DYNPTR_TYPE_RINGBUF; + return type == BPF_DYNPTR_TYPE_RINGBUF || type == BPF_DYNPTR_TYPE_FILE; } static void __mark_dynptr_reg(struct bpf_reg_state *reg, @@ -812,6 +830,15 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re struct bpf_func_state *state = func(env, reg); int spi, ref_obj_id, i; + /* + * This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot + * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr + * is safe to do directly. 
+ */ + if (reg->type == CONST_PTR_TO_DYNPTR) { + verifier_bug(env, "CONST_PTR_TO_DYNPTR cannot be released"); + return -EFAULT; + } spi = dynptr_get_spi(env, reg); if (spi < 0) return spi; @@ -1410,7 +1437,7 @@ static int copy_reference_state(struct bpf_verifier_state *dst, const struct bpf dst->acquired_refs = src->acquired_refs; dst->active_locks = src->active_locks; dst->active_preempt_locks = src->active_preempt_locks; - dst->active_rcu_lock = src->active_rcu_lock; + dst->active_rcu_locks = src->active_rcu_locks; dst->active_irq_id = src->active_irq_id; dst->active_lock_id = src->active_lock_id; dst->active_lock_ptr = src->active_lock_ptr; @@ -2093,7 +2120,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL_ACCOUNT); if (!elem) - return NULL; + return ERR_PTR(-ENOMEM); elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; @@ -2103,12 +2130,12 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, env->stack_size++; err = copy_verifier_state(&elem->st, cur); if (err) - return NULL; + return ERR_PTR(-ENOMEM); elem->st.speculative |= speculative; if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) { verbose(env, "The sequence of %d jumps is too complex.\n", env->stack_size); - return NULL; + return ERR_PTR(-E2BIG); } if (elem->st.parent) { ++elem->st.parent->branches; @@ -2903,7 +2930,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL_ACCOUNT); if (!elem) - return NULL; + return ERR_PTR(-ENOMEM); elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; @@ -2915,7 +2942,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, verbose(env, "The sequence of %d jumps is too complex for async cb.\n", env->stack_size); - return NULL; + return ERR_PTR(-E2BIG); } /* Unlike push_stack() do not copy_verifier_state(). * The caller state doesn't matter. 
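/*
 * Illustrative sketch, not part of the patch: the calling convention the
 * push_stack()/push_async_cb() changes above move to. The helpers now return
 * ERR_PTR(-ENOMEM) or ERR_PTR(-E2BIG) instead of NULL, so callers can forward
 * the precise error rather than a generic -EFAULT or -ENOMEM. The wrapper
 * below is hypothetical; IS_ERR()/PTR_ERR() come from <linux/err.h>.
 */
static int queue_branch_state(struct bpf_verifier_env *env, int insn_idx)
{
	struct bpf_verifier_state *st;

	st = push_stack(env, insn_idx + 1, insn_idx, false);
	if (IS_ERR(st))
		return PTR_ERR(st);	/* -ENOMEM or -E2BIG */
	return 0;
}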
@@ -2926,7 +2953,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, elem->st.in_sleepable = is_sleepable; frame = kzalloc(sizeof(*frame), GFP_KERNEL_ACCOUNT); if (!frame) - return NULL; + return ERR_PTR(-ENOMEM); init_func_state(env, frame, BPF_MAIN_FUNC /* callsite */, 0 /* frameno within this callchain */, @@ -3097,6 +3124,9 @@ struct bpf_kfunc_btf_tab { u32 nr_descs; }; +static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc, + int insn_idx); + static int kfunc_desc_cmp_by_id_off(const void *a, const void *b) { const struct bpf_kfunc_desc *d0 = a; @@ -3114,7 +3144,7 @@ static int kfunc_btf_cmp_by_off(const void *a, const void *b) return d0->offset - d1->offset; } -static const struct bpf_kfunc_desc * +static struct bpf_kfunc_desc * find_kfunc_desc(const struct bpf_prog *prog, u32 func_id, u16 offset) { struct bpf_kfunc_desc desc = { @@ -3237,12 +3267,12 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) { const struct btf_type *func, *func_proto; struct bpf_kfunc_btf_tab *btf_tab; + struct btf_func_model func_model; struct bpf_kfunc_desc_tab *tab; struct bpf_prog_aux *prog_aux; struct bpf_kfunc_desc *desc; const char *func_name; struct btf *desc_btf; - unsigned long call_imm; unsigned long addr; int err; @@ -3326,19 +3356,6 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) func_name); return -EINVAL; } - specialize_kfunc(env, func_id, offset, &addr); - - if (bpf_jit_supports_far_kfunc_call()) { - call_imm = func_id; - } else { - call_imm = BPF_CALL_IMM(addr); - /* Check whether the relative offset overflows desc->imm */ - if ((unsigned long)(s32)call_imm != call_imm) { - verbose(env, "address of kernel function %s is out of range\n", - func_name); - return -EINVAL; - } - } if (bpf_dev_bound_kfunc_id(func_id)) { err = bpf_dev_bound_kfunc_check(&env->log, prog_aux); @@ -3346,18 +3363,20 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) return err; } + err = btf_distill_func_proto(&env->log, desc_btf, + func_proto, func_name, + &func_model); + if (err) + return err; + desc = &tab->descs[tab->nr_descs++]; desc->func_id = func_id; - desc->imm = call_imm; desc->offset = offset; desc->addr = addr; - err = btf_distill_func_proto(&env->log, desc_btf, - func_proto, func_name, - &desc->func_model); - if (!err) - sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), - kfunc_desc_cmp_by_id_off, NULL); - return err; + desc->func_model = func_model; + sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), + kfunc_desc_cmp_by_id_off, NULL); + return 0; } static int kfunc_desc_cmp_by_imm_off(const void *a, const void *b) @@ -3372,16 +3391,43 @@ static int kfunc_desc_cmp_by_imm_off(const void *a, const void *b) return 0; } -static void sort_kfunc_descs_by_imm_off(struct bpf_prog *prog) +static int set_kfunc_desc_imm(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc) +{ + unsigned long call_imm; + + if (bpf_jit_supports_far_kfunc_call()) { + call_imm = desc->func_id; + } else { + call_imm = BPF_CALL_IMM(desc->addr); + /* Check whether the relative offset overflows desc->imm */ + if ((unsigned long)(s32)call_imm != call_imm) { + verbose(env, "address of kernel func_id %u is out of range\n", + desc->func_id); + return -EINVAL; + } + } + desc->imm = call_imm; + return 0; +} + +static int sort_kfunc_descs_by_imm_off(struct bpf_verifier_env *env) { struct bpf_kfunc_desc_tab *tab; + int i, err; - tab = prog->aux->kfunc_tab; + tab = 
env->prog->aux->kfunc_tab; if (!tab) - return; + return 0; + + for (i = 0; i < tab->nr_descs; i++) { + err = set_kfunc_desc_imm(env, &tab->descs[i]); + if (err) + return err; + } sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm_off, NULL); + return 0; } bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog) @@ -3509,8 +3555,12 @@ static int check_subprogs(struct bpf_verifier_env *env) subprog[cur_subprog].has_ld_abs = true; if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) goto next; - if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) + if (BPF_OP(code) == BPF_CALL) goto next; + if (BPF_OP(code) == BPF_EXIT) { + subprog[cur_subprog].exit_idx = i; + goto next; + } off = i + bpf_jmp_offset(&insn[i]) + 1; if (off < subprog_start || off >= subprog_end) { verbose(env, "jump out of range from insn %d to %d\n", i, off); @@ -4392,6 +4442,11 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, bt_reg_mask(bt)); return -EFAULT; } + if (insn->src_reg == BPF_REG_0 && insn->imm == BPF_FUNC_tail_call + && subseq_idx - idx != 1) { + if (bt_subprog_enter(bt)) + return -EFAULT; + } } else if (opcode == BPF_EXIT) { bool r0_precise; @@ -5826,8 +5881,7 @@ bad_type: static bool in_sleepable(struct bpf_verifier_env *env) { - return env->prog->sleepable || - (env->cur_state && env->cur_state->in_sleepable); + return env->cur_state->in_sleepable; } /* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock() @@ -5835,7 +5889,7 @@ static bool in_sleepable(struct bpf_verifier_env *env) */ static bool in_rcu_cs(struct bpf_verifier_env *env) { - return env->cur_state->active_rcu_lock || + return env->cur_state->active_rcu_locks || env->cur_state->active_locks || !in_sleepable(env); } @@ -5988,6 +6042,18 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, return 0; } +/* + * Return the size of the memory region accessible from a pointer to map value. + * For INSN_ARRAY maps whole bpf_insn_array->ips array is accessible. 
+ */ +static u32 map_mem_size(const struct bpf_map *map) +{ + if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) + return map->max_entries * sizeof(long); + + return map->value_size; +} + /* check read/write into a map element with possible variable offset */ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed, @@ -5997,11 +6063,11 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *reg = &state->regs[regno]; struct bpf_map *map = reg->map_ptr; + u32 mem_size = map_mem_size(map); struct btf_record *rec; int err, i; - err = check_mem_region_access(env, regno, off, size, map->value_size, - zero_size_allowed); + err = check_mem_region_access(env, regno, off, size, mem_size, zero_size_allowed); if (err) return err; @@ -6416,6 +6482,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, break; case PTR_TO_MAP_VALUE: pointer_desc = "value "; + if (reg->map_ptr->map_type == BPF_MAP_TYPE_INSN_ARRAY) + strict = true; break; case PTR_TO_CTX: pointer_desc = "context "; @@ -7039,6 +7107,9 @@ BTF_TYPE_SAFE_RCU(struct cgroup_subsys_state) { /* RCU trusted: these fields are trusted in RCU CS and can be NULL */ BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct) { struct file __rcu *exe_file; +#ifdef CONFIG_MEMCG + struct task_struct __rcu *owner; +#endif }; /* skb->sk, req->sk are not RCU protected, but we mark them as such @@ -7078,6 +7149,11 @@ BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket) { struct sock *sk; }; +BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct) { + struct mm_struct *vm_mm; + struct file *vm_file; +}; + static bool type_is_rcu(struct bpf_verifier_env *env, struct bpf_reg_state *reg, const char *field_name, u32 btf_id) @@ -7119,6 +7195,7 @@ static bool type_is_trusted_or_null(struct bpf_verifier_env *env, { BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct)); return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted_or_null"); @@ -7502,10 +7579,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else if (t == BPF_READ && value_regno >= 0) { struct bpf_map *map = reg->map_ptr; - /* if map is read-only, track its contents as scalars */ + /* + * If map is read-only, track its contents as scalars, + * unless it is an insn array (see the special case below) + */ if (tnum_is_const(reg->var_off) && bpf_map_is_rdonly(map) && - map->ops->map_direct_value_addr) { + map->ops->map_direct_value_addr && + map->map_type != BPF_MAP_TYPE_INSN_ARRAY) { int map_off = off + reg->var_off.value; u64 val = 0; @@ -7516,6 +7597,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn regs[value_regno].type = SCALAR_VALUE; __mark_reg_known(®s[value_regno], val); + } else if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) { + if (bpf_size != BPF_DW) { + verbose(env, "Invalid read of %d bytes from insn_array\n", + size); + return -EACCES; + } + copy_register_state(®s[value_regno], reg); + regs[value_regno].type = PTR_TO_INSN; } else { mark_reg_unknown(env, regs, value_regno); } @@ -8464,6 +8553,9 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, case BPF_TASK_WORK: field_off = map->record->task_work_off; break; + case BPF_WORKQUEUE: + field_off = map->record->wq_off; + break; default: 
verifier_bug(env, "unsupported BTF field type: %s\n", struct_name); return -EINVAL; @@ -8505,13 +8597,17 @@ static int process_wq_func(struct bpf_verifier_env *env, int regno, { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; struct bpf_map *map = reg->map_ptr; - u64 val = reg->var_off.value; + int err; - if (map->record->wq_off != val + reg->off) { - verbose(env, "off %lld doesn't point to 'struct bpf_wq' that is at %d\n", - val + reg->off, map->record->wq_off); - return -EINVAL; + err = check_map_field_pointer(env, regno, BPF_WORKQUEUE); + if (err) + return err; + + if (meta->map.ptr) { + verifier_bug(env, "Two map pointers in a bpf_wq helper"); + return -EFAULT; } + meta->map.uid = reg->map_uid; meta->map.ptr = map; return 0; @@ -9016,8 +9112,8 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, prev_st = find_prev_entry(env, cur_st->parent, insn_idx); /* branch out active iter state */ queued_st = push_stack(env, insn_idx + 1, insn_idx, false); - if (!queued_st) - return -ENOMEM; + if (IS_ERR(queued_st)) + return PTR_ERR(queued_st); queued_iter = get_iter_from_state(queued_st, meta); queued_iter->iter.state = BPF_ITER_STATE_ACTIVE; @@ -10054,6 +10150,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_map_push_elem) goto error; break; + case BPF_MAP_TYPE_INSN_ARRAY: + goto error; default: break; } @@ -10368,8 +10466,6 @@ typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env, struct bpf_func_state *callee, int insn_idx); -static bool is_task_work_add_kfunc(u32 func_id); - static int set_callee_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, int insn_idx); @@ -10588,10 +10684,9 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins env->subprog_info[subprog].is_async_cb = true; async_cb = push_async_cb(env, env->subprog_info[subprog].start, insn_idx, subprog, - is_bpf_wq_set_callback_impl_kfunc(insn->imm) || - is_task_work_add_kfunc(insn->imm)); - if (!async_cb) - return -EFAULT; + is_async_cb_sleepable(env, insn)); + if (IS_ERR(async_cb)) + return PTR_ERR(async_cb); callee = async_cb->frame[0]; callee->async_entry_cnt = caller->async_entry_cnt + 1; @@ -10607,8 +10702,8 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins * proceed with next instruction within current frame. 
*/ callback_state = push_stack(env, env->subprog_info[subprog].start, insn_idx, false); - if (!callback_state) - return -ENOMEM; + if (IS_ERR(callback_state)) + return PTR_ERR(callback_state); err = setup_func_entry(env, subprog, insn_idx, set_callee_state_cb, callback_state); @@ -10648,7 +10743,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } if (env->subprog_info[subprog].might_sleep && - (env->cur_state->active_rcu_lock || env->cur_state->active_preempt_locks || + (env->cur_state->active_rcu_locks || env->cur_state->active_preempt_locks || env->cur_state->active_irq_id || !in_sleepable(env))) { verbose(env, "global functions that may sleep are not allowed in non-sleepable context,\n" "i.e., in a RCU/IRQ/preempt-disabled section, or in\n" @@ -10662,8 +10757,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return err; } - verbose(env, "Func#%d ('%s') is global and assumed valid.\n", - subprog, sub_name); + if (env->log.level & BPF_LOG_LEVEL) + verbose(env, "Func#%d ('%s') is global and assumed valid.\n", + subprog, sub_name); if (env->subprog_info[subprog].changes_pkt_data) clear_all_pkt_pointers(env); /* mark global subprog for verifying after main prog */ @@ -10976,6 +11072,10 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) bool in_callback_fn; int err; + err = bpf_update_live_stack(env); + if (err) + return err; + callee = state->frame[state->curframe]; r0 = &callee->regs[BPF_REG_0]; if (r0->type == PTR_TO_STACK) { @@ -11226,7 +11326,7 @@ static int check_resource_leak(struct bpf_verifier_env *env, bool exception_exit return -EINVAL; } - if (check_lock && env->cur_state->active_rcu_lock) { + if (check_lock && env->cur_state->active_rcu_locks) { verbose(env, "%s cannot be used inside bpf_rcu_read_lock-ed region\n", prefix); return -EINVAL; } @@ -11361,6 +11461,15 @@ static int get_helper_proto(struct bpf_verifier_env *env, int func_id, return *ptr && (*ptr)->func ? 0 : -EINVAL; } +/* Check if we're in a sleepable context. 
*/ +static inline bool in_sleepable_context(struct bpf_verifier_env *env) +{ + return !env->cur_state->active_rcu_locks && + !env->cur_state->active_preempt_locks && + !env->cur_state->active_irq_id && + in_sleepable(env); +} + static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { @@ -11421,15 +11530,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return err; } - if (env->cur_state->active_rcu_lock) { + if (env->cur_state->active_rcu_locks) { if (fn->might_sleep) { verbose(env, "sleepable helper %s#%d in rcu_read_lock region\n", func_id_name(func_id), func_id); return -EINVAL; } - - if (in_sleepable(env) && is_storage_get_function(func_id)) - env->insn_aux_data[insn_idx].storage_get_func_atomic = true; } if (env->cur_state->active_preempt_locks) { @@ -11438,9 +11544,6 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn func_id_name(func_id), func_id); return -EINVAL; } - - if (in_sleepable(env) && is_storage_get_function(func_id)) - env->insn_aux_data[insn_idx].storage_get_func_atomic = true; } if (env->cur_state->active_irq_id) { @@ -11449,11 +11552,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn func_id_name(func_id), func_id); return -EINVAL; } - - if (in_sleepable(env) && is_storage_get_function(func_id)) - env->insn_aux_data[insn_idx].storage_get_func_atomic = true; } + /* Track non-sleepable context for helpers. */ + if (!in_sleepable_context(env)) + env->insn_aux_data[insn_idx].non_sleepable = true; + meta.func_id = func_id; /* check args */ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { @@ -11484,15 +11588,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (meta.release_regno) { err = -EINVAL; - /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot - * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr - * is safe to do directly. 
- */ if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) { - if (regs[meta.release_regno].type == CONST_PTR_TO_DYNPTR) { - verifier_bug(env, "CONST_PTR_TO_DYNPTR cannot be released"); - return -EFAULT; - } err = unmark_stack_slots_dynptr(env, ®s[meta.release_regno]); } else if (func_id == BPF_FUNC_kptr_xchg && meta.ref_obj_id) { u32 ref_obj_id = meta.ref_obj_id; @@ -11886,6 +11982,25 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn env->prog->call_get_func_ip = true; } + if (func_id == BPF_FUNC_tail_call) { + if (env->cur_state->curframe) { + struct bpf_verifier_state *branch; + + mark_reg_scratched(env, BPF_REG_0); + branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false); + if (IS_ERR(branch)) + return PTR_ERR(branch); + clear_all_pkt_pointers(env); + mark_reg_unknown(env, regs, BPF_REG_0); + err = prepare_func_exit(env, &env->insn_idx); + if (err) + return err; + env->insn_idx--; + } else { + changes_data = false; + } + } + if (changes_data) clear_all_pkt_pointers(env); return 0; @@ -12260,6 +12375,8 @@ enum special_kfunc_type { KF_bpf_res_spin_unlock, KF_bpf_res_spin_lock_irqsave, KF_bpf_res_spin_unlock_irqrestore, + KF_bpf_dynptr_from_file, + KF_bpf_dynptr_file_discard, KF___bpf_trap, KF_bpf_task_work_schedule_signal_impl, KF_bpf_task_work_schedule_resume_impl, @@ -12332,6 +12449,8 @@ BTF_ID(func, bpf_res_spin_lock) BTF_ID(func, bpf_res_spin_unlock) BTF_ID(func, bpf_res_spin_lock_irqsave) BTF_ID(func, bpf_res_spin_unlock_irqrestore) +BTF_ID(func, bpf_dynptr_from_file) +BTF_ID(func, bpf_dynptr_file_discard) BTF_ID(func, __bpf_trap) BTF_ID(func, bpf_task_work_schedule_signal_impl) BTF_ID(func, bpf_task_work_schedule_resume_impl) @@ -13295,6 +13414,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ dynptr_arg_type |= DYNPTR_TYPE_XDP; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb_meta]) { dynptr_arg_type |= DYNPTR_TYPE_SKB_META; + } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) { + dynptr_arg_type |= DYNPTR_TYPE_FILE; + } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_file_discard]) { + dynptr_arg_type |= DYNPTR_TYPE_FILE; + meta->release_regno = regno; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] && (dynptr_arg_type & MEM_UNINIT)) { enum bpf_dynptr_type parent_type = meta->initialized_dynptr.type; @@ -13829,9 +13953,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_reg_state *regs; branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false); - if (!branch) { + if (IS_ERR(branch)) { verbose(env, "failed to push state for failed lock acquisition\n"); - return -ENOMEM; + return PTR_ERR(branch); } regs = branch->frame[branch->curframe]->regs; @@ -13863,6 +13987,10 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EACCES; } + /* Track non-sleepable context for kfuncs, same as for helpers. 
*/ + if (!in_sleepable_context(env)) + insn_aux->non_sleepable = true; + /* Check the arguments */ err = check_kfunc_args(env, &meta, insn_idx); if (err < 0) @@ -13909,36 +14037,33 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, preempt_disable = is_kfunc_bpf_preempt_disable(&meta); preempt_enable = is_kfunc_bpf_preempt_enable(&meta); - if (env->cur_state->active_rcu_lock) { + if (rcu_lock) { + env->cur_state->active_rcu_locks++; + } else if (rcu_unlock) { struct bpf_func_state *state; struct bpf_reg_state *reg; u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER); - if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) { - verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n"); - return -EACCES; - } - - if (rcu_lock) { - verbose(env, "nested rcu read lock (kernel function %s)\n", func_name); + if (env->cur_state->active_rcu_locks == 0) { + verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name); return -EINVAL; - } else if (rcu_unlock) { + } + if (--env->cur_state->active_rcu_locks == 0) { bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, clear_mask, ({ if (reg->type & MEM_RCU) { reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL); reg->type |= PTR_UNTRUSTED; } })); - env->cur_state->active_rcu_lock = false; - } else if (sleepable) { - verbose(env, "kernel func %s is sleepable within rcu_read_lock region\n", func_name); - return -EACCES; } - } else if (rcu_lock) { - env->cur_state->active_rcu_lock = true; - } else if (rcu_unlock) { - verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name); - return -EINVAL; + } else if (sleepable && env->cur_state->active_rcu_locks) { + verbose(env, "kernel func %s is sleepable within rcu_read_lock region\n", func_name); + return -EACCES; + } + + if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) { + verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n"); + return -EACCES; } if (env->cur_state->active_preempt_locks) { @@ -13971,12 +14096,18 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now. 
*/ if (meta.release_regno) { - err = release_reference(env, regs[meta.release_regno].ref_obj_id); - if (err) { - verbose(env, "kfunc %s#%d reference has not been acquired before\n", - func_name, meta.func_id); - return err; + struct bpf_reg_state *reg = ®s[meta.release_regno]; + + if (meta.initialized_dynptr.ref_obj_id) { + err = unmark_stack_slots_dynptr(env, reg); + } else { + err = release_reference(env, reg->ref_obj_id); + if (err) + verbose(env, "kfunc %s#%d reference has not been acquired before\n", + func_name, meta.func_id); } + if (err) + return err; } if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front_impl] || @@ -14282,16 +14413,15 @@ struct bpf_sanitize_info { bool mask_to_left; }; -static struct bpf_verifier_state * -sanitize_speculative_path(struct bpf_verifier_env *env, - const struct bpf_insn *insn, - u32 next_idx, u32 curr_idx) +static int sanitize_speculative_path(struct bpf_verifier_env *env, + const struct bpf_insn *insn, + u32 next_idx, u32 curr_idx) { struct bpf_verifier_state *branch; struct bpf_reg_state *regs; branch = push_stack(env, next_idx, curr_idx, true); - if (branch && insn) { + if (!IS_ERR(branch) && insn) { regs = branch->frame[branch->curframe]->regs; if (BPF_SRC(insn->code) == BPF_K) { mark_reg_unknown(env, regs, insn->dst_reg); @@ -14300,7 +14430,7 @@ sanitize_speculative_path(struct bpf_verifier_env *env, mark_reg_unknown(env, regs, insn->src_reg); } } - return branch; + return PTR_ERR_OR_ZERO(branch); } static int sanitize_ptr_alu(struct bpf_verifier_env *env, @@ -14319,7 +14449,6 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, u8 opcode = BPF_OP(insn->code); u32 alu_state, alu_limit; struct bpf_reg_state tmp; - bool ret; int err; if (can_skip_alu_sanitation(env, insn)) @@ -14392,11 +14521,12 @@ do_sim: tmp = *dst_reg; copy_register_state(dst_reg, ptr_reg); } - ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1, - env->insn_idx); - if (!ptr_is_dst_reg && ret) + err = sanitize_speculative_path(env, NULL, env->insn_idx + 1, env->insn_idx); + if (err < 0) + return REASON_STACK; + if (!ptr_is_dst_reg) *dst_reg = tmp; - return !ret ? REASON_STACK : 0; + return 0; } static void sanitize_mark_insn_seen(struct bpf_verifier_env *env) @@ -15950,6 +16080,30 @@ static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_sta s64 smin2 = is_jmp32 ? (s64)reg2->s32_min_value : reg2->smin_value; s64 smax2 = is_jmp32 ? (s64)reg2->s32_max_value : reg2->smax_value; + if (reg1 == reg2) { + switch (opcode) { + case BPF_JGE: + case BPF_JLE: + case BPF_JSGE: + case BPF_JSLE: + case BPF_JEQ: + return 1; + case BPF_JGT: + case BPF_JLT: + case BPF_JSGT: + case BPF_JSLT: + case BPF_JNE: + return 0; + case BPF_JSET: + if (tnum_is_const(t1)) + return t1.value != 0; + else + return (smin1 <= 0 && smax1 >= 0) ? -1 : 1; + default: + return -1; + } + } + switch (opcode) { case BPF_JEQ: /* constants, umin/umax and smin/smax checks would be @@ -16396,6 +16550,13 @@ static int reg_set_min_max(struct bpf_verifier_env *env, if (false_reg1->type != SCALAR_VALUE || false_reg2->type != SCALAR_VALUE) return 0; + /* We compute branch direction for same SCALAR_VALUE registers in + * is_scalar_branch_taken(). For unknown branch directions (e.g., BPF_JSET) + * on the same registers, we don't need to adjust the min/max values. 
+ */ + if (false_reg1 == false_reg2) + return 0; + /* fallthrough (FALSE) branch */ regs_refine_cond_op(false_reg1, false_reg2, rev_opcode(opcode), is_jmp32); reg_bounds_sync(false_reg1); @@ -16716,8 +16877,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, /* branch out 'fallthrough' insn as a new state to explore */ queued_st = push_stack(env, idx + 1, idx, false); - if (!queued_st) - return -ENOMEM; + if (IS_ERR(queued_st)) + return PTR_ERR(queued_st); queued_st->may_goto_depth++; if (prev_st) @@ -16795,10 +16956,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, * the fall-through branch for simulation under speculative * execution. */ - if (!env->bypass_spec_v1 && - !sanitize_speculative_path(env, insn, *insn_idx + 1, - *insn_idx)) - return -EFAULT; + if (!env->bypass_spec_v1) { + err = sanitize_speculative_path(env, insn, *insn_idx + 1, *insn_idx); + if (err < 0) + return err; + } if (env->log.level & BPF_LOG_LEVEL) print_insn_state(env, this_branch, this_branch->curframe); *insn_idx += insn->off; @@ -16808,11 +16970,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, * program will go. If needed, push the goto branch for * simulation under speculative execution. */ - if (!env->bypass_spec_v1 && - !sanitize_speculative_path(env, insn, - *insn_idx + insn->off + 1, - *insn_idx)) - return -EFAULT; + if (!env->bypass_spec_v1) { + err = sanitize_speculative_path(env, insn, *insn_idx + insn->off + 1, + *insn_idx); + if (err < 0) + return err; + } if (env->log.level & BPF_LOG_LEVEL) print_insn_state(env, this_branch, this_branch->curframe); return 0; @@ -16833,10 +16996,9 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return err; } - other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, - false); - if (!other_branch) - return -EFAULT; + other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, false); + if (IS_ERR(other_branch)) + return PTR_ERR(other_branch); other_branch_regs = other_branch->frame[other_branch->curframe]->regs; if (BPF_SRC(insn->code) == BPF_X) { @@ -17019,7 +17181,8 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) } dst_reg->type = PTR_TO_MAP_VALUE; dst_reg->off = aux->map_off; - WARN_ON_ONCE(map->max_entries != 1); + WARN_ON_ONCE(map->map_type != BPF_MAP_TYPE_INSN_ARRAY && + map->max_entries != 1); /* We want reg->id to be same (0) as map_value is not distinct */ } else if (insn->src_reg == BPF_PSEUDO_MAP_FD || insn->src_reg == BPF_PSEUDO_MAP_IDX) { @@ -17771,6 +17934,247 @@ static int mark_fastcall_patterns(struct bpf_verifier_env *env) return 0; } +static struct bpf_iarray *iarray_realloc(struct bpf_iarray *old, size_t n_elem) +{ + size_t new_size = sizeof(struct bpf_iarray) + n_elem * sizeof(old->items[0]); + struct bpf_iarray *new; + + new = kvrealloc(old, new_size, GFP_KERNEL_ACCOUNT); + if (!new) { + /* this is what callers always want, so simplify the call site */ + kvfree(old); + return NULL; + } + + new->cnt = n_elem; + return new; +} + +static int copy_insn_array(struct bpf_map *map, u32 start, u32 end, u32 *items) +{ + struct bpf_insn_array_value *value; + u32 i; + + for (i = start; i <= end; i++) { + value = map->ops->map_lookup_elem(map, &i); + /* + * map_lookup_elem of an array map will never return an error, + * but not checking it makes some static analysers to worry + */ + if (IS_ERR(value)) + return PTR_ERR(value); + else if (!value) + return -EINVAL; + items[i - start] = value->xlated_off; + } + return 0; +} + +static int 
cmp_ptr_to_u32(const void *a, const void *b) +{ + return *(u32 *)a - *(u32 *)b; +} + +static int sort_insn_array_uniq(u32 *items, int cnt) +{ + int unique = 1; + int i; + + sort(items, cnt, sizeof(items[0]), cmp_ptr_to_u32, NULL); + + for (i = 1; i < cnt; i++) + if (items[i] != items[unique - 1]) + items[unique++] = items[i]; + + return unique; +} + +/* + * sort_unique({map[start], ..., map[end]}) into off + */ +static int copy_insn_array_uniq(struct bpf_map *map, u32 start, u32 end, u32 *off) +{ + u32 n = end - start + 1; + int err; + + err = copy_insn_array(map, start, end, off); + if (err) + return err; + + return sort_insn_array_uniq(off, n); +} + +/* + * Copy all unique offsets from the map + */ +static struct bpf_iarray *jt_from_map(struct bpf_map *map) +{ + struct bpf_iarray *jt; + int err; + int n; + + jt = iarray_realloc(NULL, map->max_entries); + if (!jt) + return ERR_PTR(-ENOMEM); + + n = copy_insn_array_uniq(map, 0, map->max_entries - 1, jt->items); + if (n < 0) { + err = n; + goto err_free; + } + if (n == 0) { + err = -EINVAL; + goto err_free; + } + jt->cnt = n; + return jt; + +err_free: + kvfree(jt); + return ERR_PTR(err); +} + +/* + * Find and collect all maps which fit in the subprog. Return the result as one + * combined jump table in jt->items (allocated with kvcalloc) + */ +static struct bpf_iarray *jt_from_subprog(struct bpf_verifier_env *env, + int subprog_start, int subprog_end) +{ + struct bpf_iarray *jt = NULL; + struct bpf_map *map; + struct bpf_iarray *jt_cur; + int i; + + for (i = 0; i < env->insn_array_map_cnt; i++) { + /* + * TODO (when needed): collect only jump tables, not static keys + * or maps for indirect calls + */ + map = env->insn_array_maps[i]; + + jt_cur = jt_from_map(map); + if (IS_ERR(jt_cur)) { + kvfree(jt); + return jt_cur; + } + + /* + * This is enough to check one element. The full table is + * checked to fit inside the subprog later in create_jt() + */ + if (jt_cur->items[0] >= subprog_start && jt_cur->items[0] < subprog_end) { + u32 old_cnt = jt ? 
jt->cnt : 0; + jt = iarray_realloc(jt, old_cnt + jt_cur->cnt); + if (!jt) { + kvfree(jt_cur); + return ERR_PTR(-ENOMEM); + } + memcpy(jt->items + old_cnt, jt_cur->items, jt_cur->cnt << 2); + } + + kvfree(jt_cur); + } + + if (!jt) { + verbose(env, "no jump tables found for subprog starting at %u\n", subprog_start); + return ERR_PTR(-EINVAL); + } + + jt->cnt = sort_insn_array_uniq(jt->items, jt->cnt); + return jt; +} + +static struct bpf_iarray * +create_jt(int t, struct bpf_verifier_env *env) +{ + static struct bpf_subprog_info *subprog; + int subprog_start, subprog_end; + struct bpf_iarray *jt; + int i; + + subprog = bpf_find_containing_subprog(env, t); + subprog_start = subprog->start; + subprog_end = (subprog + 1)->start; + jt = jt_from_subprog(env, subprog_start, subprog_end); + if (IS_ERR(jt)) + return jt; + + /* Check that the every element of the jump table fits within the given subprogram */ + for (i = 0; i < jt->cnt; i++) { + if (jt->items[i] < subprog_start || jt->items[i] >= subprog_end) { + verbose(env, "jump table for insn %d points outside of the subprog [%u,%u]\n", + t, subprog_start, subprog_end); + kvfree(jt); + return ERR_PTR(-EINVAL); + } + } + + return jt; +} + +/* "conditional jump with N edges" */ +static int visit_gotox_insn(int t, struct bpf_verifier_env *env) +{ + int *insn_stack = env->cfg.insn_stack; + int *insn_state = env->cfg.insn_state; + bool keep_exploring = false; + struct bpf_iarray *jt; + int i, w; + + jt = env->insn_aux_data[t].jt; + if (!jt) { + jt = create_jt(t, env); + if (IS_ERR(jt)) + return PTR_ERR(jt); + + env->insn_aux_data[t].jt = jt; + } + + mark_prune_point(env, t); + for (i = 0; i < jt->cnt; i++) { + w = jt->items[i]; + if (w < 0 || w >= env->prog->len) { + verbose(env, "indirect jump out of range from insn %d to %d\n", t, w); + return -EINVAL; + } + + mark_jmp_point(env, w); + + /* EXPLORED || DISCOVERED */ + if (insn_state[w]) + continue; + + if (env->cfg.cur_stack >= env->prog->len) + return -E2BIG; + + insn_stack[env->cfg.cur_stack++] = w; + insn_state[w] |= DISCOVERED; + keep_exploring = true; + } + + return keep_exploring ? 
KEEP_EXPLORING : DONE_EXPLORING; +} + +static int visit_tailcall_insn(struct bpf_verifier_env *env, int t) +{ + static struct bpf_subprog_info *subprog; + struct bpf_iarray *jt; + + if (env->insn_aux_data[t].jt) + return 0; + + jt = iarray_realloc(NULL, 2); + if (!jt) + return -ENOMEM; + + subprog = bpf_find_containing_subprog(env, t); + jt->items[0] = t + 1; + jt->items[1] = subprog->exit_idx; + env->insn_aux_data[t].jt = jt; + return 0; +} + /* Visits the instruction at index t and returns one of the following: * < 0 - an error occurred * DONE_EXPLORING - the instruction was fully explored @@ -17831,6 +18235,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env) mark_subprog_might_sleep(env, t); if (bpf_helper_changes_pkt_data(insn->imm)) mark_subprog_changes_pkt_data(env, t); + if (insn->imm == BPF_FUNC_tail_call) + visit_tailcall_insn(env, t); } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { struct bpf_kfunc_call_arg_meta meta; @@ -17863,8 +18269,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env) return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL); case BPF_JA: - if (BPF_SRC(insn->code) != BPF_K) - return -EINVAL; + if (BPF_SRC(insn->code) == BPF_X) + return visit_gotox_insn(t, env); if (BPF_CLASS(insn->code) == BPF_JMP) off = insn->off; @@ -17991,8 +18397,9 @@ err_free: */ static int compute_postorder(struct bpf_verifier_env *env) { - u32 cur_postorder, i, top, stack_sz, s, succ_cnt, succ[2]; + u32 cur_postorder, i, top, stack_sz, s; int *stack = NULL, *postorder = NULL, *state = NULL; + struct bpf_iarray *succ; postorder = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT); state = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT); @@ -18016,11 +18423,11 @@ static int compute_postorder(struct bpf_verifier_env *env) stack_sz--; continue; } - succ_cnt = bpf_insn_successors(env->prog, top, succ); - for (s = 0; s < succ_cnt; ++s) { - if (!state[succ[s]]) { - stack[stack_sz++] = succ[s]; - state[succ[s]] |= DISCOVERED; + succ = bpf_insn_successors(env, top); + for (s = 0; s < succ->cnt; ++s) { + if (!state[succ->items[s]]) { + stack[stack_sz++] = succ->items[s]; + state[succ->items[s]] |= DISCOVERED; } } state[top] |= EXPLORED; @@ -18792,6 +19199,10 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno; case PTR_TO_ARENA: return true; + case PTR_TO_INSN: + return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 && + rold->off == rcur->off && range_within(rold, rcur) && + tnum_in(rold->var_off, rcur->var_off); default: return regs_exact(rold, rcur, idmap); } @@ -18972,7 +19383,7 @@ static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *c if (old->active_preempt_locks != cur->active_preempt_locks) return false; - if (old->active_rcu_lock != cur->active_rcu_lock) + if (old->active_rcu_locks != cur->active_rcu_locks) return false; if (!check_ids(old->active_irq_id, cur->active_irq_id, idmap)) @@ -19144,7 +19555,7 @@ static int propagate_precision(struct bpf_verifier_env *env, bt_set_frame_slot(&env->bt, fr, i); first = false; } - if (!first) + if (!first && (env->log.level & BPF_LOG_LEVEL2)) verbose(env, "\n"); } @@ -19784,9 +20195,6 @@ static int process_bpf_exit_full(struct bpf_verifier_env *env, return PROCESS_BPF_EXIT; if (env->cur_state->curframe) { - err = bpf_update_live_stack(env); - if (err) - return err; /* exit from nested function */ err = prepare_func_exit(env, &env->insn_idx); 
if (err) @@ -19801,6 +20209,99 @@ static int process_bpf_exit_full(struct bpf_verifier_env *env, return PROCESS_BPF_EXIT; } +static int indirect_jump_min_max_index(struct bpf_verifier_env *env, + int regno, + struct bpf_map *map, + u32 *pmin_index, u32 *pmax_index) +{ + struct bpf_reg_state *reg = reg_state(env, regno); + u64 min_index, max_index; + const u32 size = 8; + + if (check_add_overflow(reg->umin_value, reg->off, &min_index) || + (min_index > (u64) U32_MAX * size)) { + verbose(env, "the sum of R%u umin_value %llu and off %u is too big\n", + regno, reg->umin_value, reg->off); + return -ERANGE; + } + if (check_add_overflow(reg->umax_value, reg->off, &max_index) || + (max_index > (u64) U32_MAX * size)) { + verbose(env, "the sum of R%u umax_value %llu and off %u is too big\n", + regno, reg->umax_value, reg->off); + return -ERANGE; + } + + min_index /= size; + max_index /= size; + + if (max_index >= map->max_entries) { + verbose(env, "R%u points to outside of jump table: [%llu,%llu] max_entries %u\n", + regno, min_index, max_index, map->max_entries); + return -EINVAL; + } + + *pmin_index = min_index; + *pmax_index = max_index; + return 0; +} + +/* gotox *dst_reg */ +static int check_indirect_jump(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + struct bpf_verifier_state *other_branch; + struct bpf_reg_state *dst_reg; + struct bpf_map *map; + u32 min_index, max_index; + int err = 0; + int n; + int i; + + dst_reg = reg_state(env, insn->dst_reg); + if (dst_reg->type != PTR_TO_INSN) { + verbose(env, "R%d has type %s, expected PTR_TO_INSN\n", + insn->dst_reg, reg_type_str(env, dst_reg->type)); + return -EINVAL; + } + + map = dst_reg->map_ptr; + if (verifier_bug_if(!map, env, "R%d has an empty map pointer", insn->dst_reg)) + return -EFAULT; + + if (verifier_bug_if(map->map_type != BPF_MAP_TYPE_INSN_ARRAY, env, + "R%d has incorrect map type %d", insn->dst_reg, map->map_type)) + return -EFAULT; + + err = indirect_jump_min_max_index(env, insn->dst_reg, map, &min_index, &max_index); + if (err) + return err; + + /* Ensure that the buffer is large enough */ + if (!env->gotox_tmp_buf || env->gotox_tmp_buf->cnt < max_index - min_index + 1) { + env->gotox_tmp_buf = iarray_realloc(env->gotox_tmp_buf, + max_index - min_index + 1); + if (!env->gotox_tmp_buf) + return -ENOMEM; + } + + n = copy_insn_array_uniq(map, min_index, max_index, env->gotox_tmp_buf->items); + if (n < 0) + return n; + if (n == 0) { + verbose(env, "register R%d doesn't point to any offset in map id=%d\n", + insn->dst_reg, map->id); + return -EINVAL; + } + + for (i = 0; i < n - 1; i++) { + other_branch = push_stack(env, env->gotox_tmp_buf->items[i], + env->insn_idx, env->cur_state->speculative); + if (IS_ERR(other_branch)) + return PTR_ERR(other_branch); + } + env->insn_idx = env->gotox_tmp_buf->items[n-1]; + return 0; +} + static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state) { int err; @@ -19903,6 +20404,15 @@ static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state) mark_reg_scratched(env, BPF_REG_0); } else if (opcode == BPF_JA) { + if (BPF_SRC(insn->code) == BPF_X) { + if (insn->src_reg != BPF_REG_0 || + insn->imm != 0 || insn->off != 0) { + verbose(env, "BPF_JA|BPF_X uses reserved fields\n"); + return -EINVAL; + } + return check_indirect_jump(env, insn); + } + if (BPF_SRC(insn->code) != BPF_K || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0 || @@ -20419,6 +20929,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_QUEUE: case 
BPF_MAP_TYPE_STACK: case BPF_MAP_TYPE_ARENA: + case BPF_MAP_TYPE_INSN_ARRAY: break; default: verbose(env, @@ -20490,6 +21001,15 @@ static int __add_used_map(struct bpf_verifier_env *env, struct bpf_map *map) env->used_maps[env->used_map_cnt++] = map; + if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) { + err = bpf_insn_array_init(map, env->prog); + if (err) { + verbose(env, "Failed to properly initialize insn array\n"); + return err; + } + env->insn_array_maps[env->insn_array_map_cnt++] = map; + } + return env->used_map_cnt - 1; } @@ -20736,6 +21256,33 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len } } +static void release_insn_arrays(struct bpf_verifier_env *env) +{ + int i; + + for (i = 0; i < env->insn_array_map_cnt; i++) + bpf_insn_array_release(env->insn_array_maps[i]); +} + +static void adjust_insn_arrays(struct bpf_verifier_env *env, u32 off, u32 len) +{ + int i; + + if (len == 1) + return; + + for (i = 0; i < env->insn_array_map_cnt; i++) + bpf_insn_array_adjust(env->insn_array_maps[i], off, len); +} + +static void adjust_insn_arrays_after_remove(struct bpf_verifier_env *env, u32 off, u32 len) +{ + int i; + + for (i = 0; i < env->insn_array_map_cnt; i++) + bpf_insn_array_adjust_after_remove(env->insn_array_maps[i], off, len); +} + static void adjust_poke_descs(struct bpf_prog *prog, u32 off, u32 len) { struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab; @@ -20777,6 +21324,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of } adjust_insn_aux_data(env, new_prog, off, len); adjust_subprog_starts(env, off, len); + adjust_insn_arrays(env, off, len); adjust_poke_descs(new_prog, off, len); return new_prog; } @@ -20939,6 +21487,27 @@ static int bpf_adj_linfo_after_remove(struct bpf_verifier_env *env, u32 off, return 0; } +/* + * Clean up dynamically allocated fields of aux data for instructions [start, ...] 
+ */ +static void clear_insn_aux_data(struct bpf_verifier_env *env, int start, int len) +{ + struct bpf_insn_aux_data *aux_data = env->insn_aux_data; + struct bpf_insn *insns = env->prog->insnsi; + int end = start + len; + int i; + + for (i = start; i < end; i++) { + if (aux_data[i].jt) { + kvfree(aux_data[i].jt); + aux_data[i].jt = NULL; + } + + if (bpf_is_ldimm64(&insns[i])) + i++; + } +} + static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) { struct bpf_insn_aux_data *aux_data = env->insn_aux_data; @@ -20948,6 +21517,9 @@ static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) if (bpf_prog_is_offloaded(env->prog->aux)) bpf_prog_offload_remove_insns(env, off, cnt); + /* Should be called before bpf_remove_insns, as it uses prog->insnsi */ + clear_insn_aux_data(env, off, cnt); + err = bpf_remove_insns(env->prog, off, cnt); if (err) return err; @@ -20960,6 +21532,8 @@ static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) if (err) return err; + adjust_insn_arrays_after_remove(env, off, cnt); + memmove(aux_data + off, aux_data + off + cnt, sizeof(*aux_data) * (orig_prog_len - off - cnt)); @@ -21499,6 +22073,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) struct bpf_insn *insn; void *old_bpf_func; int err, num_exentries; + int old_len, subprog_start_adjustment = 0; if (env->subprog_cnt <= 1) return 0; @@ -21573,6 +22148,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->func_idx = i; /* Below members will be freed only at prog->aux */ func[i]->aux->btf = prog->aux->btf; + func[i]->aux->subprog_start = subprog_start + subprog_start_adjustment; func[i]->aux->func_info = prog->aux->func_info; func[i]->aux->func_info_cnt = prog->aux->func_info_cnt; func[i]->aux->poke_tab = prog->aux->poke_tab; @@ -21602,6 +22178,8 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->jited_linfo = prog->aux->jited_linfo; func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx; func[i]->aux->arena = prog->aux->arena; + func[i]->aux->used_maps = env->used_maps; + func[i]->aux->used_map_cnt = env->used_map_cnt; num_exentries = 0; insn = func[i]->insnsi; for (j = 0; j < func[i]->len; j++, insn++) { @@ -21626,7 +22204,15 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->might_sleep = env->subprog_info[i].might_sleep; if (!i) func[i]->aux->exception_boundary = env->seen_exception; + + /* + * To properly pass the absolute subprog start to jit + * all instruction adjustments should be accumulated + */ + old_len = func[i]->len; func[i] = bpf_int_jit_compile(func[i]); + subprog_start_adjustment += func[i]->len - old_len; + if (!func[i]->jited) { err = -ENOTSUPP; goto out_free; @@ -21679,6 +22265,15 @@ static int jit_subprogs(struct bpf_verifier_env *env) cond_resched(); } + /* + * Cleanup func[i]->aux fields which aren't required + * or can become invalid in future + */ + for (i = 0; i < env->subprog_cnt; i++) { + func[i]->aux->used_maps = NULL; + func[i]->aux->used_map_cnt = 0; + } + /* finally lock prog and jit images for all functions and * populate kallsysm. Begin at the first subprogram, since * bpf_prog_load will add the kallsyms for the main program. 
@@ -21808,46 +22403,47 @@ static int fixup_call_args(struct bpf_verifier_env *env) } /* replace a generic kfunc with a specialized version if necessary */ -static void specialize_kfunc(struct bpf_verifier_env *env, - u32 func_id, u16 offset, unsigned long *addr) +static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc, int insn_idx) { struct bpf_prog *prog = env->prog; bool seen_direct_write; void *xdp_kfunc; bool is_rdonly; + u32 func_id = desc->func_id; + u16 offset = desc->offset; + unsigned long addr = desc->addr; + + if (offset) /* return if module BTF is used */ + return 0; if (bpf_dev_bound_kfunc_id(func_id)) { xdp_kfunc = bpf_dev_bound_resolve_kfunc(prog, func_id); - if (xdp_kfunc) { - *addr = (unsigned long)xdp_kfunc; - return; - } + if (xdp_kfunc) + addr = (unsigned long)xdp_kfunc; /* fallback to default kfunc when not supported by netdev */ - } - - if (offset) - return; - - if (func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) { + } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) { seen_direct_write = env->seen_direct_write; is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE); if (is_rdonly) - *addr = (unsigned long)bpf_dynptr_from_skb_rdonly; + addr = (unsigned long)bpf_dynptr_from_skb_rdonly; /* restore env->seen_direct_write to its original value, since * may_access_direct_pkt_data mutates it */ env->seen_direct_write = seen_direct_write; + } else if (func_id == special_kfunc_list[KF_bpf_set_dentry_xattr]) { + if (bpf_lsm_has_d_inode_locked(prog)) + addr = (unsigned long)bpf_set_dentry_xattr_locked; + } else if (func_id == special_kfunc_list[KF_bpf_remove_dentry_xattr]) { + if (bpf_lsm_has_d_inode_locked(prog)) + addr = (unsigned long)bpf_remove_dentry_xattr_locked; + } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) { + if (!env->insn_aux_data[insn_idx].non_sleepable) + addr = (unsigned long)bpf_dynptr_from_file_sleepable; } - - if (func_id == special_kfunc_list[KF_bpf_set_dentry_xattr] && - bpf_lsm_has_d_inode_locked(prog)) - *addr = (unsigned long)bpf_set_dentry_xattr_locked; - - if (func_id == special_kfunc_list[KF_bpf_remove_dentry_xattr] && - bpf_lsm_has_d_inode_locked(prog)) - *addr = (unsigned long)bpf_remove_dentry_xattr_locked; + desc->addr = addr; + return 0; } static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux, @@ -21870,7 +22466,8 @@ static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux, static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_insn *insn_buf, int insn_idx, int *cnt) { - const struct bpf_kfunc_desc *desc; + struct bpf_kfunc_desc *desc; + int err; if (!insn->imm) { verbose(env, "invalid kernel function call not eliminated in verifier pass\n"); @@ -21890,6 +22487,10 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EFAULT; } + err = specialize_kfunc(env, desc, insn_idx); + if (err) + return err; + if (!bpf_jit_supports_far_kfunc_call()) insn->imm = BPF_CALL_IMM(desc->addr); if (insn->off) @@ -22485,8 +23086,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) } if (is_storage_get_function(insn->imm)) { - if (!in_sleepable(env) || - env->insn_aux_data[i + delta].storage_get_func_atomic) + if (env->insn_aux_data[i + delta].non_sleepable) insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC); else insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL); @@ -22919,7 +23519,9 @@ next_insn: } } - 
sort_kfunc_descs_by_imm_off(env->prog); + ret = sort_kfunc_descs_by_imm_off(env); + if (ret) + return ret; return 0; } @@ -23156,6 +23758,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) state->curframe = 0; state->speculative = false; state->branches = 1; + state->in_sleepable = env->prog->sleepable; state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL_ACCOUNT); if (!state->frame[0]) { kfree(state); @@ -23175,7 +23778,8 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) struct bpf_subprog_arg_info *arg; struct bpf_reg_state *reg; - verbose(env, "Validating %s() func#%d...\n", sub_name, subprog); + if (env->log.level & BPF_LOG_LEVEL) + verbose(env, "Validating %s() func#%d...\n", sub_name, subprog); ret = btf_prepare_func_args(env, subprog); if (ret) goto out; @@ -24278,14 +24882,13 @@ static int compute_live_registers(struct bpf_verifier_env *env) for (i = 0; i < env->cfg.cur_postorder; ++i) { int insn_idx = env->cfg.insn_postorder[i]; struct insn_live_regs *live = &state[insn_idx]; - int succ_num; - u32 succ[2]; + struct bpf_iarray *succ; u16 new_out = 0; u16 new_in = 0; - succ_num = bpf_insn_successors(env->prog, insn_idx, succ); - for (int s = 0; s < succ_num; ++s) - new_out |= state[succ[s]].in; + succ = bpf_insn_successors(env, insn_idx); + for (int s = 0; s < succ->cnt; ++s) + new_out |= state[succ->items[s]].in; new_in = (new_out & ~live->def) | live->use; if (new_out != live->out || new_in != live->in) { live->in = new_in; @@ -24338,11 +24941,11 @@ static int compute_scc(struct bpf_verifier_env *env) const u32 insn_cnt = env->prog->len; int stack_sz, dfs_sz, err = 0; u32 *stack, *pre, *low, *dfs; - u32 succ_cnt, i, j, t, w; + u32 i, j, t, w; u32 next_preorder_num; u32 next_scc_id; bool assign_scc; - u32 succ[2]; + struct bpf_iarray *succ; next_preorder_num = 1; next_scc_id = 1; @@ -24449,12 +25052,12 @@ dfs_continue: stack[stack_sz++] = w; } /* Visit 'w' successors */ - succ_cnt = bpf_insn_successors(env->prog, w, succ); - for (j = 0; j < succ_cnt; ++j) { - if (pre[succ[j]]) { - low[w] = min(low[w], low[succ[j]]); + succ = bpf_insn_successors(env, w); + for (j = 0; j < succ->cnt; ++j) { + if (pre[succ->items[j]]) { + low[w] = min(low[w], low[succ->items[j]]); } else { - dfs[dfs_sz++] = succ[j]; + dfs[dfs_sz++] = succ->items[j]; goto dfs_continue; } } @@ -24471,8 +25074,8 @@ dfs_continue: * or if component has a self reference. */ assign_scc = stack[stack_sz - 1] != w; - for (j = 0; j < succ_cnt; ++j) { - if (succ[j] == w) { + for (j = 0; j < succ->cnt; ++j) { + if (succ->items[j] == w) { assign_scc = true; break; } @@ -24534,6 +25137,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 goto err_free_env; for (i = 0; i < len; i++) env->insn_aux_data[i].orig_idx = i; + env->succ = iarray_realloc(NULL, 2); + if (!env->succ) + goto err_free_env; env->prog = *prog; env->ops = bpf_verifier_ops[env->prog->type]; @@ -24757,6 +25363,8 @@ skip_full_check: adjust_btf_func(env); err_release_maps: + if (ret) + release_insn_arrays(env); if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release * them now. Otherwise free_used_maps() will release them. 
@@ -24777,11 +25385,14 @@ err_release_maps: err_unlock: if (!is_priv) mutex_unlock(&bpf_verifier_lock); + clear_insn_aux_data(env, 0, env->prog->len); vfree(env->insn_aux_data); err_free_env: bpf_stack_liveness_free(env); kvfree(env->cfg.insn_postorder); kvfree(env->scc_info); + kvfree(env->succ); + kvfree(env->gotox_tmp_buf); kvfree(env); return ret; } diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d2c79da81e4f8e..4661b9e606e000 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -80,6 +80,12 @@ config HAVE_DYNAMIC_FTRACE_NO_PATCHABLE If the architecture generates __patchable_function_entries sections but does not want them included in the ftrace locations. +config HAVE_DYNAMIC_FTRACE_WITH_JMP + bool + help + If the architecture supports to replace the __fentry__ with a + "jmp" instruction. + config HAVE_SYSCALL_TRACEPOINTS bool help @@ -330,6 +336,12 @@ config DYNAMIC_FTRACE_WITH_ARGS depends on DYNAMIC_FTRACE depends on HAVE_DYNAMIC_FTRACE_WITH_ARGS +config DYNAMIC_FTRACE_WITH_JMP + def_bool y + depends on DYNAMIC_FTRACE + depends on DYNAMIC_FTRACE_WITH_DIRECT_CALLS + depends on HAVE_DYNAMIC_FTRACE_WITH_JMP + config FPROBE bool "Kernel Function Probe (fprobe)" depends on HAVE_FUNCTION_GRAPH_FREGS && HAVE_FTRACE_GRAPH_FUNC diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4f87c16d915a02..d57727abaade7f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2529,7 +2529,7 @@ static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx) return run_ctx->entry_ip; } -static int +static __always_inline int kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, unsigned long entry_ip, struct ftrace_regs *fregs, bool is_return, void *data) @@ -3372,13 +3372,13 @@ typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struc * direct calls into all the specific callback implementations * (copy_user_data_sleepable, copy_user_data_nofault, and so on) */ -static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 doff, u32 size, +static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u64 doff, u64 size, const void *unsafe_src, copy_fn_t str_copy_fn, struct task_struct *tsk) { struct bpf_dynptr_kern *dst; - u32 chunk_sz, off; + u64 chunk_sz, off; void *dst_slice; int cnt, err; char buf[256]; @@ -3392,7 +3392,7 @@ static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 do return -E2BIG; for (off = 0; off < size; off += chunk_sz - 1) { - chunk_sz = min_t(u32, sizeof(buf), size - off); + chunk_sz = min_t(u64, sizeof(buf), size - off); /* Expect str_copy_fn to return count of copied bytes, including * zero terminator. Next iteration increment off by chunk_sz - 1 to * overwrite NUL. 
@@ -3409,14 +3409,14 @@ static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 do return off; } -static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u32 doff, - u32 size, const void *unsafe_src, +static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u64 doff, + u64 size, const void *unsafe_src, copy_fn_t copy_fn, struct task_struct *tsk) { struct bpf_dynptr_kern *dst; void *dst_slice; char buf[256]; - u32 off, chunk_sz; + u64 off, chunk_sz; int err; dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size); @@ -3428,7 +3428,7 @@ static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u32 return -E2BIG; for (off = 0; off < size; off += chunk_sz) { - chunk_sz = min_t(u32, sizeof(buf), size - off); + chunk_sz = min_t(u64, sizeof(buf), size - off); err = copy_fn(buf, unsafe_src + off, chunk_sz, tsk); if (err) return err; @@ -3514,58 +3514,58 @@ __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid return bpf_send_signal_common(sig, type, task, value); } -__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void __user *unsafe_ptr__ign) +__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, copy_user_data_nofault, NULL); } -__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void *unsafe_ptr__ign) +__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, unsafe_ptr__ign, copy_kernel_data_nofault, NULL); } -__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void __user *unsafe_ptr__ign) +__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, copy_user_str_nofault, NULL); } -__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void *unsafe_ptr__ign) +__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, unsafe_ptr__ign, copy_kernel_str_nofault, NULL); } -__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void __user *unsafe_ptr__ign) +__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, copy_user_data_sleepable, NULL); } -__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void __user *unsafe_ptr__ign) +__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, copy_user_str_sleepable, NULL); } -__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void __user *unsafe_ptr__ign, +__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign, struct task_struct *tsk) { return __bpf_dynptr_copy(dptr, off, 
size, (const void *)unsafe_ptr__ign, copy_user_data_sleepable, tsk); } -__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void __user *unsafe_ptr__ign, +__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign, struct task_struct *tsk) { return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 59cfacb8a5bbdc..bbb37c0f8c6ce1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5951,7 +5951,8 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long for (i = 0; i < size; i++) { hlist_for_each_entry(entry, &hash->buckets[i], hlist) { del = __ftrace_lookup_ip(direct_functions, entry->ip); - if (del && del->direct == addr) { + if (del && ftrace_jmp_get(del->direct) == + ftrace_jmp_get(addr)) { remove_hash_entry(direct_functions, del); kfree(del); } @@ -6016,8 +6017,15 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) if (ftrace_hash_empty(hash)) return -EINVAL; + /* This is a "raw" address, and this should never happen. */ + if (WARN_ON_ONCE(ftrace_is_jmp(addr))) + return -EINVAL; + mutex_lock(&direct_mutex); + if (ops->flags & FTRACE_OPS_FL_JMP) + addr = ftrace_jmp_set(addr); + /* Make sure requested entries are not already registered.. */ size = 1 << hash->size_bits; for (i = 0; i < size; i++) { @@ -6138,6 +6146,13 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) lockdep_assert_held_once(&direct_mutex); + /* This is a "raw" address, and this should never happen. */ + if (WARN_ON_ONCE(ftrace_is_jmp(addr))) + return -EINVAL; + + if (ops->flags & FTRACE_OPS_FL_JMP) + addr = ftrace_jmp_set(addr); + /* Enable the tmp_ops to have the same functions as the direct ops */ ftrace_ops_init(&tmp_ops); tmp_ops.func_hash = ops->func_hash; diff --git a/lib/buildid.c b/lib/buildid.c index c4b0f376fb3410..aaf61dfc091958 100644 --- a/lib/buildid.c +++ b/lib/buildid.c @@ -11,27 +11,8 @@ #define MAX_PHDR_CNT 256 -struct freader { - void *buf; - u32 buf_sz; - int err; - union { - struct { - struct file *file; - struct folio *folio; - void *addr; - loff_t folio_off; - bool may_fault; - }; - struct { - const char *data; - u64 data_sz; - }; - }; -}; - -static void freader_init_from_file(struct freader *r, void *buf, u32 buf_sz, - struct file *file, bool may_fault) +void freader_init_from_file(struct freader *r, void *buf, u32 buf_sz, + struct file *file, bool may_fault) { memset(r, 0, sizeof(*r)); r->buf = buf; @@ -40,7 +21,7 @@ static void freader_init_from_file(struct freader *r, void *buf, u32 buf_sz, r->may_fault = may_fault; } -static void freader_init_from_mem(struct freader *r, const char *data, u64 data_sz) +void freader_init_from_mem(struct freader *r, const char *data, u64 data_sz) { memset(r, 0, sizeof(*r)); r->data = data; @@ -92,7 +73,7 @@ static int freader_get_folio(struct freader *r, loff_t file_off) return 0; } -static const void *freader_fetch(struct freader *r, loff_t file_off, size_t sz) +const void *freader_fetch(struct freader *r, loff_t file_off, size_t sz) { size_t folio_sz; @@ -127,18 +108,21 @@ static const void *freader_fetch(struct freader *r, loff_t file_off, size_t sz) */ folio_sz = folio_size(r->folio); if (file_off + sz > r->folio_off + folio_sz) { - int part_sz = r->folio_off + folio_sz - file_off; - - /* copy the part that resides in the current folio */ - memcpy(r->buf, r->addr + (file_off 
- r->folio_off), part_sz); - - /* fetch next folio */ - r->err = freader_get_folio(r, r->folio_off + folio_sz); - if (r->err) - return NULL; - - /* copy the rest of requested data */ - memcpy(r->buf + part_sz, r->addr, sz - part_sz); + u64 part_sz = r->folio_off + folio_sz - file_off, off; + + memcpy(r->buf, r->addr + file_off - r->folio_off, part_sz); + off = part_sz; + + while (off < sz) { + /* fetch next folio */ + r->err = freader_get_folio(r, r->folio_off + folio_sz); + if (r->err) + return NULL; + folio_sz = folio_size(r->folio); + part_sz = min_t(u64, sz - off, folio_sz); + memcpy(r->buf + off, r->addr, part_sz); + off += part_sz; + } return r->buf; } @@ -147,7 +131,7 @@ static const void *freader_fetch(struct freader *r, loff_t file_off, size_t sz) return r->addr + (file_off - r->folio_off); } -static void freader_cleanup(struct freader *r) +void freader_cleanup(struct freader *r) { if (!r->buf) return; /* non-file-backed mode */ diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 8b7d0b90fea76b..655efac6f13343 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -436,7 +436,7 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, static int bpf_test_finish(const union bpf_attr *kattr, union bpf_attr __user *uattr, const void *data, - struct skb_shared_info *sinfo, u32 size, + struct skb_shared_info *sinfo, u32 size, u32 frag_size, u32 retval, u32 duration) { void __user *data_out = u64_to_user_ptr(kattr->test.data_out); @@ -453,7 +453,7 @@ static int bpf_test_finish(const union bpf_attr *kattr, } if (data_out) { - int len = sinfo ? copy_size - sinfo->xdp_frags_size : copy_size; + int len = sinfo ? copy_size - frag_size : copy_size; if (len < 0) { err = -ENOSPC; @@ -899,6 +899,12 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) /* cb is allowed */ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, cb), + offsetof(struct __sk_buff, data_end))) + return -EINVAL; + + /* data_end is allowed, but not copied to skb */ + + if (!range_is_zero(__skb, offsetofend(struct __sk_buff, data_end), offsetof(struct __sk_buff, tstamp))) return -EINVAL; @@ -939,6 +945,11 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) if (__skb->gso_segs > GSO_MAX_SEGS) return -EINVAL; + + /* Currently GSO type is zero/unset. If this gets extended with + * a small list of accepted GSO types in future, the filter for + * an unset GSO type in bpf_clone_redirect() can be lifted. 
+ */ skb_shinfo(skb)->gso_segs = __skb->gso_segs; skb_shinfo(skb)->gso_size = __skb->gso_size; skb_shinfo(skb)->hwtstamps.hwtstamp = __skb->hwtstamp; @@ -973,46 +984,39 @@ static struct proto bpf_dummy_proto = { int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { - bool is_l2 = false, is_direct_pkt_access = false; + bool is_l2 = false, is_direct_pkt_access = false, is_lwt = false; + u32 tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); struct net *net = current->nsproxy->net_ns; struct net_device *dev = net->loopback_dev; - u32 size = kattr->test.data_size_in; + u32 headroom = NET_SKB_PAD + NET_IP_ALIGN; + u32 linear_sz = kattr->test.data_size_in; u32 repeat = kattr->test.repeat; struct __sk_buff *ctx = NULL; + struct sk_buff *skb = NULL; + struct sock *sk = NULL; u32 retval, duration; int hh_len = ETH_HLEN; - struct sk_buff *skb; - struct sock *sk; - void *data; + void *data = NULL; int ret; if ((kattr->test.flags & ~BPF_F_TEST_SKB_CHECKSUM_COMPLETE) || kattr->test.cpu || kattr->test.batch_size) return -EINVAL; - if (size < ETH_HLEN) + if (kattr->test.data_size_in < ETH_HLEN) return -EINVAL; - data = bpf_test_init(kattr, kattr->test.data_size_in, - size, NET_SKB_PAD + NET_IP_ALIGN, - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); - if (IS_ERR(data)) - return PTR_ERR(data); - - ctx = bpf_ctx_init(kattr, sizeof(struct __sk_buff)); - if (IS_ERR(ctx)) { - kfree(data); - return PTR_ERR(ctx); - } - switch (prog->type) { case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: + is_direct_pkt_access = true; is_l2 = true; - fallthrough; + break; case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: case BPF_PROG_TYPE_LWT_XMIT: + is_lwt = true; + fallthrough; case BPF_PROG_TYPE_CGROUP_SKB: is_direct_pkt_access = true; break; @@ -1020,25 +1024,88 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, break; } + ctx = bpf_ctx_init(kattr, sizeof(struct __sk_buff)); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + if (ctx) { + if (ctx->data_end > kattr->test.data_size_in || ctx->data || ctx->data_meta) { + ret = -EINVAL; + goto out; + } + if (ctx->data_end) { + /* Non-linear LWT test_run is unsupported for now. 
*/ + if (is_lwt) { + ret = -EINVAL; + goto out; + } + linear_sz = max(ETH_HLEN, ctx->data_end); + } + } + + linear_sz = min_t(u32, linear_sz, PAGE_SIZE - headroom - tailroom); + + data = bpf_test_init(kattr, linear_sz, linear_sz, headroom, tailroom); + if (IS_ERR(data)) { + ret = PTR_ERR(data); + data = NULL; + goto out; + } + sk = sk_alloc(net, AF_UNSPEC, GFP_USER, &bpf_dummy_proto, 1); if (!sk) { - kfree(data); - kfree(ctx); - return -ENOMEM; + ret = -ENOMEM; + goto out; } sock_init_data(NULL, sk); skb = slab_build_skb(data); if (!skb) { - kfree(data); - kfree(ctx); - sk_free(sk); - return -ENOMEM; + ret = -ENOMEM; + goto out; } skb->sk = sk; + data = NULL; /* data released via kfree_skb */ + skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); - __skb_put(skb, size); + __skb_put(skb, linear_sz); + + if (unlikely(kattr->test.data_size_in > linear_sz)) { + void __user *data_in = u64_to_user_ptr(kattr->test.data_in); + struct skb_shared_info *sinfo = skb_shinfo(skb); + u32 copied = linear_sz; + + while (copied < kattr->test.data_size_in) { + struct page *page; + u32 data_len; + + if (sinfo->nr_frags == MAX_SKB_FRAGS) { + ret = -ENOMEM; + goto out; + } + + page = alloc_page(GFP_KERNEL); + if (!page) { + ret = -ENOMEM; + goto out; + } + + data_len = min_t(u32, kattr->test.data_size_in - copied, + PAGE_SIZE); + skb_fill_page_desc(skb, sinfo->nr_frags, page, 0, data_len); + + if (copy_from_user(page_address(page), data_in + copied, + data_len)) { + ret = -EFAULT; + goto out; + } + skb->data_len += data_len; + skb->truesize += PAGE_SIZE; + skb->len += data_len; + copied += data_len; + } + } if (ctx && ctx->ifindex > 1) { dev = dev_get_by_index(net, ctx->ifindex); @@ -1118,12 +1185,11 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, convert_skb_to___skb(skb, ctx); - size = skb->len; - /* bpf program can never convert linear skb to non-linear */ - if (WARN_ON_ONCE(skb_is_nonlinear(skb))) - size = skb_headlen(skb); - ret = bpf_test_finish(kattr, uattr, skb->data, NULL, size, retval, - duration); + if (skb_is_nonlinear(skb)) + /* bpf program can never convert linear skb to non-linear */ + WARN_ON_ONCE(linear_sz == kattr->test.data_size_in); + ret = bpf_test_finish(kattr, uattr, skb->data, skb_shinfo(skb), skb->len, + skb->data_len, retval, duration); if (!ret) ret = bpf_ctx_finish(kattr, uattr, ctx, sizeof(struct __sk_buff)); @@ -1131,7 +1197,9 @@ out: if (dev && dev != net->loopback_dev) dev_put(dev); kfree_skb(skb); - sk_free(sk); + kfree(data); + if (sk) + sk_free(sk); kfree(ctx); return ret; } @@ -1329,7 +1397,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, goto out; size = xdp.data_end - xdp.data_meta + sinfo->xdp_frags_size; - ret = bpf_test_finish(kattr, uattr, xdp.data_meta, sinfo, size, + ret = bpf_test_finish(kattr, uattr, xdp.data_meta, sinfo, size, sinfo->xdp_frags_size, retval, duration); if (!ret) ret = bpf_ctx_finish(kattr, uattr, ctx, @@ -1420,7 +1488,7 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, goto out; ret = bpf_test_finish(kattr, uattr, &flow_keys, NULL, - sizeof(flow_keys), retval, duration); + sizeof(flow_keys), 0, retval, duration); if (!ret) ret = bpf_ctx_finish(kattr, uattr, user_ctx, sizeof(struct bpf_flow_keys)); @@ -1521,7 +1589,7 @@ int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kat user_ctx->cookie = sock_gen_cookie(ctx.selected_sk); } - ret = bpf_test_finish(kattr, uattr, NULL, NULL, 0, retval, duration); + ret = bpf_test_finish(kattr, uattr, NULL, NULL, 0, 0, 
retval, duration); if (!ret) ret = bpf_ctx_finish(kattr, uattr, user_ctx, sizeof(*user_ctx)); @@ -1721,7 +1789,7 @@ int bpf_prog_test_run_nf(struct bpf_prog *prog, if (ret) goto out; - ret = bpf_test_finish(kattr, uattr, NULL, NULL, 0, retval, duration); + ret = bpf_test_finish(kattr, uattr, NULL, NULL, 0, 0, retval, duration); out: kfree(user_ctx); diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 2e538399757fea..850dd736ccd125 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -50,16 +50,14 @@ void bpf_sk_storage_free(struct sock *sk) { struct bpf_local_storage *sk_storage; - migrate_disable(); - rcu_read_lock(); + rcu_read_lock_dont_migrate(); sk_storage = rcu_dereference(sk->sk_bpf_storage); if (!sk_storage) goto out; bpf_local_storage_destroy(sk_storage); out: - rcu_read_unlock(); - migrate_enable(); + rcu_read_unlock_migrate(); } static void bpf_sk_storage_map_free(struct bpf_map *map) @@ -138,7 +136,7 @@ bpf_sk_storage_clone_elem(struct sock *newsk, { struct bpf_local_storage_elem *copy_selem; - copy_selem = bpf_selem_alloc(smap, newsk, NULL, true, false, GFP_ATOMIC); + copy_selem = bpf_selem_alloc(smap, newsk, NULL, false, GFP_ATOMIC); if (!copy_selem) return NULL; @@ -161,8 +159,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL); - migrate_disable(); - rcu_read_lock(); + rcu_read_lock_dont_migrate(); sk_storage = rcu_dereference(sk->sk_bpf_storage); if (!sk_storage || hlist_empty(&sk_storage->list)) @@ -199,7 +196,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) } else { ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC); if (ret) { - bpf_selem_free(copy_selem, smap, true); + bpf_selem_free(copy_selem, true); atomic_sub(smap->elem_size, &newsk->sk_omem_alloc); bpf_map_put(map); @@ -213,8 +210,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) } out: - rcu_read_unlock(); - migrate_enable(); + rcu_read_unlock_migrate(); /* In case of an error, don't free anything explicitly here, the * caller is responsible to call bpf_sk_storage_free. diff --git a/net/core/filter.c b/net/core/filter.c index fa06c5a08e22f1..df6ce85e48dcd6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2458,6 +2458,13 @@ BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL))) return -EINVAL; + /* BPF test infra's convert___skb_to_skb() can create type-less + * GSO packets. gso_features_check() will detect this as a bad + * offload. However, lets not leak them out in the first place. 
+ */ + if (unlikely(skb_is_gso(skb) && !skb_shinfo(skb)->gso_type)) + return -EBADMSG; + dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex); if (unlikely(!dev)) return -EINVAL; @@ -6422,9 +6429,12 @@ BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb, */ if (skb_is_gso(skb)) { ret = BPF_MTU_CHK_RET_SUCCESS; - if (flags & BPF_MTU_CHK_SEGS && - !skb_gso_validate_network_len(skb, mtu)) - ret = BPF_MTU_CHK_RET_SEGS_TOOBIG; + if (flags & BPF_MTU_CHK_SEGS) { + if (!skb_transport_header_was_set(skb)) + return -EINVAL; + if (!skb_gso_validate_network_len(skb, mtu)) + ret = BPF_MTU_CHK_RET_SEGS_TOOBIG; + } } out: *mtu_len = mtu; diff --git a/samples/bpf/do_hbm_test.sh b/samples/bpf/do_hbm_test.sh index 38e4599350db11..7f4f722787d5ad 100755 --- a/samples/bpf/do_hbm_test.sh +++ b/samples/bpf/do_hbm_test.sh @@ -112,7 +112,7 @@ function start_hbm () { processArgs () { for i in $args ; do case $i in - # Support for upcomming ingress rate limiting + # Support for upcoming ingress rate limiting #in) # support for upcoming ingress rate limiting # dir="-i" # dir_name="in" diff --git a/samples/bpf/hbm.c b/samples/bpf/hbm.c index bf66277115e2a9..fc88d4dbdf48f0 100644 --- a/samples/bpf/hbm.c +++ b/samples/bpf/hbm.c @@ -5,7 +5,7 @@ * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. * - * Example program for Host Bandwidth Managment + * Example program for Host Bandwidth Management * * This program loads a cgroup skb BPF program to enforce cgroup output * (egress) or input (ingress) bandwidth limits. @@ -24,7 +24,7 @@ * beyond the rate limit specified while there is available * bandwidth. Current implementation assumes there is only * NIC (eth0), but can be extended to support multiple NICs. - * Currrently only supported for egress. + * Currently only supported for egress. * -h Print this info * prog BPF program file name. Name defaults to hbm_out_kern.o */ diff --git a/samples/bpf/tcp_cong_kern.c b/samples/bpf/tcp_cong_kern.c index 2311fc9dde854c..339415eac477e8 100644 --- a/samples/bpf/tcp_cong_kern.c +++ b/samples/bpf/tcp_cong_kern.c @@ -5,7 +5,7 @@ * License as published by the Free Software Foundation. * * BPF program to set congestion control to dctcp when both hosts are - * in the same datacenter (as deteremined by IPv6 prefix). + * in the same datacenter (as determined by IPv6 prefix). * * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. 
*/ diff --git a/samples/bpf/tracex1.bpf.c b/samples/bpf/tracex1.bpf.c index 0ab39d76ff8f0a..ceedf0b1d4793f 100644 --- a/samples/bpf/tracex1.bpf.c +++ b/samples/bpf/tracex1.bpf.c @@ -20,7 +20,7 @@ SEC("kprobe.multi/__netif_receive_skb_core*") int bpf_prog1(struct pt_regs *ctx) { /* attaches to kprobe __netif_receive_skb_core, - * looks for packets on loobpack device and prints them + * looks for packets on loopback device and prints them * (wildcard is used for avoiding symbol mismatch due to optimization) */ char devname[IFNAMSIZ]; diff --git a/tools/bpf/Makefile b/tools/bpf/Makefile index 062bbd6cd048e9..fd2585af125266 100644 --- a/tools/bpf/Makefile +++ b/tools/bpf/Makefile @@ -32,7 +32,7 @@ FEATURE_TESTS = libbfd disassembler-four-args disassembler-init-styled FEATURE_DISPLAY = libbfd check_feat := 1 -NON_CHECK_FEAT_TARGETS := clean bpftool_clean runqslower_clean resolve_btfids_clean +NON_CHECK_FEAT_TARGETS := clean bpftool_clean resolve_btfids_clean ifdef MAKECMDGOALS ifeq ($(filter-out $(NON_CHECK_FEAT_TARGETS),$(MAKECMDGOALS)),) check_feat := 0 @@ -70,7 +70,7 @@ $(OUTPUT)%.lex.o: $(OUTPUT)%.lex.c PROGS = $(OUTPUT)bpf_jit_disasm $(OUTPUT)bpf_dbg $(OUTPUT)bpf_asm -all: $(PROGS) bpftool runqslower +all: $(PROGS) bpftool $(OUTPUT)bpf_jit_disasm: CFLAGS += -DPACKAGE='bpf_jit_disasm' $(OUTPUT)bpf_jit_disasm: $(OUTPUT)bpf_jit_disasm.o @@ -86,7 +86,7 @@ $(OUTPUT)bpf_exp.lex.c: $(OUTPUT)bpf_exp.yacc.c $(OUTPUT)bpf_exp.yacc.o: $(OUTPUT)bpf_exp.yacc.c $(OUTPUT)bpf_exp.lex.o: $(OUTPUT)bpf_exp.lex.c -clean: bpftool_clean runqslower_clean resolve_btfids_clean +clean: bpftool_clean resolve_btfids_clean $(call QUIET_CLEAN, bpf-progs) $(Q)$(RM) -r -- $(OUTPUT)*.o $(OUTPUT)bpf_jit_disasm $(OUTPUT)bpf_dbg \ $(OUTPUT)bpf_asm $(OUTPUT)bpf_exp.yacc.* $(OUTPUT)bpf_exp.lex.* @@ -112,12 +112,6 @@ bpftool_install: bpftool_clean: $(call descend,bpftool,clean) -runqslower: - $(call descend,runqslower) - -runqslower_clean: - $(call descend,runqslower,clean) - resolve_btfids: $(call descend,resolve_btfids) @@ -125,5 +119,4 @@ resolve_btfids_clean: $(call descend,resolve_btfids,clean) .PHONY: all install clean bpftool bpftool_install bpftool_clean \ - runqslower runqslower_clean \ resolve_btfids resolve_btfids_clean diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 252e4c538edb7d..1af3305ea2b2e3 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -55,7 +55,8 @@ MAP COMMANDS | | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash** | | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** | | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage** -| | **task_storage** | **bloom_filter** | **user_ringbuf** | **cgrp_storage** | **arena** } +| | **task_storage** | **bloom_filter** | **user_ringbuf** | **cgrp_storage** | **arena** +| | **insn_array** } DESCRIPTION =========== diff --git a/tools/bpf/bpftool/btf_dumper.c b/tools/bpf/bpftool/btf_dumper.c index ff12628593aecd..def297e879f453 100644 --- a/tools/bpf/bpftool/btf_dumper.c +++ b/tools/bpf/bpftool/btf_dumper.c @@ -590,7 +590,7 @@ static int btf_dumper_do_type(const struct btf_dumper *d, __u32 type_id, case BTF_KIND_DATASEC: return btf_dumper_datasec(d, type_id, data); default: - jsonw_printf(d->jw, "(unsupported-kind"); + jsonw_printf(d->jw, "(unsupported-kind)"); return -EINVAL; } } diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c 
index c9de44a45778b2..7ebf7dbcfba4ff 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -1477,7 +1477,8 @@ static int do_help(int argc, char **argv) " devmap | devmap_hash | sockmap | cpumap | xskmap | sockhash |\n" " cgroup_storage | reuseport_sockarray | percpu_cgroup_storage |\n" " queue | stack | sk_storage | struct_ops | ringbuf | inode_storage |\n" - " task_storage | bloom_filter | user_ringbuf | cgrp_storage | arena }\n" + " task_storage | bloom_filter | user_ringbuf | cgrp_storage | arena |\n" + " insn_array }\n" " " HELP_SPEC_OPTIONS " |\n" " {-f|--bpffs} | {-n|--nomount} }\n" "", diff --git a/tools/bpf/bpftool/sign.c b/tools/bpf/bpftool/sign.c index b34f74d210e9cd..f9b742f4bb104b 100644 --- a/tools/bpf/bpftool/sign.c +++ b/tools/bpf/bpftool/sign.c @@ -28,6 +28,12 @@ #define OPEN_SSL_ERR_BUF_LEN 256 +/* Use deprecated in 3.0 ERR_get_error_line_data for openssl < 3 */ +#if !defined(OPENSSL_VERSION_MAJOR) || (OPENSSL_VERSION_MAJOR < 3) +#define ERR_get_error_all(file, line, func, data, flags) \ + ERR_get_error_line_data(file, line, data, flags) +#endif + static void display_openssl_errors(int l) { char buf[OPEN_SSL_ERR_BUF_LEN]; diff --git a/tools/bpf/runqslower/.gitignore b/tools/bpf/runqslower/.gitignore deleted file mode 100644 index ffdb70230c8bc3..00000000000000 --- a/tools/bpf/runqslower/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -/.output diff --git a/tools/bpf/runqslower/Makefile b/tools/bpf/runqslower/Makefile deleted file mode 100644 index 78a436c4072e38..00000000000000 --- a/tools/bpf/runqslower/Makefile +++ /dev/null @@ -1,91 +0,0 @@ -# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) -include ../../scripts/Makefile.include - -OUTPUT ?= $(abspath .output)/ - -BPFTOOL_OUTPUT := $(OUTPUT)bpftool/ -DEFAULT_BPFTOOL := $(BPFTOOL_OUTPUT)bootstrap/bpftool -BPFTOOL ?= $(DEFAULT_BPFTOOL) -BPF_TARGET_ENDIAN ?= --target=bpf -LIBBPF_SRC := $(abspath ../../lib/bpf) -BPFOBJ_OUTPUT := $(OUTPUT)libbpf/ -BPFOBJ := $(BPFOBJ_OUTPUT)libbpf.a -BPF_DESTDIR := $(BPFOBJ_OUTPUT) -BPF_INCLUDE := $(BPF_DESTDIR)/include -INCLUDES := -I$(OUTPUT) -I$(BPF_INCLUDE) -I$(abspath ../../include/uapi) -CFLAGS := -g -Wall $(CLANG_CROSS_FLAGS) -CFLAGS += $(EXTRA_CFLAGS) -LDFLAGS += $(EXTRA_LDFLAGS) -LDLIBS += -lelf -lz - -# Try to detect best kernel BTF source -KERNEL_REL := $(shell uname -r) -VMLINUX_BTF_PATHS := $(if $(O),$(O)/vmlinux) \ - $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ - ../../../vmlinux /sys/kernel/btf/vmlinux \ - /boot/vmlinux-$(KERNEL_REL) -VMLINUX_BTF_PATH := $(or $(VMLINUX_BTF),$(firstword \ - $(wildcard $(VMLINUX_BTF_PATHS)))) - -ifneq ($(V),1) -MAKEFLAGS += --no-print-directory -submake_extras := feature_display=0 -endif - -.DELETE_ON_ERROR: - -.PHONY: all clean runqslower libbpf_hdrs -all: runqslower - -runqslower: $(OUTPUT)/runqslower - -clean: - $(call QUIET_CLEAN, runqslower) - $(Q)$(RM) -r $(BPFOBJ_OUTPUT) $(BPFTOOL_OUTPUT) - $(Q)$(RM) $(OUTPUT)*.o $(OUTPUT)*.d - $(Q)$(RM) $(OUTPUT)*.skel.h $(OUTPUT)vmlinux.h - $(Q)$(RM) $(OUTPUT)runqslower - $(Q)$(RM) -r .output - -libbpf_hdrs: $(BPFOBJ) - -$(OUTPUT)/runqslower: $(OUTPUT)/runqslower.o $(BPFOBJ) - $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $^ $(LDLIBS) -o $@ - -$(OUTPUT)/runqslower.o: runqslower.h $(OUTPUT)/runqslower.skel.h \ - $(OUTPUT)/runqslower.bpf.o | libbpf_hdrs - -$(OUTPUT)/runqslower.bpf.o: $(OUTPUT)/vmlinux.h runqslower.h | libbpf_hdrs - -$(OUTPUT)/%.skel.h: $(OUTPUT)/%.bpf.o | $(BPFTOOL) - $(QUIET_GEN)$(BPFTOOL) gen skeleton $< > $@ - -$(OUTPUT)/%.bpf.o: 
%.bpf.c $(BPFOBJ) | $(OUTPUT) - $(QUIET_GEN)$(CLANG) -g -O2 $(BPF_TARGET_ENDIAN) $(INCLUDES) \ - -c $(filter %.c,$^) -o $@ && \ - $(LLVM_STRIP) -g $@ - -$(OUTPUT)/%.o: %.c | $(OUTPUT) - $(QUIET_CC)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@ - -$(OUTPUT) $(BPFOBJ_OUTPUT) $(BPFTOOL_OUTPUT): - $(QUIET_MKDIR)mkdir -p $@ - -$(OUTPUT)/vmlinux.h: $(VMLINUX_BTF_PATH) | $(OUTPUT) $(BPFTOOL) -ifeq ($(VMLINUX_H),) - $(Q)if [ ! -e "$(VMLINUX_BTF_PATH)" ] ; then \ - echo "Couldn't find kernel BTF; set VMLINUX_BTF to" \ - "specify its location." >&2; \ - exit 1;\ - fi - $(QUIET_GEN)$(BPFTOOL) btf dump file $(VMLINUX_BTF_PATH) format c > $@ -else - $(Q)cp "$(VMLINUX_H)" $@ -endif - -$(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(BPFOBJ_OUTPUT) - $(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) OUTPUT=$(BPFOBJ_OUTPUT) \ - DESTDIR=$(BPFOBJ_OUTPUT) prefix= $(abspath $@) install_headers - -$(DEFAULT_BPFTOOL): | $(BPFTOOL_OUTPUT) - $(Q)$(MAKE) $(submake_extras) -C ../bpftool OUTPUT=$(BPFTOOL_OUTPUT) bootstrap diff --git a/tools/bpf/runqslower/runqslower.bpf.c b/tools/bpf/runqslower/runqslower.bpf.c deleted file mode 100644 index fced54a3adf642..00000000000000 --- a/tools/bpf/runqslower/runqslower.bpf.c +++ /dev/null @@ -1,106 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -// Copyright (c) 2019 Facebook -#include "vmlinux.h" -#include <bpf/bpf_helpers.h> -#include "runqslower.h" - -#define TASK_RUNNING 0 -#define BPF_F_CURRENT_CPU 0xffffffffULL - -const volatile __u64 min_us = 0; -const volatile pid_t targ_pid = 0; - -struct { - __uint(type, BPF_MAP_TYPE_TASK_STORAGE); - __uint(map_flags, BPF_F_NO_PREALLOC); - __type(key, int); - __type(value, u64); -} start SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); - __uint(key_size, sizeof(u32)); - __uint(value_size, sizeof(u32)); -} events SEC(".maps"); - -/* record enqueue timestamp */ -__always_inline -static int trace_enqueue(struct task_struct *t) -{ - u32 pid = t->pid; - u64 *ptr; - - if (!pid || (targ_pid && targ_pid != pid)) - return 0; - - ptr = bpf_task_storage_get(&start, t, 0, - BPF_LOCAL_STORAGE_GET_F_CREATE); - if (!ptr) - return 0; - - *ptr = bpf_ktime_get_ns(); - return 0; -} - -SEC("tp_btf/sched_wakeup") -int handle__sched_wakeup(u64 *ctx) -{ - /* TP_PROTO(struct task_struct *p) */ - struct task_struct *p = (void *)ctx[0]; - - return trace_enqueue(p); -} - -SEC("tp_btf/sched_wakeup_new") -int handle__sched_wakeup_new(u64 *ctx) -{ - /* TP_PROTO(struct task_struct *p) */ - struct task_struct *p = (void *)ctx[0]; - - return trace_enqueue(p); -} - -SEC("tp_btf/sched_switch") -int handle__sched_switch(u64 *ctx) -{ - /* TP_PROTO(bool preempt, struct task_struct *prev, - * struct task_struct *next) - */ - struct task_struct *prev = (struct task_struct *)ctx[1]; - struct task_struct *next = (struct task_struct *)ctx[2]; - struct runq_event event = {}; - u64 *tsp, delta_us; - u32 pid; - - /* ivcsw: treat like an enqueue event and store timestamp */ - if (prev->__state == TASK_RUNNING) - trace_enqueue(prev); - - pid = next->pid; - - /* For pid mismatch, save a bpf_task_storage_get */ - if (!pid || (targ_pid && targ_pid != pid)) - return 0; - - /* fetch timestamp and calculate delta */ - tsp = bpf_task_storage_get(&start, next, 0, 0); - if (!tsp) - return 0; /* missed enqueue */ - - delta_us = (bpf_ktime_get_ns() - *tsp) / 1000; - if (min_us && delta_us <= min_us) - return 0; - - event.pid = pid; - event.delta_us = delta_us; - bpf_get_current_comm(&event.task, sizeof(event.task)); - - /* output */ - 
bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, - &event, sizeof(event)); - - bpf_task_storage_delete(&start, next); - return 0; -} - -char LICENSE[] SEC("license") = "GPL"; diff --git a/tools/bpf/runqslower/runqslower.c b/tools/bpf/runqslower/runqslower.c deleted file mode 100644 index 83c5993a139a71..00000000000000 --- a/tools/bpf/runqslower/runqslower.c +++ /dev/null @@ -1,171 +0,0 @@ -// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) -// Copyright (c) 2019 Facebook -#include <argp.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <time.h> -#include <bpf/libbpf.h> -#include <bpf/bpf.h> -#include "runqslower.h" -#include "runqslower.skel.h" - -struct env { - pid_t pid; - __u64 min_us; - bool verbose; -} env = { - .min_us = 10000, -}; - -const char *argp_program_version = "runqslower 0.1"; -const char *argp_program_bug_address = "<bpf@vger.kernel.org>"; -const char argp_program_doc[] = -"runqslower Trace long process scheduling delays.\n" -" For Linux, uses eBPF, BPF CO-RE, libbpf, BTF.\n" -"\n" -"This script traces high scheduling delays between tasks being\n" -"ready to run and them running on CPU after that.\n" -"\n" -"USAGE: runqslower [-p PID] [min_us]\n" -"\n" -"EXAMPLES:\n" -" runqslower # trace run queue latency higher than 10000 us (default)\n" -" runqslower 1000 # trace run queue latency higher than 1000 us\n" -" runqslower -p 123 # trace pid 123 only\n"; - -static const struct argp_option opts[] = { - { "pid", 'p', "PID", 0, "Process PID to trace"}, - { "verbose", 'v', NULL, 0, "Verbose debug output" }, - {}, -}; - -static error_t parse_arg(int key, char *arg, struct argp_state *state) -{ - static int pos_args; - int pid; - long long min_us; - - switch (key) { - case 'v': - env.verbose = true; - break; - case 'p': - errno = 0; - pid = strtol(arg, NULL, 10); - if (errno || pid <= 0) { - fprintf(stderr, "Invalid PID: %s\n", arg); - argp_usage(state); - } - env.pid = pid; - break; - case ARGP_KEY_ARG: - if (pos_args++) { - fprintf(stderr, - "Unrecognized positional argument: %s\n", arg); - argp_usage(state); - } - errno = 0; - min_us = strtoll(arg, NULL, 10); - if (errno || min_us <= 0) { - fprintf(stderr, "Invalid delay (in us): %s\n", arg); - argp_usage(state); - } - env.min_us = min_us; - break; - default: - return ARGP_ERR_UNKNOWN; - } - return 0; -} - -int libbpf_print_fn(enum libbpf_print_level level, - const char *format, va_list args) -{ - if (level == LIBBPF_DEBUG && !env.verbose) - return 0; - return vfprintf(stderr, format, args); -} - -void handle_event(void *ctx, int cpu, void *data, __u32 data_sz) -{ - const struct runq_event *e = data; - struct tm *tm; - char ts[32]; - time_t t; - - time(&t); - tm = localtime(&t); - strftime(ts, sizeof(ts), "%H:%M:%S", tm); - printf("%-8s %-16s %-6d %14llu\n", ts, e->task, e->pid, e->delta_us); -} - -void handle_lost_events(void *ctx, int cpu, __u64 lost_cnt) -{ - printf("Lost %llu events on CPU #%d!\n", lost_cnt, cpu); -} - -int main(int argc, char **argv) -{ - static const struct argp argp = { - .options = opts, - .parser = parse_arg, - .doc = argp_program_doc, - }; - struct perf_buffer *pb = NULL; - struct runqslower_bpf *obj; - int err; - - err = argp_parse(&argp, argc, argv, 0, NULL, NULL); - if (err) - return err; - - libbpf_set_print(libbpf_print_fn); - - /* Use libbpf 1.0 API mode */ - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - - obj = runqslower_bpf__open(); - if (!obj) { - fprintf(stderr, "failed to open and/or load BPF object\n"); - return 1; - } - - /* initialize global data 
(filtering options) */ - obj->rodata->targ_pid = env.pid; - obj->rodata->min_us = env.min_us; - - err = runqslower_bpf__load(obj); - if (err) { - fprintf(stderr, "failed to load BPF object: %d\n", err); - goto cleanup; - } - - err = runqslower_bpf__attach(obj); - if (err) { - fprintf(stderr, "failed to attach BPF programs\n"); - goto cleanup; - } - - printf("Tracing run queue latency higher than %llu us\n", env.min_us); - printf("%-8s %-16s %-6s %14s\n", "TIME", "COMM", "PID", "LAT(us)"); - - pb = perf_buffer__new(bpf_map__fd(obj->maps.events), 64, - handle_event, handle_lost_events, NULL, NULL); - err = libbpf_get_error(pb); - if (err) { - pb = NULL; - fprintf(stderr, "failed to open perf buffer: %d\n", err); - goto cleanup; - } - - while ((err = perf_buffer__poll(pb, 100)) >= 0) - ; - printf("Error polling perf buffer: %d\n", err); - -cleanup: - perf_buffer__free(pb); - runqslower_bpf__destroy(obj); - - return err != 0; -} diff --git a/tools/bpf/runqslower/runqslower.h b/tools/bpf/runqslower/runqslower.h deleted file mode 100644 index 4f70f07200c23d..00000000000000 --- a/tools/bpf/runqslower/runqslower.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ -#ifndef __RUNQSLOWER_H -#define __RUNQSLOWER_H - -#define TASK_COMM_LEN 16 - -struct runq_event { - char task[TASK_COMM_LEN]; - __u64 delta_us; - pid_t pid; -}; - -#endif /* __RUNQSLOWER_H */ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 6829936d33f58e..f5713f59ac10a0 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1026,6 +1026,7 @@ enum bpf_map_type { BPF_MAP_TYPE_USER_RINGBUF, BPF_MAP_TYPE_CGRP_STORAGE, BPF_MAP_TYPE_ARENA, + BPF_MAP_TYPE_INSN_ARRAY, __MAX_BPF_MAP_TYPE }; @@ -1430,6 +1431,9 @@ enum { /* Do not translate kernel bpf_arena pointers to user pointers */ BPF_F_NO_USER_CONV = (1U << 18), + +/* Enable BPF ringbuf overwrite mode */ + BPF_F_RB_OVERWRITE = (1U << 19), }; /* Flags for BPF_PROG_QUERY. */ @@ -5618,7 +5622,7 @@ union bpf_attr { * Return * *sk* if casting is valid, or **NULL** otherwise. * - * long bpf_dynptr_from_mem(void *data, u32 size, u64 flags, struct bpf_dynptr *ptr) + * long bpf_dynptr_from_mem(void *data, u64 size, u64 flags, struct bpf_dynptr *ptr) * Description * Get a dynptr to local memory *data*. * @@ -5661,7 +5665,7 @@ union bpf_attr { * Return * Nothing. Always succeeds. * - * long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags) + * long bpf_dynptr_read(void *dst, u64 len, const struct bpf_dynptr *src, u64 offset, u64 flags) * Description * Read *len* bytes from *src* into *dst*, starting from *offset* * into *src*. @@ -5671,7 +5675,7 @@ union bpf_attr { * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if * *flags* is not 0. * - * long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) + * long bpf_dynptr_write(const struct bpf_dynptr *dst, u64 offset, void *src, u64 len, u64 flags) * Description * Write *len* bytes from *src* into *dst*, starting from *offset* * into *dst*. @@ -5692,7 +5696,7 @@ union bpf_attr { * is a read-only dynptr or if *flags* is not correct. For skb-type dynptrs, * other errors correspond to errors returned by **bpf_skb_store_bytes**\ (). * - * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len) + * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u64 offset, u64 len) * Description * Get a pointer to the underlying dynptr data. 
* @@ -6231,6 +6235,7 @@ enum { BPF_RB_RING_SIZE = 1, BPF_RB_CONS_POS = 2, BPF_RB_PROD_POS = 3, + BPF_RB_OVERWRITE_POS = 4, }; /* BPF ring buffer constants */ @@ -7645,4 +7650,24 @@ enum bpf_kfunc_flags { BPF_F_PAD_ZEROS = (1ULL << 0), }; +/* + * Values of a BPF_MAP_TYPE_INSN_ARRAY entry must be of this type. + * + * Before the map is used the orig_off field should point to an + * instruction inside the program being loaded. The other fields + * must be set to 0. + * + * After the program is loaded, the xlated_off will be adjusted + * by the verifier to point to the index of the original instruction + * in the xlated program. If the instruction is deleted, it will + * be set to (u32)-1. The jitted_off will be set to the corresponding + * offset in the jitted image of the program. + */ +struct bpf_insn_array_value { + __u32 orig_off; + __u32 xlated_off; + __u32 jitted_off; + __u32 :32; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 339b197972374f..b66f5fbfbbb291 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -154,7 +154,7 @@ int bump_rlimit_memlock(void) memlock_bumped = true; - /* zero memlock_rlim_max disables auto-bumping RLIMIT_MEMLOCK */ + /* zero memlock_rlim disables auto-bumping RLIMIT_MEMLOCK */ if (memlock_rlim == 0) return 0; diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 18907f0fcf9f01..84a4b0abc8be9d 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -1061,7 +1061,7 @@ static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf, b if (base_btf) { btf->base_btf = base_btf; btf->start_id = btf__type_cnt(base_btf); - btf->start_str_off = base_btf->hdr->str_len; + btf->start_str_off = base_btf->hdr->str_len + base_btf->start_str_off; } if (is_mmap) { @@ -3901,6 +3901,20 @@ err_out: return err; } +/* + * Calculate type signature hash of TYPEDEF, ignoring referenced type IDs, + * as referenced type IDs equivalence is established separately during type + * graph equivalence check algorithm. + */ +static long btf_hash_typedef(struct btf_type *t) +{ + long h; + + h = hash_combine(0, t->name_off); + h = hash_combine(h, t->info); + return h; +} + static long btf_hash_common(struct btf_type *t) { long h; @@ -3918,6 +3932,13 @@ static bool btf_equal_common(struct btf_type *t1, struct btf_type *t2) t1->size == t2->size; } +/* Check structural compatibility of two TYPEDEF. */ +static bool btf_equal_typedef(struct btf_type *t1, struct btf_type *t2) +{ + return t1->name_off == t2->name_off && + t1->info == t2->info; +} + /* Calculate type signature hash of INT or TAG. */ static long btf_hash_int_decl_tag(struct btf_type *t) { @@ -4844,13 +4865,30 @@ static void btf_dedup_merge_hypot_map(struct btf_dedup *d) } } +static inline long btf_hash_by_kind(struct btf_type *t, __u16 kind) +{ + if (kind == BTF_KIND_TYPEDEF) + return btf_hash_typedef(t); + else + return btf_hash_struct(t); +} + +static inline bool btf_equal_by_kind(struct btf_type *t1, struct btf_type *t2, __u16 kind) +{ + if (kind == BTF_KIND_TYPEDEF) + return btf_equal_typedef(t1, t2); + else + return btf_shallow_equal_struct(t1, t2); +} + /* - * Deduplicate struct/union types. + * Deduplicate struct/union and typedef types. * * For each struct/union type its type signature hash is calculated, taking * into account type's name, size, number, order and names of fields, but * ignoring type ID's referenced from fields, because they might not be deduped - * completely until after reference types deduplication phase. 
This type hash + * completely until after reference types deduplication phase. For each typedef + * type, the hash is computed based on the type's name and kind. This type hash + * is used to iterate over all potential canonical types, sharing same hash. * For each canonical candidate we check whether type graphs that they form * (through referenced types in fields and so on) are equivalent using algorithm @@ -4882,18 +4920,20 @@ static int btf_dedup_struct_type(struct btf_dedup *d, __u32 type_id) t = btf_type_by_id(d->btf, type_id); kind = btf_kind(t); - if (kind != BTF_KIND_STRUCT && kind != BTF_KIND_UNION) + if (kind != BTF_KIND_STRUCT && + kind != BTF_KIND_UNION && + kind != BTF_KIND_TYPEDEF) return 0; - h = btf_hash_struct(t); + h = btf_hash_by_kind(t, kind); for_each_dedup_cand(d, hash_entry, h) { __u32 cand_id = hash_entry->value; int eq; /* * Even though btf_dedup_is_equiv() checks for - * btf_shallow_equal_struct() internally when checking two - * structs (unions) for equivalence, we need to guard here + * btf_equal_by_kind() internally when checking two + * structs (unions) or typedefs for equivalence, we need to guard here * from picking matching FWD type as a dedup candidate. * This can happen due to hash collision. In such case just * relying on btf_dedup_is_equiv() would lead to potentially @@ -4901,7 +4941,7 @@ static int btf_dedup_struct_type(struct btf_dedup *d, __u32 type_id) * FWD and compatible STRUCT/UNION are considered equivalent. */ cand_type = btf_type_by_id(d->btf, cand_id); - if (!btf_shallow_equal_struct(t, cand_type)) + if (!btf_equal_by_kind(t, cand_type, kind)) continue; btf_dedup_clear_hypot_map(d); @@ -4939,18 +4979,18 @@ static int btf_dedup_struct_types(struct btf_dedup *d) /* * Deduplicate reference type. * - * Once all primitive and struct/union types got deduplicated, we can easily + * Once all primitive, struct/union and typedef types got deduplicated, we can easily * deduplicate all other (reference) BTF types. This is done in two steps: * * 1. Resolve all referenced type IDs into their canonical type IDs. This - * resolution can be done either immediately for primitive or struct/union types - * (because they were deduped in previous two phases) or recursively for + * resolution can be done either immediately for primitive, struct/union, and typedef + * types (because they were deduped in previous two phases) or recursively for * reference types. Recursion will always terminate at either primitive or - * struct/union type, at which point we can "unwind" chain of reference types - * one by one. There is no danger of encountering cycles because in C type - * system the only way to form type cycle is through struct/union, so any chain - * of reference types, even those taking part in a type cycle, will inevitably - * reach struct/union at some point. + * struct/union and typedef types, at which point we can "unwind" chain of reference + * types one by one. There is no danger of encountering cycles in C, as the only way to + * form a type cycle is through struct or union types. Go can form such cycles through + * typedef. Thus, any chain of reference types, even those taking part in a type cycle, + * will inevitably reach a struct/union or typedef type at some point. + * + * 2.
Once all referenced type IDs are resolved into canonical ones, BTF type * becomes "stable", in the sense that no further deduplication will cause @@ -4982,7 +5022,6 @@ static int btf_dedup_ref_type(struct btf_dedup *d, __u32 type_id) case BTF_KIND_VOLATILE: case BTF_KIND_RESTRICT: case BTF_KIND_PTR: - case BTF_KIND_TYPEDEF: case BTF_KIND_FUNC: case BTF_KIND_TYPE_TAG: ref_type_id = btf_dedup_ref_type(d, t->type); @@ -5818,7 +5857,7 @@ void btf_set_base_btf(struct btf *btf, const struct btf *base_btf) { btf->base_btf = (struct btf *)base_btf; btf->start_id = btf__type_cnt(base_btf); - btf->start_str_off = base_btf->hdr->str_len; + btf->start_str_off = base_btf->hdr->str_len + base_btf->start_str_off; } int btf__relocate(struct btf *btf, const struct btf *base_btf) diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index ccfd905f03dfe7..cc01494d62107e 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -94,6 +94,7 @@ LIBBPF_API struct btf *btf__new_empty(void); * @brief **btf__new_empty_split()** creates an unpopulated BTF object from an * ELF BTF section except with a base BTF on top of which split BTF should be * based + * @param base_btf base BTF object * @return new BTF object instance which has to be eventually freed with * **btf__free()** * @@ -115,6 +116,10 @@ LIBBPF_API struct btf *btf__new_empty_split(struct btf *base_btf); * When that split BTF is loaded against a (possibly changed) base, this * distilled base BTF will help update references to that (possibly changed) * base BTF. + * @param src_btf source split BTF object + * @param new_base_btf pointer to where the new base BTF object pointer will be stored + * @param new_split_btf pointer to where the new split BTF object pointer will be stored + * @return 0 on success; negative error code, otherwise * * Both the new split and its associated new base BTF must be freed by * the caller. @@ -264,6 +269,9 @@ LIBBPF_API int btf__dedup(struct btf *btf, const struct btf_dedup_opts *opts); * to base BTF kinds, and verify those references are compatible with * *base_btf*; if they are, *btf* is adjusted such that is re-parented to * *base_btf* and type ids and strings are adjusted to accommodate this. + * @param btf split BTF object to relocate + * @param base_btf base BTF object + * @return 0 on success; negative error code, otherwise * * If successful, 0 is returned and **btf** now has **base_btf** as its * base. 
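[Editor's aside — not part of the merged diff.] The libbpf.c hunk below wires LLVM-emitted jump tables into the BPF_MAP_TYPE_INSN_ARRAY map type declared in the bpf.h hunk above: each .jumptables entry becomes a struct bpf_insn_array_value whose orig_off is set before load and whose xlated_off/jitted_off the verifier fills in afterwards. As a rough user-space sketch of the same flow, limited to calls that actually appear in that hunk (bpf_map_create(), bpf_map_update_elem(), bpf_map_freeze()); the map name, entry count and instruction offsets are invented for illustration:

#include <unistd.h>
#include <bpf/bpf.h>
#include <linux/bpf.h>          /* struct bpf_insn_array_value (added above) */

/* Illustrative only: build a two-entry insn array by hand, the way
 * libbpf's create_jt_map() below does for a ".jumptables" symbol.
 */
static int make_insn_array_example(void)
{
	struct bpf_insn_array_value val = {}; /* fields other than orig_off must stay 0 */
	__u32 orig_offs[2] = { 5, 9 };        /* hypothetical instruction indices */
	__u32 key;
	int map_fd, err;

	map_fd = bpf_map_create(BPF_MAP_TYPE_INSN_ARRAY, "jt_example",
				sizeof(__u32), sizeof(val), 2, NULL);
	if (map_fd < 0)
		return map_fd;

	for (key = 0; key < 2; key++) {
		val.orig_off = orig_offs[key];
		err = bpf_map_update_elem(map_fd, &key, &val, 0);
		if (err)
			goto out_close;
	}

	/* create_jt_map() freezes the map before the program that uses it loads */
	err = bpf_map_freeze(map_fd);
	if (err)
		goto out_close;

	/* the fd is then referenced from a BPF_PSEUDO_MAP_VALUE ld_imm64 insn,
	 * as bpf_object__relocate_data() does for RELO_INSN_ARRAY below
	 */
	return map_fd;

out_close:
	close(map_fd);
	return err;
}

After the program is loaded, reading the entries back (e.g. with bpf_map_lookup_elem()) would show the verifier-adjusted xlated_off and jitted_off values described in the uapi comment above.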
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 85abc357da315b..3dc8a807881551 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -190,6 +190,7 @@ static const char * const map_type_name[] = { [BPF_MAP_TYPE_USER_RINGBUF] = "user_ringbuf", [BPF_MAP_TYPE_CGRP_STORAGE] = "cgrp_storage", [BPF_MAP_TYPE_ARENA] = "arena", + [BPF_MAP_TYPE_INSN_ARRAY] = "insn_array", }; static const char * const prog_type_name[] = { @@ -369,6 +370,7 @@ enum reloc_type { RELO_EXTERN_CALL, RELO_SUBPROG_ADDR, RELO_CORE, + RELO_INSN_ARRAY, }; struct reloc_desc { @@ -379,7 +381,16 @@ struct reloc_desc { struct { int map_idx; int sym_off; - int ext_idx; + /* + * The following two fields can be unionized, as the + * ext_idx field is used for extern symbols, and the + * sym_size is used for jump tables, which are never + * extern + */ + union { + int ext_idx; + int sym_size; + }; }; }; }; @@ -421,6 +432,11 @@ struct bpf_sec_def { libbpf_prog_attach_fn_t prog_attach_fn; }; +struct bpf_light_subprog { + __u32 sec_insn_off; + __u32 sub_insn_off; +}; + /* * bpf_prog should be a better name but it has been used in * linux/filter.h. @@ -494,6 +510,9 @@ struct bpf_program { __u32 line_info_cnt; __u32 prog_flags; __u8 hash[SHA256_DIGEST_LENGTH]; + + struct bpf_light_subprog *subprogs; + __u32 subprog_cnt; }; struct bpf_struct_ops { @@ -667,6 +686,7 @@ struct elf_state { int symbols_shndx; bool has_st_ops; int arena_data_shndx; + int jumptables_data_shndx; }; struct usdt_manager; @@ -738,6 +758,16 @@ struct bpf_object { void *arena_data; size_t arena_data_sz; + void *jumptables_data; + size_t jumptables_data_sz; + + struct { + struct bpf_program *prog; + int sym_off; + int fd; + } *jumptable_maps; + size_t jumptable_map_cnt; + struct kern_feature_cache *feat_cache; char *token_path; int token_fd; @@ -764,6 +794,7 @@ void bpf_program__unload(struct bpf_program *prog) zfree(&prog->func_info); zfree(&prog->line_info); + zfree(&prog->subprogs); } static void bpf_program__exit(struct bpf_program *prog) @@ -2996,7 +3027,7 @@ static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict, scn = elf_sec_by_idx(obj, obj->efile.btf_maps_shndx); data = elf_sec_data(obj, scn); - if (!scn || !data) { + if (!data) { pr_warn("elf: failed to get %s map definitions for %s\n", MAPS_ELF_SEC, obj->path); return -EINVAL; @@ -3942,6 +3973,13 @@ static int bpf_object__elf_collect(struct bpf_object *obj) } else if (strcmp(name, ARENA_SEC) == 0) { obj->efile.arena_data = data; obj->efile.arena_data_shndx = idx; + } else if (strcmp(name, JUMPTABLES_SEC) == 0) { + obj->jumptables_data = malloc(data->d_size); + if (!obj->jumptables_data) + return -ENOMEM; + memcpy(obj->jumptables_data, data->d_buf, data->d_size); + obj->jumptables_data_sz = data->d_size; + obj->efile.jumptables_data_shndx = idx; } else { pr_info("elf: skipping unrecognized data section(%d) %s\n", idx, name); @@ -4634,6 +4672,16 @@ static int bpf_program__record_reloc(struct bpf_program *prog, return 0; } + /* jump table data relocation */ + if (shdr_idx == obj->efile.jumptables_data_shndx) { + reloc_desc->type = RELO_INSN_ARRAY; + reloc_desc->insn_idx = insn_idx; + reloc_desc->map_idx = -1; + reloc_desc->sym_off = sym->st_value; + reloc_desc->sym_size = sym->st_size; + return 0; + } + /* generic map reference relocation */ if (type == LIBBPF_MAP_UNSPEC) { if (!bpf_object__shndx_is_maps(obj, shdr_idx)) { @@ -6144,6 +6192,157 @@ static void poison_kfunc_call(struct bpf_program *prog, int relo_idx, insn->imm = POISON_CALL_KFUNC_BASE + ext_idx; } 
+static int find_jt_map(struct bpf_object *obj, struct bpf_program *prog, int sym_off) +{ + size_t i; + + for (i = 0; i < obj->jumptable_map_cnt; i++) { + /* + * This might happen that same offset is used for two different + * programs (as jump tables can be the same). However, for + * different programs different maps should be created. + */ + if (obj->jumptable_maps[i].sym_off == sym_off && + obj->jumptable_maps[i].prog == prog) + return obj->jumptable_maps[i].fd; + } + + return -ENOENT; +} + +static int add_jt_map(struct bpf_object *obj, struct bpf_program *prog, int sym_off, int map_fd) +{ + size_t cnt = obj->jumptable_map_cnt; + size_t size = sizeof(obj->jumptable_maps[0]); + void *tmp; + + tmp = libbpf_reallocarray(obj->jumptable_maps, cnt + 1, size); + if (!tmp) + return -ENOMEM; + + obj->jumptable_maps = tmp; + obj->jumptable_maps[cnt].prog = prog; + obj->jumptable_maps[cnt].sym_off = sym_off; + obj->jumptable_maps[cnt].fd = map_fd; + obj->jumptable_map_cnt++; + + return 0; +} + +static int find_subprog_idx(struct bpf_program *prog, int insn_idx) +{ + int i; + + for (i = prog->subprog_cnt - 1; i >= 0; i--) { + if (insn_idx >= prog->subprogs[i].sub_insn_off) + return i; + } + + return -1; +} + +static int create_jt_map(struct bpf_object *obj, struct bpf_program *prog, struct reloc_desc *relo) +{ + const __u32 jt_entry_size = 8; + int sym_off = relo->sym_off; + int jt_size = relo->sym_size; + __u32 max_entries = jt_size / jt_entry_size; + __u32 value_size = sizeof(struct bpf_insn_array_value); + struct bpf_insn_array_value val = {}; + int subprog_idx; + int map_fd, err; + __u64 insn_off; + __u64 *jt; + __u32 i; + + map_fd = find_jt_map(obj, prog, sym_off); + if (map_fd >= 0) + return map_fd; + + if (sym_off % jt_entry_size) { + pr_warn("map '.jumptables': jumptable start %d should be multiple of %u\n", + sym_off, jt_entry_size); + return -EINVAL; + } + + if (jt_size % jt_entry_size) { + pr_warn("map '.jumptables': jumptable size %d should be multiple of %u\n", + jt_size, jt_entry_size); + return -EINVAL; + } + + map_fd = bpf_map_create(BPF_MAP_TYPE_INSN_ARRAY, ".jumptables", + 4, value_size, max_entries, NULL); + if (map_fd < 0) + return map_fd; + + if (!obj->jumptables_data) { + pr_warn("map '.jumptables': ELF file is missing jump table data\n"); + err = -EINVAL; + goto err_close; + } + if (sym_off + jt_size > obj->jumptables_data_sz) { + pr_warn("map '.jumptables': jumptables_data size is %zd, trying to access %d\n", + obj->jumptables_data_sz, sym_off + jt_size); + err = -EINVAL; + goto err_close; + } + + subprog_idx = -1; /* main program */ + if (relo->insn_idx < 0 || relo->insn_idx >= prog->insns_cnt) { + pr_warn("map '.jumptables': invalid instruction index %d\n", relo->insn_idx); + err = -EINVAL; + goto err_close; + } + if (prog->subprogs) + subprog_idx = find_subprog_idx(prog, relo->insn_idx); + + jt = (__u64 *)(obj->jumptables_data + sym_off); + for (i = 0; i < max_entries; i++) { + /* + * The offset should be made to be relative to the beginning of + * the main function, not the subfunction. + */ + insn_off = jt[i]/sizeof(struct bpf_insn); + if (subprog_idx >= 0) { + insn_off -= prog->subprogs[subprog_idx].sec_insn_off; + insn_off += prog->subprogs[subprog_idx].sub_insn_off; + } else { + insn_off -= prog->sec_insn_off; + } + + /* + * LLVM-generated jump tables contain u64 records, however + * should contain values that fit in u32. 
+ */ + if (insn_off > UINT32_MAX) { + pr_warn("map '.jumptables': invalid jump table value 0x%llx at offset %d\n", + (long long)jt[i], sym_off + i * jt_entry_size); + err = -EINVAL; + goto err_close; + } + + val.orig_off = insn_off; + err = bpf_map_update_elem(map_fd, &i, &val, 0); + if (err) + goto err_close; + } + + err = bpf_map_freeze(map_fd); + if (err) + goto err_close; + + err = add_jt_map(obj, prog, sym_off, map_fd); + if (err) + goto err_close; + + return map_fd; + +err_close: + close(map_fd); + return err; +} + /* Relocate data references within program code: * - map references; * - global variable references; @@ -6235,6 +6434,20 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog) case RELO_CORE: /* will be handled by bpf_program_record_relos() */ break; + case RELO_INSN_ARRAY: { + int map_fd; + + map_fd = create_jt_map(obj, prog, relo); + if (map_fd < 0) { + pr_warn("prog '%s': relo #%d: can't create jump table: sym_off %u\n", + prog->name, i, relo->sym_off); + return map_fd; + } + insn[0].src_reg = BPF_PSEUDO_MAP_VALUE; + insn->imm = map_fd; + insn->off = 0; + } + break; default: pr_warn("prog '%s': relo #%d: bad relo type %d\n", prog->name, i, relo->type); @@ -6432,36 +6645,62 @@ static int append_subprog_relos(struct bpf_program *main_prog, struct bpf_progra return 0; } +static int save_subprog_offsets(struct bpf_program *main_prog, struct bpf_program *subprog) +{ + size_t size = sizeof(main_prog->subprogs[0]); + int cnt = main_prog->subprog_cnt; + void *tmp; + + tmp = libbpf_reallocarray(main_prog->subprogs, cnt + 1, size); + if (!tmp) + return -ENOMEM; + + main_prog->subprogs = tmp; + main_prog->subprogs[cnt].sec_insn_off = subprog->sec_insn_off; + main_prog->subprogs[cnt].sub_insn_off = subprog->sub_insn_off; + main_prog->subprog_cnt++; + + return 0; +} + static int bpf_object__append_subprog_code(struct bpf_object *obj, struct bpf_program *main_prog, struct bpf_program *subprog) { - struct bpf_insn *insns; - size_t new_cnt; - int err; + struct bpf_insn *insns; + size_t new_cnt; + int err; - subprog->sub_insn_off = main_prog->insns_cnt; + subprog->sub_insn_off = main_prog->insns_cnt; - new_cnt = main_prog->insns_cnt + subprog->insns_cnt; - insns = libbpf_reallocarray(main_prog->insns, new_cnt, sizeof(*insns)); - if (!insns) { - pr_warn("prog '%s': failed to realloc prog code\n", main_prog->name); - return -ENOMEM; - } - main_prog->insns = insns; - main_prog->insns_cnt = new_cnt; + new_cnt = main_prog->insns_cnt + subprog->insns_cnt; + insns = libbpf_reallocarray(main_prog->insns, new_cnt, sizeof(*insns)); + if (!insns) { + pr_warn("prog '%s': failed to realloc prog code\n", main_prog->name); + return -ENOMEM; + } + main_prog->insns = insns; + main_prog->insns_cnt = new_cnt; - memcpy(main_prog->insns + subprog->sub_insn_off, subprog->insns, - subprog->insns_cnt * sizeof(*insns)); + memcpy(main_prog->insns + subprog->sub_insn_off, subprog->insns, + subprog->insns_cnt * sizeof(*insns)); - pr_debug("prog '%s': added %zu insns from sub-prog '%s'\n", - main_prog->name, subprog->insns_cnt, subprog->name); + pr_debug("prog '%s': added %zu insns from sub-prog '%s'\n", + main_prog->name, subprog->insns_cnt, subprog->name); - /* The subprog insns are now appended. Append its relos too. */ - err = append_subprog_relos(main_prog, subprog); - if (err) - return err; - return 0; + /* The subprog insns are now appended. Append its relos too. 
*/ + err = append_subprog_relos(main_prog, subprog); + if (err) + return err; + + err = save_subprog_offsets(main_prog, subprog); + if (err) { + pr_warn("prog '%s': failed to add subprog offsets: %s\n", + main_prog->name, errstr(err)); + return err; + } + + return 0; } static int @@ -9228,6 +9467,13 @@ void bpf_object__close(struct bpf_object *obj) zfree(&obj->arena_data); + zfree(&obj->jumptables_data); + obj->jumptables_data_sz = 0; + + for (i = 0; i < obj->jumptable_map_cnt; i++) + close(obj->jumptable_maps[i].fd); + zfree(&obj->jumptable_maps); + free(obj); } @@ -13854,8 +14100,8 @@ int bpf_program__set_attach_target(struct bpf_program *prog, return libbpf_err(-EINVAL); if (attach_prog_fd && !attach_func_name) { - /* remember attach_prog_fd and let bpf_program__load() find - * BTF ID during the program load + /* Store attach_prog_fd. The BTF ID will be resolved later during + * the normal object/program load phase. */ prog->attach_prog_fd = attach_prog_fd; return 0; diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 5118d0a90e243a..65e68e964b8989 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -448,7 +448,7 @@ LIBBPF_API int bpf_program__pin(struct bpf_program *prog, const char *path); /** * @brief **bpf_program__unpin()** unpins the BPF program from a file - * in the BPFFS specified by a path. This decrements the programs + * in the BPFFS specified by a path. This decrements program's in-kernel * reference count. * * The file pinning the BPF program can also be unlinked by a different @@ -481,14 +481,12 @@ LIBBPF_API int bpf_link__pin(struct bpf_link *link, const char *path); /** * @brief **bpf_link__unpin()** unpins the BPF link from a file - * in the BPFFS specified by a path. This decrements the links - * reference count. + * in the BPFFS. This decrements link's in-kernel reference count. * * The file pinning the BPF link can also be unlinked by a different * process in which case this function will return an error. * - * @param prog BPF program to unpin - * @param path file path to the pin in a BPF file system + * @param link BPF link to unpin * @return 0, on success; negative error code, otherwise */ LIBBPF_API int bpf_link__unpin(struct bpf_link *link); @@ -995,8 +993,13 @@ LIBBPF_API __u32 bpf_program__line_info_cnt(const struct bpf_program *prog); * - fentry/fexit/fmod_ret; * - lsm; * - freplace. - * @param prog BPF program to set the attach type for - * @param type attach type to set the BPF map to have + * @param prog BPF program to configure; must be not yet loaded. + * @param attach_prog_fd FD of target BPF program (for freplace/extension). + * If >0 and func name omitted, defers BTF ID resolution. + * @param attach_func_name Target function name. Used either with + * attach_prog_fd to find destination BTF type ID in that BPF program, or + * alone (no attach_prog_fd) to resolve kernel (vmlinux/module) BTF ID. + * Must be provided if attach_prog_fd is 0. * @return error code; or 0 if no error occurred. */ LIBBPF_API int @@ -1098,6 +1101,7 @@ LIBBPF_API __u32 bpf_map__value_size(const struct bpf_map *map); /** * @brief **bpf_map__set_value_size()** sets map value size. 
* @param map the BPF map instance + * @param size the new value size * @return 0, on success; negative error, otherwise * * There is a special case for maps with associated memory-mapped regions, like @@ -1202,7 +1206,7 @@ LIBBPF_API struct bpf_map *bpf_map__inner_map(struct bpf_map *map); * per-CPU values value size has to be aligned up to closest 8 bytes for * alignment reasons, so expected size is: `round_up(value_size, 8) * * libbpf_num_possible_cpus()`. - * @flags extra flags passed to kernel for this operation + * @param flags extra flags passed to kernel for this operation * @return 0, on success; negative error, otherwise * * **bpf_map__lookup_elem()** is high-level equivalent of @@ -1226,7 +1230,7 @@ LIBBPF_API int bpf_map__lookup_elem(const struct bpf_map *map, * per-CPU values value size has to be aligned up to closest 8 bytes for * alignment reasons, so expected size is: `round_up(value_size, 8) * * libbpf_num_possible_cpus()`. - * @flags extra flags passed to kernel for this operation + * @param flags extra flags passed to kernel for this operation * @return 0, on success; negative error, otherwise * * **bpf_map__update_elem()** is high-level equivalent of @@ -1242,7 +1246,7 @@ LIBBPF_API int bpf_map__update_elem(const struct bpf_map *map, * @param map BPF map to delete element from * @param key pointer to memory containing bytes of the key * @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size** - * @flags extra flags passed to kernel for this operation + * @param flags extra flags passed to kernel for this operation * @return 0, on success; negative error, otherwise * * **bpf_map__delete_elem()** is high-level equivalent of @@ -1265,7 +1269,7 @@ LIBBPF_API int bpf_map__delete_elem(const struct bpf_map *map, * per-CPU values value size has to be aligned up to closest 8 bytes for * alignment reasons, so expected size is: `round_up(value_size, 8) * * libbpf_num_possible_cpus()`. 
- * @flags extra flags passed to kernel for this operation + * @param flags extra flags passed to kernel for this operation * @return 0, on success; negative error, otherwise * * **bpf_map__lookup_and_delete_elem()** is high-level equivalent of @@ -1637,6 +1641,7 @@ struct perf_buffer_opts { * @param sample_cb function called on each received data record * @param lost_cb function called when record loss has occurred * @param ctx user-provided extra context passed into *sample_cb* and *lost_cb* + * @param opts optional parameters for the perf buffer, can be null * @return a new instance of struct perf_buffer on success, NULL on error with * *errno* containing an error code */ diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 35b2527bedecbc..fc59b21b51b5e5 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -74,6 +74,8 @@ #define ELF64_ST_VISIBILITY(o) ((o) & 0x03) #endif +#define JUMPTABLES_SEC ".jumptables" + #define BTF_INFO_ENC(kind, kind_flag, vlen) \ ((!!(kind_flag) << 31) | ((kind) << 24) | ((vlen) & BTF_MAX_VLEN)) #define BTF_TYPE_ENC(name, info, size_or_type) (name), (info), (size_or_type) diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index 9dfbe7750f564a..bccf4bb747e1d9 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -364,6 +364,10 @@ static int probe_map_create(enum bpf_map_type map_type) case BPF_MAP_TYPE_SOCKHASH: case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: break; + case BPF_MAP_TYPE_INSN_ARRAY: + key_size = sizeof(__u32); + value_size = sizeof(struct bpf_insn_array_value); + break; case BPF_MAP_TYPE_UNSPEC: default: return -EOPNOTSUPP; diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c index 56ae77047bc367..f4403e3cf99464 100644 --- a/tools/lib/bpf/linker.c +++ b/tools/lib/bpf/linker.c @@ -2025,6 +2025,9 @@ static int linker_append_elf_sym(struct bpf_linker *linker, struct src_obj *obj, obj->sym_map[src_sym_idx] = dst_sec->sec_sym_idx; return 0; } + + if (strcmp(src_sec->sec_name, JUMPTABLES_SEC) == 0) + goto add_sym; } if (sym_bind == STB_LOCAL) diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index be1ee7ba7ce031..19c1638e312abd 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -23,6 +23,7 @@ test_tcpnotify_user test_libbpf xdping test_cpp +test_progs_verification_cert *.d *.subskel.h *.skel.h @@ -32,7 +33,6 @@ test_cpp /cpuv4 /host-tools /tools -/runqslower /bench /veristat /sign-file diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index f00587d4ede68e..b7030a6e2e763b 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -46,6 +46,7 @@ endif CFLAGS += -g $(OPT_FLAGS) -rdynamic -std=gnu11 \ -Wall -Werror -fno-omit-frame-pointer \ + -Wno-unused-but-set-variable \ $(GENFLAGS) $(SAN_CFLAGS) $(LIBELF_CFLAGS) \ -I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ -I$(TOOLSINCDIR) -I$(TOOLSARCHINCDIR) -I$(APIDIR) -I$(OUTPUT) @@ -98,14 +99,11 @@ TEST_GEN_PROGS += test_progs-cpuv4 TEST_INST_SUBDIRS += cpuv4 endif -TEST_GEN_FILES = test_tc_edt.bpf.o TEST_FILES = xsk_prereqs.sh $(wildcard progs/btf_dump_test_case_*.c) # Order correspond to 'make run_tests' order TEST_PROGS := test_kmod.sh \ test_lirc_mode2.sh \ - test_tc_tunnel.sh \ - test_tc_edt.sh \ test_xdping.sh \ test_bpftool_build.sh \ test_bpftool.sh \ @@ -127,7 +125,6 @@ TEST_KMOD_TARGETS = $(addprefix 
$(OUTPUT)/,$(TEST_KMODS)) TEST_GEN_PROGS_EXTENDED = \ bench \ flow_dissector_load \ - runqslower \ test_cpp \ test_lirc_mode2_user \ veristat \ @@ -209,8 +206,6 @@ HOST_INCLUDE_DIR := $(INCLUDE_DIR) endif HOST_BPFOBJ := $(HOST_BUILD_DIR)/libbpf/libbpf.a RESOLVE_BTFIDS := $(HOST_BUILD_DIR)/resolve_btfids/resolve_btfids -RUNQSLOWER_OUTPUT := $(BUILD_DIR)/runqslower/ - VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \ $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ ../../../../vmlinux \ @@ -232,7 +227,7 @@ $(notdir $(TEST_GEN_PROGS) $(TEST_KMODS) \ MAKE_DIRS := $(sort $(BUILD_DIR)/libbpf $(HOST_BUILD_DIR)/libbpf \ $(BUILD_DIR)/bpftool $(HOST_BUILD_DIR)/bpftool \ $(HOST_BUILD_DIR)/resolve_btfids \ - $(RUNQSLOWER_OUTPUT) $(INCLUDE_DIR)) + $(INCLUDE_DIR)) $(MAKE_DIRS): $(call msg,MKDIR,,$@) $(Q)mkdir -p $@ @@ -304,17 +299,6 @@ TRUNNER_BPFTOOL := $(DEFAULT_BPFTOOL) USE_BOOTSTRAP := "bootstrap/" endif -$(OUTPUT)/runqslower: $(BPFOBJ) | $(DEFAULT_BPFTOOL) $(RUNQSLOWER_OUTPUT) - $(Q)$(MAKE) $(submake_extras) -C $(TOOLSDIR)/bpf/runqslower \ - OUTPUT=$(RUNQSLOWER_OUTPUT) VMLINUX_BTF=$(VMLINUX_BTF) \ - BPFTOOL_OUTPUT=$(HOST_BUILD_DIR)/bpftool/ \ - BPFOBJ_OUTPUT=$(BUILD_DIR)/libbpf/ \ - BPFOBJ=$(BPFOBJ) BPF_INCLUDE=$(INCLUDE_DIR) \ - BPF_TARGET_ENDIAN=$(BPF_TARGET_ENDIAN) \ - EXTRA_CFLAGS='-g $(OPT_FLAGS) $(SAN_CFLAGS) $(EXTRA_CFLAGS)' \ - EXTRA_LDFLAGS='$(SAN_LDFLAGS) $(EXTRA_LDFLAGS)' && \ - cp $(RUNQSLOWER_OUTPUT)runqslower $@ - TEST_GEN_PROGS_EXTENDED += $(TRUNNER_BPFTOOL) $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED): $(BPFOBJ) @@ -453,7 +437,9 @@ BPF_CFLAGS = -g -Wall -Werror -D__TARGET_ARCH_$(SRCARCH) $(MENDIAN) \ -I$(abspath $(OUTPUT)/../usr/include) \ -std=gnu11 \ -fno-strict-aliasing \ - -Wno-compare-distinct-pointer-types + -Wno-compare-distinct-pointer-types \ + -Wno-initializer-overrides \ + # # TODO: enable me -Wsign-compare CLANG_CFLAGS = $(CLANG_SYS_INCLUDES) @@ -498,7 +484,8 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h \ LSKELS := fexit_sleep.c trace_printk.c trace_vprintk.c map_ptr_kern.c \ core_kern.c core_kern_overflow.c test_ringbuf.c \ - test_ringbuf_n.c test_ringbuf_map_key.c test_ringbuf_write.c + test_ringbuf_n.c test_ringbuf_map_key.c test_ringbuf_write.c \ + test_ringbuf_overwrite.c LSKELS_SIGNED := fentry_test.c fexit_test.c atomics.c @@ -543,6 +530,8 @@ TRUNNER_TEST_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.test.o, \ $$(notdir $$(wildcard $(TRUNNER_TESTS_DIR)/*.c))) TRUNNER_EXTRA_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.o, \ $$(filter %.c,$(TRUNNER_EXTRA_SOURCES))) +TRUNNER_LIB_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.o, \ + $$(filter %.c,$(TRUNNER_LIB_SOURCES))) TRUNNER_EXTRA_HDRS := $$(filter %.h,$(TRUNNER_EXTRA_SOURCES)) TRUNNER_TESTS_HDR := $(TRUNNER_TESTS_DIR)/tests.h TRUNNER_BPF_SRCS := $$(notdir $$(wildcard $(TRUNNER_BPF_PROGS_DIR)/*.c)) @@ -686,6 +675,10 @@ $(TRUNNER_EXTRA_OBJS): $(TRUNNER_OUTPUT)/%.o: \ $$(call msg,EXT-OBJ,$(TRUNNER_BINARY),$$@) $(Q)$$(CC) $$(CFLAGS) -c $$< $$(LDLIBS) -o $$@ +$(TRUNNER_LIB_OBJS): $(TRUNNER_OUTPUT)/%.o:$(TOOLSDIR)/lib/%.c + $$(call msg,LIB-OBJ,$(TRUNNER_BINARY),$$@) + $(Q)$$(CC) $$(CFLAGS) -c $$< $$(LDLIBS) -o $$@ + # non-flavored in-srctree builds receive special treatment, in particular, we # do not need to copy extra resources (see e.g. 
test_btf_dump_case()) $(TRUNNER_BINARY)-extras: $(TRUNNER_EXTRA_FILES) | $(TRUNNER_OUTPUT) @@ -699,6 +692,7 @@ $(OUTPUT)/$(TRUNNER_BINARY): | $(TRUNNER_BPF_OBJS) $(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS) \ $(TRUNNER_EXTRA_OBJS) $$(BPFOBJ) \ + $(TRUNNER_LIB_OBJS) \ $(RESOLVE_BTFIDS) \ $(TRUNNER_BPFTOOL) \ $(OUTPUT)/veristat \ @@ -721,7 +715,8 @@ $(VERIFICATION_CERT) $(PRIVATE_KEY): $(VERIFY_SIG_SETUP) $(Q)$(VERIFY_SIG_SETUP) genkey $(BUILD_DIR) $(VERIFY_SIG_HDR): $(VERIFICATION_CERT) - $(Q)xxd -i -n test_progs_verification_cert $< > $@ + $(Q)ln -fs $< test_progs_verification_cert && \ + xxd -i test_progs_verification_cert > $@ # Define test_progs test runner. TRUNNER_TESTS_DIR := prog_tests @@ -745,6 +740,7 @@ TRUNNER_EXTRA_SOURCES := test_progs.c \ $(VERIFY_SIG_HDR) \ flow_dissector_load.h \ ip_check_defrag_frags.h +TRUNNER_LIB_SOURCES := find_bit.c TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read \ $(OUTPUT)/liburandom_read.so \ $(OUTPUT)/xdp_synproxy \ @@ -782,6 +778,7 @@ endif TRUNNER_TESTS_DIR := map_tests TRUNNER_BPF_PROGS_DIR := progs TRUNNER_EXTRA_SOURCES := test_maps.c +TRUNNER_LIB_SOURCES := TRUNNER_EXTRA_FILES := TRUNNER_BPF_BUILD_RULE := $$(error no BPF objects should be built) TRUNNER_BPF_CFLAGS := @@ -803,7 +800,7 @@ $(OUTPUT)/test_verifier: test_verifier.c verifier/tests.h $(BPFOBJ) | $(OUTPUT) $(Q)$(CC) $(CFLAGS) $(filter %.a %.o %.c,$^) $(LDLIBS) -o $@ # Include find_bit.c to compile xskxceiver. -EXTRA_SRC := $(TOOLSDIR)/lib/find_bit.c +EXTRA_SRC := $(TOOLSDIR)/lib/find_bit.c prog_tests/test_xsk.c prog_tests/test_xsk.h $(OUTPUT)/xskxceiver: $(EXTRA_SRC) xskxceiver.c xskxceiver.h $(OUTPUT)/network_helpers.o $(OUTPUT)/xsk.o $(OUTPUT)/xsk_xdp_progs.skel.h $(BPFOBJ) | $(OUTPUT) $(call msg,BINARY,,$@) $(Q)$(CC) $(CFLAGS) $(filter %.a %.o %.c,$^) $(LDLIBS) -o $@ @@ -893,7 +890,8 @@ EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ $(addprefix $(OUTPUT)/,*.o *.d *.skel.h *.lskel.h *.subskel.h \ no_alu32 cpuv4 bpf_gcc \ liburandom_read.so) \ - $(OUTPUT)/FEATURE-DUMP.selftests + $(OUTPUT)/FEATURE-DUMP.selftests \ + test_progs_verification_cert .PHONY: docs docs-clean diff --git a/tools/testing/selftests/bpf/benchs/bench_ringbufs.c b/tools/testing/selftests/bpf/benchs/bench_ringbufs.c index e1ee979e6acc6f..01bdce692799d0 100644 --- a/tools/testing/selftests/bpf/benchs/bench_ringbufs.c +++ b/tools/testing/selftests/bpf/benchs/bench_ringbufs.c @@ -19,6 +19,8 @@ static struct { int ringbuf_sz; /* per-ringbuf, in bytes */ bool ringbuf_use_output; /* use slower output API */ int perfbuf_sz; /* per-CPU size, in pages */ + bool overwrite; + bool bench_producer; } args = { .back2back = false, .batch_cnt = 500, @@ -27,6 +29,8 @@ static struct { .ringbuf_sz = 512 * 1024, .ringbuf_use_output = false, .perfbuf_sz = 128, + .overwrite = false, + .bench_producer = false, }; enum { @@ -35,6 +39,8 @@ enum { ARG_RB_BATCH_CNT = 2002, ARG_RB_SAMPLED = 2003, ARG_RB_SAMPLE_RATE = 2004, + ARG_RB_OVERWRITE = 2005, + ARG_RB_BENCH_PRODUCER = 2006, }; static const struct argp_option opts[] = { @@ -43,6 +49,8 @@ static const struct argp_option opts[] = { { "rb-batch-cnt", ARG_RB_BATCH_CNT, "CNT", 0, "Set BPF-side record batch count"}, { "rb-sampled", ARG_RB_SAMPLED, NULL, 0, "Notification sampling"}, { "rb-sample-rate", ARG_RB_SAMPLE_RATE, "RATE", 0, "Notification sample rate"}, + { "rb-overwrite", ARG_RB_OVERWRITE, NULL, 0, "Overwrite mode"}, + { "rb-bench-producer", ARG_RB_BENCH_PRODUCER, NULL, 0, "Benchmark producer"}, {}, }; @@ -72,6 +80,12 @@ static error_t parse_arg(int key, char *arg, struct 
argp_state *state) argp_usage(state); } break; + case ARG_RB_OVERWRITE: + args.overwrite = true; + break; + case ARG_RB_BENCH_PRODUCER: + args.bench_producer = true; + break; default: return ARGP_ERR_UNKNOWN; } @@ -95,8 +109,33 @@ static inline void bufs_trigger_batch(void) static void bufs_validate(void) { - if (env.consumer_cnt != 1) { - fprintf(stderr, "rb-libbpf benchmark needs one consumer!\n"); + if (args.bench_producer && strcmp(env.bench_name, "rb-libbpf")) { + fprintf(stderr, "--rb-bench-producer only works with rb-libbpf!\n"); + exit(1); + } + + if (args.overwrite && !args.bench_producer) { + fprintf(stderr, "overwrite mode only works with --rb-bench-producer for now!\n"); + exit(1); + } + + if (args.bench_producer && env.consumer_cnt != 0) { + fprintf(stderr, "no consumer is needed for --rb-bench-producer!\n"); + exit(1); + } + + if (args.bench_producer && args.back2back) { + fprintf(stderr, "back-to-back mode makes no sense for --rb-bench-producer!\n"); + exit(1); + } + + if (args.bench_producer && args.sampled) { + fprintf(stderr, "sampling mode makes no sense for --rb-bench-producer!\n"); + exit(1); + } + + if (!args.bench_producer && env.consumer_cnt != 1) { + fprintf(stderr, "benchmarks without --rb-bench-producer require exactly one consumer!\n"); exit(1); } @@ -128,12 +167,17 @@ static void ringbuf_libbpf_measure(struct bench_res *res) { struct ringbuf_libbpf_ctx *ctx = &ringbuf_libbpf_ctx; - res->hits = atomic_swap(&buf_hits.value, 0); + if (args.bench_producer) + res->hits = atomic_swap(&ctx->skel->bss->hits, 0); + else + res->hits = atomic_swap(&buf_hits.value, 0); res->drops = atomic_swap(&ctx->skel->bss->dropped, 0); } static struct ringbuf_bench *ringbuf_setup_skeleton(void) { + __u32 flags; + struct bpf_map *ringbuf; struct ringbuf_bench *skel; setup_libbpf(); @@ -146,12 +190,19 @@ static struct ringbuf_bench *ringbuf_setup_skeleton(void) skel->rodata->batch_cnt = args.batch_cnt; skel->rodata->use_output = args.ringbuf_use_output ? 
1 : 0; + skel->rodata->bench_producer = args.bench_producer; if (args.sampled) /* record data + header take 16 bytes */ skel->rodata->wakeup_data_size = args.sample_rate * 16; - bpf_map__set_max_entries(skel->maps.ringbuf, args.ringbuf_sz); + ringbuf = skel->maps.ringbuf; + if (args.overwrite) { + flags = bpf_map__map_flags(ringbuf) | BPF_F_RB_OVERWRITE; + bpf_map__set_map_flags(ringbuf, flags); + } + + bpf_map__set_max_entries(ringbuf, args.ringbuf_sz); if (ringbuf_bench__load(skel)) { fprintf(stderr, "failed to load skeleton\n"); @@ -171,10 +222,12 @@ static void ringbuf_libbpf_setup(void) { struct ringbuf_libbpf_ctx *ctx = &ringbuf_libbpf_ctx; struct bpf_link *link; + int map_fd; ctx->skel = ringbuf_setup_skeleton(); - ctx->ringbuf = ring_buffer__new(bpf_map__fd(ctx->skel->maps.ringbuf), - buf_process_sample, NULL, NULL); + + map_fd = bpf_map__fd(ctx->skel->maps.ringbuf); + ctx->ringbuf = ring_buffer__new(map_fd, buf_process_sample, NULL, NULL); if (!ctx->ringbuf) { fprintf(stderr, "failed to create ringbuf\n"); exit(1); diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index 1e2aff007c2a40..34018fc3927f29 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -180,10 +180,10 @@ static void trigger_kernel_count_setup(void) { setup_ctx(); bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false); - bpf_program__set_autoload(ctx.skel->progs.trigger_count, true); + bpf_program__set_autoload(ctx.skel->progs.trigger_kernel_count, true); load_ctx(); /* override driver program */ - ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_count); + ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_kernel_count); } static void trigger_kprobe_setup(void) diff --git a/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh b/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh index 91e3567962ffb1..83e05e8378710a 100755 --- a/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh +++ b/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh @@ -49,3 +49,7 @@ for b in 1 2 3 4 8 12 16 20 24 28 32 36 40 44 48 52; do summarize "rb-libbpf nr_prod $b" "$($RUN_RB_BENCH -p$b --rb-batch-cnt 50 rb-libbpf)" done +header "Ringbuf, multi-producer contention in overwrite mode, no consumer" +for b in 1 2 3 4 8 12 16 20 24 28 32 36 40 44 48 52; do + summarize "rb-prod nr_prod $b" "$($RUN_BENCH -p$b --rb-batch-cnt 50 --rb-overwrite --rb-bench-producer rb-libbpf)" +done diff --git a/tools/testing/selftests/bpf/bpf_arena_list.h b/tools/testing/selftests/bpf/bpf_arena_list.h index 85dbc3ea4da5f9..e16fa7d95fcf06 100644 --- a/tools/testing/selftests/bpf/bpf_arena_list.h +++ b/tools/testing/selftests/bpf/bpf_arena_list.h @@ -64,14 +64,12 @@ static inline void list_add_head(arena_list_node_t *n, arena_list_head_t *h) static inline void __list_del(arena_list_node_t *n) { - arena_list_node_t *next = n->next, *tmp; + arena_list_node_t *next = n->next; arena_list_node_t * __arena *pprev = n->pprev; cast_user(next); cast_kern(pprev); - tmp = *pprev; - cast_kern(tmp); - WRITE_ONCE(tmp, next); + WRITE_ONCE(*pprev, next); if (next) { cast_user(pprev); cast_kern(next); diff --git a/tools/testing/selftests/bpf/bpf_arena_strsearch.h b/tools/testing/selftests/bpf/bpf_arena_strsearch.h new file mode 100644 index 00000000000000..c1b6eaa905bbd6 --- /dev/null +++ b/tools/testing/selftests/bpf/bpf_arena_strsearch.h @@ -0,0 +1,128 @@ +/* SPDX-License-Identifier: (GPL-2.0-only 
OR BSD-2-Clause) */ +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ +#pragma once +#include "bpf_arena_common.h" + +__noinline int bpf_arena_strlen(const char __arena *s __arg_arena) +{ + const char __arena *sc; + + for (sc = s; *sc != '\0'; ++sc) + cond_break; + return sc - s; +} + +/** + * glob_match - Shell-style pattern matching, like !fnmatch(pat, str, 0) + * @pat: Shell-style pattern to match, e.g. "*.[ch]". + * @str: String to match. The pattern must match the entire string. + * + * Perform shell-style glob matching, returning true (1) if the match + * succeeds, or false (0) if it fails. Equivalent to !fnmatch(@pat, @str, 0). + * + * Pattern metacharacters are ?, *, [ and \. + * (And, inside character classes, !, - and ].) + * + * This is small and simple implementation intended for device blacklists + * where a string is matched against a number of patterns. Thus, it + * does not preprocess the patterns. It is non-recursive, and run-time + * is at most quadratic: strlen(@str)*strlen(@pat). + * + * An example of the worst case is glob_match("*aaaaa", "aaaaaaaaaa"); + * it takes 6 passes over the pattern before matching the string. + * + * Like !fnmatch(@pat, @str, 0) and unlike the shell, this does NOT + * treat / or leading . specially; it isn't actually used for pathnames. + * + * Note that according to glob(7) (and unlike bash), character classes + * are complemented by a leading !; this does not support the regex-style + * [^a-z] syntax. + * + * An opening bracket without a matching close is matched literally. + */ +__noinline bool glob_match(char const __arena *pat __arg_arena, char const __arena *str __arg_arena) +{ + /* + * Backtrack to previous * on mismatch and retry starting one + * character later in the string. Because * matches all characters + * (no exception for /), it can be easily proved that there's + * never a need to backtrack multiple levels. + */ + char const __arena *back_pat = NULL, *back_str; + + /* + * Loop over each token (character or class) in pat, matching + * it against the remaining unmatched tail of str. Return false + * on mismatch, or true after matching the trailing nul bytes. + */ + for (;;) { + unsigned char c = *str++; + unsigned char d = *pat++; + + switch (d) { + case '?': /* Wildcard: anything but nul */ + if (c == '\0') + return false; + break; + case '*': /* Any-length wildcard */ + if (*pat == '\0') /* Optimize trailing * case */ + return true; + back_pat = pat; + back_str = --str; /* Allow zero-length match */ + break; + case '[': { /* Character class */ + bool match = false, inverted = (*pat == '!'); + char const __arena *class = pat + inverted; + unsigned char a = *class++; + + /* + * Iterate over each span in the character class. + * A span is either a single character a, or a + * range a-b. The first span may begin with ']'. + */ + do { + unsigned char b = a; + + if (a == '\0') /* Malformed */ + goto literal; + + if (class[0] == '-' && class[1] != ']') { + b = class[1]; + + if (b == '\0') + goto literal; + + class += 2; + /* Any special action if a > b? */ + } + match |= (a <= c && c <= b); + cond_break; + } while ((a = *class++) != ']'); + + if (match == inverted) + goto backtrack; + pat = class; + } + break; + case '\\': + d = *pat++; + __attribute__((__fallthrough__)); + default: /* Literal character */ +literal: + if (c == d) { + if (d == '\0') + return true; + break; + } +backtrack: + if (c == '\0' || !back_pat) + return false; /* No point continuing */ + /* Try again from last *, one character later in str. 
*/ + pat = back_pat; + str = ++back_str; + break; + } + cond_break; + } + return false; +} diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h index 794d44d19c8865..e0189254bb6ed6 100644 --- a/tools/testing/selftests/bpf/bpf_kfuncs.h +++ b/tools/testing/selftests/bpf/bpf_kfuncs.h @@ -28,8 +28,8 @@ extern int bpf_dynptr_from_skb_meta(struct __sk_buff *skb, __u64 flags, * Either a direct pointer to the dynptr data or a pointer to the user-provided * buffer if unable to obtain a direct pointer */ -extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, __u32 offset, - void *buffer, __u32 buffer__szk) __ksym __weak; +extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, __u64 offset, + void *buffer, __u64 buffer__szk) __ksym __weak; /* Description * Obtain a read-write pointer to the dynptr's data @@ -37,13 +37,13 @@ extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, __u32 offset, * Either a direct pointer to the dynptr data or a pointer to the user-provided * buffer if unable to obtain a direct pointer */ -extern void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *ptr, __u32 offset, - void *buffer, __u32 buffer__szk) __ksym __weak; +extern void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *ptr, __u64 offset, void *buffer, + __u64 buffer__szk) __ksym __weak; -extern int bpf_dynptr_adjust(const struct bpf_dynptr *ptr, __u32 start, __u32 end) __ksym __weak; +extern int bpf_dynptr_adjust(const struct bpf_dynptr *ptr, __u64 start, __u64 end) __ksym __weak; extern bool bpf_dynptr_is_null(const struct bpf_dynptr *ptr) __ksym __weak; extern bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *ptr) __ksym __weak; -extern __u32 bpf_dynptr_size(const struct bpf_dynptr *ptr) __ksym __weak; +extern __u64 bpf_dynptr_size(const struct bpf_dynptr *ptr) __ksym __weak; extern int bpf_dynptr_clone(const struct bpf_dynptr *ptr, struct bpf_dynptr *clone__init) __ksym __weak; /* Description diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index cdf7b664144420..0a6a5561bed399 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -97,7 +97,7 @@ int settimeo(int fd, int timeout_ms) int start_server_addr(int type, const struct sockaddr_storage *addr, socklen_t addrlen, const struct network_helper_opts *opts) { - int fd; + int on = 1, fd; if (!opts) opts = &default_opts; @@ -111,6 +111,12 @@ int start_server_addr(int type, const struct sockaddr_storage *addr, socklen_t a if (settimeo(fd, opts->timeout_ms)) goto error_close; + if (type == SOCK_STREAM && + setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on))) { + log_err("Failed to enable SO_REUSEADDR"); + goto error_close; + } + if (opts->post_socket_cb && opts->post_socket_cb(fd, opts->cb_opts)) { log_err("Failed to call post_socket_cb"); @@ -766,6 +772,50 @@ int send_recv_data(int lfd, int fd, uint32_t total_bytes) return err; } +int tc_prog_attach(const char *dev, int ingress_fd, int egress_fd) +{ + int ifindex, ret; + + if (!ASSERT_TRUE(ingress_fd >= 0 || egress_fd >= 0, + "at least one program fd is valid")) + return -1; + + ifindex = if_nametoindex(dev); + if (!ASSERT_NEQ(ifindex, 0, "get ifindex")) + return -1; + + DECLARE_LIBBPF_OPTS(bpf_tc_hook, hook, .ifindex = ifindex, + .attach_point = BPF_TC_INGRESS | BPF_TC_EGRESS); + DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts1, .handle = 1, + .priority = 1, .prog_fd = ingress_fd); + DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts2, .handle = 1, + 
.priority = 1, .prog_fd = egress_fd); + + ret = bpf_tc_hook_create(&hook); + if (!ASSERT_OK(ret, "create tc hook")) + return ret; + + if (ingress_fd >= 0) { + hook.attach_point = BPF_TC_INGRESS; + ret = bpf_tc_attach(&hook, &opts1); + if (!ASSERT_OK(ret, "bpf_tc_attach")) { + bpf_tc_hook_destroy(&hook); + return ret; + } + } + + if (egress_fd >= 0) { + hook.attach_point = BPF_TC_EGRESS; + ret = bpf_tc_attach(&hook, &opts2); + if (!ASSERT_OK(ret, "bpf_tc_attach")) { + bpf_tc_hook_destroy(&hook); + return ret; + } + } + + return 0; +} + #ifdef TRAFFIC_MONITOR struct tmonitor_ctx { pcap_t *pcap; diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index ef208eefd571a4..79a010c88e11c8 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -255,6 +255,22 @@ struct tmonitor_ctx; typedef int (*tm_print_fn_t)(const char *format, va_list args); +/** + * tc_prog_attach - attach BPF program(s) to an interface + * + * Takes file descriptors pointing to at least one, and at most two, BPF + * programs, and attaches those programs to an interface's ingress, egress or + * both. + * + * @dev: string containing the interface name + * @ingress_fd: file descriptor of the program to attach to interface ingress + * @egress_fd: file descriptor of the program to attach to interface egress + * + * Returns 0 on success, -1 if no valid file descriptor has been found, if + * the interface name is invalid or if an error occurred during attach. + */ +int tc_prog_attach(const char *dev, int ingress_fd, int egress_fd); + #ifdef TRAFFIC_MONITOR struct tmonitor_ctx *traffic_monitor_start(const char *netns, const char *test_name, const char *subtest_name); diff --git a/tools/testing/selftests/bpf/prog_tests/arena_strsearch.c b/tools/testing/selftests/bpf/prog_tests/arena_strsearch.c new file mode 100644 index 00000000000000..f81a0c06650596 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/arena_strsearch.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
*/ +#include <test_progs.h> +#include "arena_strsearch.skel.h" + +static void test_arena_str(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + struct arena_strsearch *skel; + int ret; + + skel = arena_strsearch__open_and_load(); + if (!ASSERT_OK_PTR(skel, "arena_strsearch__open_and_load")) + return; + + ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_strsearch), &opts); + ASSERT_OK(ret, "ret_add"); + ASSERT_OK(opts.retval, "retval"); + if (skel->bss->skip) { + printf("%s:SKIP:compiler doesn't support arena_cast\n", __func__); + test__skip(); + } + arena_strsearch__destroy(skel); +} + +void test_arena_strsearch(void) +{ + if (test__start_subtest("arena_strsearch")) + test_arena_str(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_gotox.c b/tools/testing/selftests/bpf/prog_tests/bpf_gotox.c new file mode 100644 index 00000000000000..d138cc7b1bda82 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/bpf_gotox.c @@ -0,0 +1,292 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <test_progs.h> + +#include <linux/if_ether.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/in6.h> +#include <linux/udp.h> +#include <linux/tcp.h> + +#include <sys/syscall.h> +#include <bpf/bpf.h> + +#include "bpf_gotox.skel.h" + +static void __test_run(struct bpf_program *prog, void *ctx_in, size_t ctx_size_in) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts, + .ctx_in = ctx_in, + .ctx_size_in = ctx_size_in, + ); + int err, prog_fd; + + prog_fd = bpf_program__fd(prog); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "test_run_opts err"); +} + +static void __subtest(struct bpf_gotox *skel, void (*check)(struct bpf_gotox *)) +{ + if (skel->data->skip) + test__skip(); + else + check(skel); +} + +static void check_simple(struct bpf_gotox *skel, + struct bpf_program *prog, + __u64 ctx_in, + __u64 expected) +{ + skel->bss->ret_user = 0; + + __test_run(prog, &ctx_in, sizeof(ctx_in)); + + if (!ASSERT_EQ(skel->bss->ret_user, expected, "skel->bss->ret_user")) + return; +} + +static void check_simple_fentry(struct bpf_gotox *skel, + struct bpf_program *prog, + __u64 ctx_in, + __u64 expected) +{ + skel->bss->in_user = ctx_in; + skel->bss->ret_user = 0; + + /* trigger */ + usleep(1); + + if (!ASSERT_EQ(skel->bss->ret_user, expected, "skel->bss->ret_user")) + return; +} + +/* validate that for two loads of the same jump table libbpf generates only one map */ +static void check_one_map_two_jumps(struct bpf_gotox *skel) +{ + struct bpf_prog_info prog_info; + struct bpf_map_info map_info; + __u32 len; + __u32 map_ids[16]; + int prog_fd, map_fd; + int ret; + int i; + bool seen = false; + + memset(&prog_info, 0, sizeof(prog_info)); + prog_info.map_ids = (long)map_ids; + prog_info.nr_map_ids = ARRAY_SIZE(map_ids); + prog_fd = bpf_program__fd(skel->progs.one_map_two_jumps); + if (!ASSERT_GE(prog_fd, 0, "bpf_program__fd(one_map_two_jumps)")) + return; + + len = sizeof(prog_info); + ret = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &len); + if (!ASSERT_OK(ret, "bpf_obj_get_info_by_fd(prog_fd)")) + return; + + for (i = 0; i < prog_info.nr_map_ids; i++) { + map_fd = bpf_map_get_fd_by_id(map_ids[i]); + if (!ASSERT_GE(map_fd, 0, "bpf_map_get_fd_by_id")) + return; + + len = sizeof(map_info); + memset(&map_info, 0, len); + ret = bpf_obj_get_info_by_fd(map_fd, &map_info, &len); + if (!ASSERT_OK(ret, "bpf_obj_get_info_by_fd(map_fd)")) { + close(map_fd); + return; + } + + if (map_info.type == BPF_MAP_TYPE_INSN_ARRAY) { + if (!ASSERT_EQ(seen, false, "more than 
one INSN_ARRAY map")) { + close(map_fd); + return; + } + seen = true; + } + close(map_fd); + } + + ASSERT_EQ(seen, true, "no INSN_ARRAY map"); +} + +static void check_one_switch(struct bpf_gotox *skel) +{ + __u64 in[] = {0, 1, 2, 3, 4, 5, 77}; + __u64 out[] = {2, 3, 4, 5, 7, 19, 19}; + int i; + + for (i = 0; i < ARRAY_SIZE(in); i++) + check_simple(skel, skel->progs.one_switch, in[i], out[i]); +} + +static void check_one_switch_non_zero_sec_off(struct bpf_gotox *skel) +{ + __u64 in[] = {0, 1, 2, 3, 4, 5, 77}; + __u64 out[] = {2, 3, 4, 5, 7, 19, 19}; + int i; + + for (i = 0; i < ARRAY_SIZE(in); i++) + check_simple(skel, skel->progs.one_switch_non_zero_sec_off, in[i], out[i]); +} + +static void check_two_switches(struct bpf_gotox *skel) +{ + __u64 in[] = {0, 1, 2, 3, 4, 5, 77}; + __u64 out[] = {103, 104, 107, 205, 115, 1019, 1019}; + int i; + + for (i = 0; i < ARRAY_SIZE(in); i++) + check_simple(skel, skel->progs.two_switches, in[i], out[i]); +} + +static void check_big_jump_table(struct bpf_gotox *skel) +{ + __u64 in[] = {0, 11, 27, 31, 22, 45, 99}; + __u64 out[] = {2, 3, 4, 5, 19, 19, 19}; + int i; + + for (i = 0; i < ARRAY_SIZE(in); i++) + check_simple(skel, skel->progs.big_jump_table, in[i], out[i]); +} + +static void check_one_jump_two_maps(struct bpf_gotox *skel) +{ + __u64 in[] = {0, 1, 2, 3, 4, 5, 77}; + __u64 out[] = {12, 15, 7 , 15, 12, 15, 15}; + int i; + + for (i = 0; i < ARRAY_SIZE(in); i++) + check_simple(skel, skel->progs.one_jump_two_maps, in[i], out[i]); +} + +static void check_static_global(struct bpf_gotox *skel) +{ + __u64 in[] = {0, 1, 2, 3, 4, 5, 77}; + __u64 out[] = {2, 3, 4, 5, 7, 19, 19}; + int i; + + for (i = 0; i < ARRAY_SIZE(in); i++) + check_simple(skel, skel->progs.use_static_global1, in[i], out[i]); + for (i = 0; i < ARRAY_SIZE(in); i++) + check_simple(skel, skel->progs.use_static_global2, in[i], out[i]); +} + +static void check_nonstatic_global(struct bpf_gotox *skel) +{ + __u64 in[] = {0, 1, 2, 3, 4, 5, 77}; + __u64 out[] = {2, 3, 4, 5, 7, 19, 19}; + int i; + + for (i = 0; i < ARRAY_SIZE(in); i++) + check_simple(skel, skel->progs.use_nonstatic_global1, in[i], out[i]); + + for (i = 0; i < ARRAY_SIZE(in); i++) + check_simple(skel, skel->progs.use_nonstatic_global2, in[i], out[i]); +} + +static void check_other_sec(struct bpf_gotox *skel) +{ + struct bpf_link *link; + __u64 in[] = {0, 1, 2, 3, 4, 5, 77}; + __u64 out[] = {2, 3, 4, 5, 7, 19, 19}; + int i; + + link = bpf_program__attach(skel->progs.simple_test_other_sec); + if (!ASSERT_OK_PTR(link, "link")) + return; + + for (i = 0; i < ARRAY_SIZE(in); i++) + check_simple_fentry(skel, skel->progs.simple_test_other_sec, in[i], out[i]); + + bpf_link__destroy(link); +} + +static void check_static_global_other_sec(struct bpf_gotox *skel) +{ + struct bpf_link *link; + __u64 in[] = {0, 1, 2, 3, 4, 5, 77}; + __u64 out[] = {2, 3, 4, 5, 7, 19, 19}; + int i; + + link = bpf_program__attach(skel->progs.use_static_global_other_sec); + if (!ASSERT_OK_PTR(link, "link")) + return; + + for (i = 0; i < ARRAY_SIZE(in); i++) + check_simple_fentry(skel, skel->progs.use_static_global_other_sec, in[i], out[i]); + + bpf_link__destroy(link); +} + +static void check_nonstatic_global_other_sec(struct bpf_gotox *skel) +{ + struct bpf_link *link; + __u64 in[] = {0, 1, 2, 3, 4, 5, 77}; + __u64 out[] = {2, 3, 4, 5, 7, 19, 19}; + int i; + + link = bpf_program__attach(skel->progs.use_nonstatic_global_other_sec); + if (!ASSERT_OK_PTR(link, "link")) + return; + + for (i = 0; i < ARRAY_SIZE(in); i++) + check_simple_fentry(skel, 
skel->progs.use_nonstatic_global_other_sec, in[i], out[i]); + + bpf_link__destroy(link); +} + +void test_bpf_gotox(void) +{ + struct bpf_gotox *skel; + int ret; + + skel = bpf_gotox__open(); + if (!ASSERT_NEQ(skel, NULL, "bpf_gotox__open")) + return; + + ret = bpf_gotox__load(skel); + if (!ASSERT_OK(ret, "bpf_gotox__load")) + return; + + skel->bss->pid = getpid(); + + if (test__start_subtest("one-switch")) + __subtest(skel, check_one_switch); + + if (test__start_subtest("one-switch-non-zero-sec-offset")) + __subtest(skel, check_one_switch_non_zero_sec_off); + + if (test__start_subtest("two-switches")) + __subtest(skel, check_two_switches); + + if (test__start_subtest("big-jump-table")) + __subtest(skel, check_big_jump_table); + + if (test__start_subtest("static-global")) + __subtest(skel, check_static_global); + + if (test__start_subtest("nonstatic-global")) + __subtest(skel, check_nonstatic_global); + + if (test__start_subtest("other-sec")) + __subtest(skel, check_other_sec); + + if (test__start_subtest("static-global-other-sec")) + __subtest(skel, check_static_global_other_sec); + + if (test__start_subtest("nonstatic-global-other-sec")) + __subtest(skel, check_nonstatic_global_other_sec); + + if (test__start_subtest("one-jump-two-maps")) + __subtest(skel, check_one_jump_two_maps); + + if (test__start_subtest("one-map-two-jumps")) + __subtest(skel, check_one_map_two_jumps); + + bpf_gotox__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_insn_array.c b/tools/testing/selftests/bpf/prog_tests/bpf_insn_array.c new file mode 100644 index 00000000000000..269870bec94117 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/bpf_insn_array.c @@ -0,0 +1,504 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <bpf/bpf.h> +#include <test_progs.h> + +#ifdef __x86_64__ +static int map_create(__u32 map_type, __u32 max_entries) +{ + const char *map_name = "insn_array"; + __u32 key_size = 4; + __u32 value_size = sizeof(struct bpf_insn_array_value); + + return bpf_map_create(map_type, map_name, key_size, value_size, max_entries, NULL); +} + +static int prog_load(struct bpf_insn *insns, __u32 insn_cnt, int *fd_array, __u32 fd_array_cnt) +{ + LIBBPF_OPTS(bpf_prog_load_opts, opts); + + opts.fd_array = fd_array; + opts.fd_array_cnt = fd_array_cnt; + + return bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "GPL", insns, insn_cnt, &opts); +} + +static void __check_success(struct bpf_insn *insns, __u32 insn_cnt, __u32 *map_in, __u32 *map_out) +{ + struct bpf_insn_array_value val = {}; + int prog_fd = -1, map_fd, i; + + map_fd = map_create(BPF_MAP_TYPE_INSN_ARRAY, insn_cnt); + if (!ASSERT_GE(map_fd, 0, "map_create")) + return; + + for (i = 0; i < insn_cnt; i++) { + val.orig_off = map_in[i]; + if (!ASSERT_EQ(bpf_map_update_elem(map_fd, &i, &val, 0), 0, "bpf_map_update_elem")) + goto cleanup; + } + + if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze")) + goto cleanup; + + prog_fd = prog_load(insns, insn_cnt, &map_fd, 1); + if (!ASSERT_GE(prog_fd, 0, "bpf(BPF_PROG_LOAD)")) + goto cleanup; + + for (i = 0; i < insn_cnt; i++) { + char buf[64]; + + if (!ASSERT_EQ(bpf_map_lookup_elem(map_fd, &i, &val), 0, "bpf_map_lookup_elem")) + goto cleanup; + + snprintf(buf, sizeof(buf), "val.xlated_off should be equal map_out[%d]", i); + ASSERT_EQ(val.xlated_off, map_out[i], buf); + } + +cleanup: + close(prog_fd); + close(map_fd); +} + +/* + * Load a program, which will not be anyhow mangled by the verifier. Add an + * insn_array map pointing to every instruction. 
Check that it hasn't changed + * after the program load. + */ +static void check_one_to_one_mapping(void) +{ + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 4), + BPF_MOV64_IMM(BPF_REG_0, 3), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + __u32 map_in[] = {0, 1, 2, 3, 4, 5}; + __u32 map_out[] = {0, 1, 2, 3, 4, 5}; + + __check_success(insns, ARRAY_SIZE(insns), map_in, map_out); +} + +/* + * Load a program with two patches (get jiffies, for simplicity). Add an + * insn_array map pointing to every instruction. Check how it was changed + * after the program load. + */ +static void check_simple(void) +{ + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_jiffies64), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_jiffies64), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + __u32 map_in[] = {0, 1, 2, 3, 4, 5}; + __u32 map_out[] = {0, 1, 4, 5, 8, 9}; + + __check_success(insns, ARRAY_SIZE(insns), map_in, map_out); +} + +/* + * Verifier can delete code in two cases: nops & dead code. From insn + * array's point of view, the two cases are the same, so test using + * the simplest method: by loading some nops + */ +static void check_deletions(void) +{ + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_JMP_IMM(BPF_JA, 0, 0, 0), /* nop */ + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_JMP_IMM(BPF_JA, 0, 0, 0), /* nop */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + __u32 map_in[] = {0, 1, 2, 3, 4, 5}; + __u32 map_out[] = {0, -1, 1, -1, 2, 3}; + + __check_success(insns, ARRAY_SIZE(insns), map_in, map_out); +} + +/* + * Same test as check_deletions, but also add code which adds instructions + */ +static void check_deletions_with_functions(void) +{ + struct bpf_insn insns[] = { + BPF_JMP_IMM(BPF_JA, 0, 0, 0), /* nop */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_jiffies64), + BPF_JMP_IMM(BPF_JA, 0, 0, 0), /* nop */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_JMP_IMM(BPF_JA, 0, 0, 0), /* nop */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_jiffies64), + BPF_JMP_IMM(BPF_JA, 0, 0, 0), /* nop */ + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + }; + __u32 map_in[] = { 0, 1, 2, 3, 4, 5, /* func */ 6, 7, 8, 9, 10}; + __u32 map_out[] = {-1, 0, -1, 3, 4, 5, /* func */ -1, 6, -1, 9, 10}; + + __check_success(insns, ARRAY_SIZE(insns), map_in, map_out); +} + +/* + * Try to load a program with a map which points to outside of the program + */ +static void check_out_of_bounds_index(void) +{ + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 4), + BPF_MOV64_IMM(BPF_REG_0, 3), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int prog_fd, map_fd; + struct bpf_insn_array_value val = {}; + int key; + + map_fd = map_create(BPF_MAP_TYPE_INSN_ARRAY, 1); + if (!ASSERT_GE(map_fd, 0, "map_create")) + return; + + key = 0; + val.orig_off = ARRAY_SIZE(insns); /* too big */ + if (!ASSERT_EQ(bpf_map_update_elem(map_fd, &key, &val, 0), 0, "bpf_map_update_elem")) + goto cleanup; + + if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze")) + goto cleanup; + + prog_fd = prog_load(insns, ARRAY_SIZE(insns), &map_fd, 1); + if (!ASSERT_EQ(prog_fd, -EINVAL, "program should have been rejected (prog_fd != -EINVAL)")) { + close(prog_fd); + goto cleanup; + } + +cleanup: + close(map_fd); +} + 
+/* + * Try to load a program with a map which points to the middle of a 16-byte insn + */ +static void check_mid_insn_index(void) +{ + struct bpf_insn insns[] = { + BPF_LD_IMM64(BPF_REG_0, 0), /* 2 x 8 */ + BPF_EXIT_INSN(), + }; + int prog_fd, map_fd; + struct bpf_insn_array_value val = {}; + int key; + + map_fd = map_create(BPF_MAP_TYPE_INSN_ARRAY, 1); + if (!ASSERT_GE(map_fd, 0, "map_create")) + return; + + key = 0; + val.orig_off = 1; /* middle of 16-byte instruction */ + if (!ASSERT_EQ(bpf_map_update_elem(map_fd, &key, &val, 0), 0, "bpf_map_update_elem")) + goto cleanup; + + if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze")) + goto cleanup; + + prog_fd = prog_load(insns, ARRAY_SIZE(insns), &map_fd, 1); + if (!ASSERT_EQ(prog_fd, -EINVAL, "program should have been rejected (prog_fd != -EINVAL)")) { + close(prog_fd); + goto cleanup; + } + +cleanup: + close(map_fd); +} + +static void check_incorrect_index(void) +{ + check_out_of_bounds_index(); + check_mid_insn_index(); +} + +static int set_bpf_jit_harden(char *level) +{ + char old_level; + int err = -1; + int fd = -1; + + fd = open("/proc/sys/net/core/bpf_jit_harden", O_RDWR | O_NONBLOCK); + if (fd < 0) { + ASSERT_FAIL("open .../bpf_jit_harden returned %d (errno=%d)", fd, errno); + return -1; + } + + err = read(fd, &old_level, 1); + if (err != 1) { + ASSERT_FAIL("read from .../bpf_jit_harden returned %d (errno=%d)", err, errno); + err = -1; + goto end; + } + + lseek(fd, 0, SEEK_SET); + + err = write(fd, level, 1); + if (err != 1) { + ASSERT_FAIL("write to .../bpf_jit_harden returned %d (errno=%d)", err, errno); + err = -1; + goto end; + } + + err = 0; + *level = old_level; +end: + if (fd >= 0) + close(fd); + return err; +} + +static void check_blindness(void) +{ + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 4), + BPF_MOV64_IMM(BPF_REG_0, 3), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }; + int prog_fd = -1, map_fd; + struct bpf_insn_array_value val = {}; + char bpf_jit_harden = '@'; /* non-existing value */ + int i; + + map_fd = map_create(BPF_MAP_TYPE_INSN_ARRAY, ARRAY_SIZE(insns)); + if (!ASSERT_GE(map_fd, 0, "map_create")) + return; + + for (i = 0; i < ARRAY_SIZE(insns); i++) { + val.orig_off = i; + if (!ASSERT_EQ(bpf_map_update_elem(map_fd, &i, &val, 0), 0, "bpf_map_update_elem")) + goto cleanup; + } + + if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze")) + goto cleanup; + + bpf_jit_harden = '2'; + if (set_bpf_jit_harden(&bpf_jit_harden)) { + bpf_jit_harden = '@'; /* open, read or write failed => no write was done */ + goto cleanup; + } + + prog_fd = prog_load(insns, ARRAY_SIZE(insns), &map_fd, 1); + if (!ASSERT_GE(prog_fd, 0, "bpf(BPF_PROG_LOAD)")) + goto cleanup; + + for (i = 0; i < ARRAY_SIZE(insns); i++) { + char fmt[32]; + + if (!ASSERT_EQ(bpf_map_lookup_elem(map_fd, &i, &val), 0, "bpf_map_lookup_elem")) + goto cleanup; + + snprintf(fmt, sizeof(fmt), "val should be equal 3*%d", i); + ASSERT_EQ(val.xlated_off, i * 3, fmt); + } + +cleanup: + /* restore the old one */ + if (bpf_jit_harden != '@') + set_bpf_jit_harden(&bpf_jit_harden); + + close(prog_fd); + close(map_fd); +} + +/* Once the map has been initialized, it should be frozen */ +static void check_load_unfrozen_map(void) +{ + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int prog_fd = -1, map_fd; + struct bpf_insn_array_value val = {}; + int i; + + map_fd = map_create(BPF_MAP_TYPE_INSN_ARRAY, ARRAY_SIZE(insns)); + if (!ASSERT_GE(map_fd, 0, "map_create")) + return; + + for (i =
0; i < ARRAY_SIZE(insns); i++) { + val.orig_off = i; + if (!ASSERT_EQ(bpf_map_update_elem(map_fd, &i, &val, 0), 0, "bpf_map_update_elem")) + goto cleanup; + } + + prog_fd = prog_load(insns, ARRAY_SIZE(insns), &map_fd, 1); + if (!ASSERT_EQ(prog_fd, -EINVAL, "program should have been rejected (prog_fd != -EINVAL)")) + goto cleanup; + + /* correctness: now freeze the map, the program should load fine */ + + if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze")) + goto cleanup; + + prog_fd = prog_load(insns, ARRAY_SIZE(insns), &map_fd, 1); + if (!ASSERT_GE(prog_fd, 0, "bpf(BPF_PROG_LOAD)")) + goto cleanup; + + for (i = 0; i < ARRAY_SIZE(insns); i++) { + if (!ASSERT_EQ(bpf_map_lookup_elem(map_fd, &i, &val), 0, "bpf_map_lookup_elem")) + goto cleanup; + + ASSERT_EQ(val.xlated_off, i, "val should be equal i"); + } + +cleanup: + close(prog_fd); + close(map_fd); +} + +/* Map can be used only by one BPF program */ +static void check_no_map_reuse(void) +{ + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int prog_fd = -1, map_fd, extra_fd = -1; + struct bpf_insn_array_value val = {}; + int i; + + map_fd = map_create(BPF_MAP_TYPE_INSN_ARRAY, ARRAY_SIZE(insns)); + if (!ASSERT_GE(map_fd, 0, "map_create")) + return; + + for (i = 0; i < ARRAY_SIZE(insns); i++) { + val.orig_off = i; + if (!ASSERT_EQ(bpf_map_update_elem(map_fd, &i, &val, 0), 0, "bpf_map_update_elem")) + goto cleanup; + } + + if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze")) + goto cleanup; + + prog_fd = prog_load(insns, ARRAY_SIZE(insns), &map_fd, 1); + if (!ASSERT_GE(prog_fd, 0, "bpf(BPF_PROG_LOAD)")) + goto cleanup; + + for (i = 0; i < ARRAY_SIZE(insns); i++) { + if (!ASSERT_EQ(bpf_map_lookup_elem(map_fd, &i, &val), 0, "bpf_map_lookup_elem")) + goto cleanup; + + ASSERT_EQ(val.xlated_off, i, "val should be equal i"); + } + + extra_fd = prog_load(insns, ARRAY_SIZE(insns), &map_fd, 1); + if (!ASSERT_EQ(extra_fd, -EBUSY, "program should have been rejected (extra_fd != -EBUSY)")) + goto cleanup; + + /* correctness: check that prog is still loadable without fd_array */ + extra_fd = prog_load(insns, ARRAY_SIZE(insns), NULL, 0); + if (!ASSERT_GE(extra_fd, 0, "bpf(BPF_PROG_LOAD): expected no error")) + goto cleanup; + +cleanup: + close(extra_fd); + close(prog_fd); + close(map_fd); +} + +static void check_bpf_no_lookup(void) +{ + struct bpf_insn insns[] = { + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_EXIT_INSN(), + }; + int prog_fd = -1, map_fd; + + map_fd = map_create(BPF_MAP_TYPE_INSN_ARRAY, 1); + if (!ASSERT_GE(map_fd, 0, "map_create")) + return; + + insns[0].imm = map_fd; + + if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze")) + goto cleanup; + + prog_fd = prog_load(insns, ARRAY_SIZE(insns), NULL, 0); + if (!ASSERT_EQ(prog_fd, -EINVAL, "program should have been rejected (prog_fd != -EINVAL)")) + goto cleanup; + + /* correctness: check that prog is still loadable with normal map */ + close(map_fd); + map_fd = map_create(BPF_MAP_TYPE_ARRAY, 1); + insns[0].imm = map_fd; + prog_fd = prog_load(insns, ARRAY_SIZE(insns), NULL, 0); + if (!ASSERT_GE(prog_fd, 0, "bpf(BPF_PROG_LOAD)")) + goto cleanup; + +cleanup: + close(prog_fd); + close(map_fd); +} + +static void check_bpf_side(void) +{ + check_bpf_no_lookup(); +} + +static void __test_bpf_insn_array(void) +{ + /* Test if offsets are adjusted properly */ + + 
if (test__start_subtest("one2one")) + check_one_to_one_mapping(); + + if (test__start_subtest("simple")) + check_simple(); + + if (test__start_subtest("deletions")) + check_deletions(); + + if (test__start_subtest("deletions-with-functions")) + check_deletions_with_functions(); + + if (test__start_subtest("blindness")) + check_blindness(); + + /* Check all kinds of operations and related restrictions */ + + if (test__start_subtest("incorrect-index")) + check_incorrect_index(); + + if (test__start_subtest("load-unfrozen-map")) + check_load_unfrozen_map(); + + if (test__start_subtest("no-map-reuse")) + check_no_map_reuse(); + + if (test__start_subtest("bpf-side-ops")) + check_bpf_side(); +} +#else +static void __test_bpf_insn_array(void) +{ + test__skip(); +} +#endif + +void test_bpf_insn_array(void) +{ + __test_bpf_insn_array(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index 8a9ba42921092c..054ecb6b1e9f1b 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -7496,6 +7496,71 @@ static struct btf_dedup_test dedup_tests[] = { }, }, { + .descr = "dedup: recursive typedef", + /* + * This test simulates a recursive typedef, which in GO is defined as such: + * + * type Foo func() Foo + * + * In BTF terms, this is represented as a TYPEDEF referencing + * a FUNC_PROTO that returns the same TYPEDEF. + */ + .input = { + .raw_types = { + /* + * [1] typedef Foo -> func() Foo + * [2] func_proto() -> Foo + * [3] typedef Foo -> func() Foo + * [4] func_proto() -> Foo + */ + BTF_TYPEDEF_ENC(NAME_NTH(1), 2), /* [1] */ + BTF_FUNC_PROTO_ENC(1, 0), /* [2] */ + BTF_TYPEDEF_ENC(NAME_NTH(1), 4), /* [3] */ + BTF_FUNC_PROTO_ENC(3, 0), /* [4] */ + BTF_END_RAW, + }, + BTF_STR_SEC("\0Foo"), + }, + .expect = { + .raw_types = { + BTF_TYPEDEF_ENC(NAME_NTH(1), 2), /* [1] */ + BTF_FUNC_PROTO_ENC(1, 0), /* [2] */ + BTF_END_RAW, + }, + BTF_STR_SEC("\0Foo"), + }, +}, +{ + .descr = "dedup: typedef", + /* + * // CU 1: + * typedef int foo; + * + * // CU 2: + * typedef int foo; + */ + .input = { + .raw_types = { + /* CU 1 */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPEDEF_ENC(NAME_NTH(1), 1), /* [2] */ + /* CU 2 */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [3] */ + BTF_TYPEDEF_ENC(NAME_NTH(1), 3), /* [4] */ + BTF_END_RAW, + }, + BTF_STR_SEC("\0foo"), + }, + .expect = { + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPEDEF_ENC(NAME_NTH(1), 1), /* [2] */ + BTF_END_RAW, + }, + BTF_STR_SEC("\0foo"), + }, +}, +{ .descr = "dedup: typedef tags", .input = { .raw_types = { diff --git a/tools/testing/selftests/bpf/prog_tests/btf_split.c b/tools/testing/selftests/bpf/prog_tests/btf_split.c index 3696fb9a05edd0..2d47cad50a51d9 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_split.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_split.c @@ -12,11 +12,45 @@ static void btf_dump_printf(void *ctx, const char *fmt, va_list args) vfprintf(ctx, fmt, args); } +/* Write raw BTF to file, return number of bytes written or negative errno */ +static ssize_t btf_raw_write(struct btf *btf, char *file) +{ + ssize_t written = 0; + const void *data; + __u32 size = 0; + int fd, ret; + + fd = mkstemp(file); + if (!ASSERT_GE(fd, 0, "create_file")) + return -errno; + + data = btf__raw_data(btf, &size); + if (!ASSERT_OK_PTR(data, "btf__raw_data")) { + close(fd); + return -EINVAL; + } + while (written < size) { + ret = write(fd, data + written, size - written); + 
if (!ASSERT_GE(ret, 0, "write succeeded")) { + close(fd); + return -errno; + } + written += ret; + } + close(fd); + return written; +} + static void __test_btf_split(bool multi) { + char multisplit_btf_file[] = "/tmp/test_btf_multisplit.XXXXXX"; + char split_btf_file[] = "/tmp/test_btf_split.XXXXXX"; + char base_btf_file[] = "/tmp/test_btf_base.XXXXXX"; + ssize_t multisplit_btf_sz = 0, split_btf_sz = 0, base_btf_sz = 0; struct btf_dump *d = NULL; - const struct btf_type *t; - struct btf *btf1, *btf2, *btf3 = NULL; + const struct btf_type *t, *ot; + struct btf *btf1 = NULL, *btf2 = NULL, *btf3 = NULL; + struct btf *btf4 = NULL, *btf5 = NULL, *btf6 = NULL; int str_off, i, err; btf1 = btf__new_empty(); @@ -123,6 +157,45 @@ static void __test_btf_split(bool multi) " int uf2;\n" "};\n\n", "c_dump"); + /* write base, split BTFs to files and ensure parsing succeeds */ + base_btf_sz = btf_raw_write(btf1, base_btf_file); + if (base_btf_sz < 0) + goto cleanup; + split_btf_sz = btf_raw_write(btf2, split_btf_file); + if (split_btf_sz < 0) + goto cleanup; + btf4 = btf__parse(base_btf_file, NULL); + if (!ASSERT_OK_PTR(btf4, "parse_base")) + goto cleanup; + btf5 = btf__parse_split(split_btf_file, btf4); + if (!ASSERT_OK_PTR(btf5, "parse_split")) + goto cleanup; + if (multi) { + multisplit_btf_sz = btf_raw_write(btf3, multisplit_btf_file); + if (multisplit_btf_sz < 0) + goto cleanup; + btf6 = btf__parse_split(multisplit_btf_file, btf5); + if (!ASSERT_OK_PTR(btf6, "parse_multisplit")) + goto cleanup; + } else { + btf6 = btf5; + } + + if (!ASSERT_EQ(btf__type_cnt(btf3), btf__type_cnt(btf6), "cmp_type_cnt")) + goto cleanup; + + /* compare parsed to original BTF */ + for (i = 1; i < btf__type_cnt(btf6); i++) { + t = btf__type_by_id(btf6, i); + if (!ASSERT_OK_PTR(t, "type_in_parsed_btf")) + goto cleanup; + ot = btf__type_by_id(btf3, i); + if (!ASSERT_OK_PTR(ot, "type_in_orig_btf")) + goto cleanup; + if (!ASSERT_EQ(memcmp(t, ot, sizeof(*ot)), 0, "cmp_parsed_orig_btf")) + goto cleanup; + } + cleanup: if (dump_buf_file) fclose(dump_buf_file); @@ -132,6 +205,16 @@ cleanup: btf__free(btf2); if (btf2 != btf3) btf__free(btf3); + btf__free(btf4); + btf__free(btf5); + if (btf5 != btf6) + btf__free(btf6); + if (base_btf_sz > 0) + unlink(base_btf_file); + if (split_btf_sz > 0) + unlink(split_btf_file); + if (multisplit_btf_sz > 0) + unlink(multisplit_btf_file); } void test_btf_split(void) diff --git a/tools/testing/selftests/bpf/prog_tests/check_mtu.c b/tools/testing/selftests/bpf/prog_tests/check_mtu.c index 2a9a30650350ed..65b4512967e746 100644 --- a/tools/testing/selftests/bpf/prog_tests/check_mtu.c +++ b/tools/testing/selftests/bpf/prog_tests/check_mtu.c @@ -153,6 +153,26 @@ static void test_check_mtu_run_tc(struct test_check_mtu *skel, ASSERT_EQ(mtu_result, mtu_expect, "MTU-compare-user"); } +static void test_chk_segs_flag(struct test_check_mtu *skel, __u32 mtu) +{ + int err, prog_fd = bpf_program__fd(skel->progs.tc_chk_segs_flag); + struct __sk_buff skb = { + .gso_size = 10, + }; + LIBBPF_OPTS(bpf_test_run_opts, topts, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .ctx_in = &skb, + .ctx_size_in = sizeof(skb), + ); + + /* Lower the mtu to test the BPF_MTU_CHK_SEGS */ + SYS_NOFAIL("ip link set dev lo mtu 10"); + err = bpf_prog_test_run_opts(prog_fd, &topts); + SYS_NOFAIL("ip link set dev lo mtu %u", mtu); + ASSERT_OK(err, "test_run"); + ASSERT_EQ(topts.retval, BPF_OK, "retval"); +} static void test_check_mtu_tc(__u32 mtu, __u32 ifindex) { @@ -177,11 +197,12 @@ static void test_check_mtu_tc(__u32 mtu, 
__u32 ifindex) test_check_mtu_run_tc(skel, skel->progs.tc_minus_delta, mtu); test_check_mtu_run_tc(skel, skel->progs.tc_input_len, mtu); test_check_mtu_run_tc(skel, skel->progs.tc_input_len_exceed, mtu); + test_chk_segs_flag(skel, mtu); cleanup: test_check_mtu__destroy(skel); } -void serial_test_check_mtu(void) +void test_ns_check_mtu(void) { int mtu_lo; diff --git a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c index 34b59f6baca11f..7488a7606e6ad7 100644 --- a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c @@ -22,79 +22,37 @@ static int duration = 0; -struct addr_port { - in_port_t port; - union { - struct in_addr in_addr; - struct in6_addr in6_addr; - }; -}; - -struct tuple { - int family; - struct addr_port src; - struct addr_port dst; -}; - -static bool fill_addr_port(const struct sockaddr *sa, struct addr_port *ap) -{ - const struct sockaddr_in6 *in6; - const struct sockaddr_in *in; - - switch (sa->sa_family) { - case AF_INET: - in = (const struct sockaddr_in *)sa; - ap->in_addr = in->sin_addr; - ap->port = in->sin_port; - return true; - - case AF_INET6: - in6 = (const struct sockaddr_in6 *)sa; - ap->in6_addr = in6->sin6_addr; - ap->port = in6->sin6_port; - return true; - - default: - return false; - } -} -static bool set_up_conn(const struct sockaddr *addr, socklen_t len, int type, - int *server, int *conn, struct tuple *tuple) +static bool set_up_conn(const struct sockaddr_storage *addr, socklen_t len, int type, + int *server, int *conn, + struct sockaddr_storage *src, + struct sockaddr_storage *dst) { struct sockaddr_storage ss; socklen_t slen = sizeof(ss); - struct sockaddr *sa = (struct sockaddr *)&ss; - *server = start_server_addr(type, (struct sockaddr_storage *)addr, len, NULL); + *server = start_server_addr(type, addr, len, NULL); if (*server < 0) return false; - if (CHECK_FAIL(getsockname(*server, sa, &slen))) + if (CHECK_FAIL(getsockname(*server, (struct sockaddr *)&ss, &slen))) goto close_server; - *conn = connect_to_addr(type, (struct sockaddr_storage *)sa, slen, NULL); + *conn = connect_to_addr(type, &ss, slen, NULL); if (*conn < 0) goto close_server; /* We want to simulate packets arriving at conn, so we have to * swap src and dst. 
*/ - slen = sizeof(ss); - if (CHECK_FAIL(getsockname(*conn, sa, &slen))) - goto close_conn; - - if (CHECK_FAIL(!fill_addr_port(sa, &tuple->dst))) + slen = sizeof(*dst); + if (CHECK_FAIL(getsockname(*conn, (struct sockaddr *)dst, &slen))) goto close_conn; - slen = sizeof(ss); - if (CHECK_FAIL(getpeername(*conn, sa, &slen))) + slen = sizeof(*src); + if (CHECK_FAIL(getpeername(*conn, (struct sockaddr *)src, &slen))) goto close_conn; - if (CHECK_FAIL(!fill_addr_port(sa, &tuple->src))) - goto close_conn; - - tuple->family = ss.ss_family; return true; close_conn: @@ -110,17 +68,16 @@ static socklen_t prepare_addr(struct sockaddr_storage *addr, int family) { struct sockaddr_in *addr4; struct sockaddr_in6 *addr6; + memset(addr, 0, sizeof(*addr)); switch (family) { case AF_INET: addr4 = (struct sockaddr_in *)addr; - memset(addr4, 0, sizeof(*addr4)); addr4->sin_family = family; addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK); return sizeof(*addr4); case AF_INET6: addr6 = (struct sockaddr_in6 *)addr; - memset(addr6, 0, sizeof(*addr6)); addr6->sin6_family = family; addr6->sin6_addr = in6addr_loopback; return sizeof(*addr6); @@ -242,9 +199,15 @@ static void encap_init(encap_headers_t *encap, uint8_t hop_count, uint8_t proto) } static size_t build_input(const struct test_cfg *test, void *const buf, - const struct tuple *tuple) + const struct sockaddr_storage *src, + const struct sockaddr_storage *dst) { - in_port_t sport = tuple->src.port; + struct sockaddr_in6 *src_in6 = (struct sockaddr_in6 *)src; + struct sockaddr_in6 *dst_in6 = (struct sockaddr_in6 *)dst; + struct sockaddr_in *src_in = (struct sockaddr_in *)src; + struct sockaddr_in *dst_in = (struct sockaddr_in *)dst; + sa_family_t family = src->ss_family; + in_port_t sport, dport; encap_headers_t encap; struct iphdr ip; struct ipv6hdr ipv6; @@ -254,8 +217,11 @@ static size_t build_input(const struct test_cfg *test, void *const buf, uint8_t *p = buf; int proto; + sport = (family == AF_INET) ? src_in->sin_port : src_in6->sin6_port; + dport = (family == AF_INET) ? dst_in->sin_port : dst_in6->sin6_port; + proto = IPPROTO_IPIP; - if (tuple->family == AF_INET6) + if (family == AF_INET6) proto = IPPROTO_IPV6; encap_init(&encap, test->hops == ONE_HOP ? 
1 : 0, proto); @@ -270,15 +236,15 @@ static size_t build_input(const struct test_cfg *test, void *const buf, if (test->type == UDP) proto = IPPROTO_UDP; - switch (tuple->family) { + switch (family) { case AF_INET: ip = (struct iphdr){ .ihl = 5, .version = 4, .ttl = IPDEFTTL, .protocol = proto, - .saddr = tuple->src.in_addr.s_addr, - .daddr = tuple->dst.in_addr.s_addr, + .saddr = src_in->sin_addr.s_addr, + .daddr = dst_in->sin_addr.s_addr, }; p = mempcpy(p, &ip, sizeof(ip)); break; @@ -287,8 +253,8 @@ static size_t build_input(const struct test_cfg *test, void *const buf, .version = 6, .hop_limit = IPDEFTTL, .nexthdr = proto, - .saddr = tuple->src.in6_addr, - .daddr = tuple->dst.in6_addr, + .saddr = src_in6->sin6_addr, + .daddr = dst_in6->sin6_addr, }; p = mempcpy(p, &ipv6, sizeof(ipv6)); break; @@ -303,18 +269,16 @@ static size_t build_input(const struct test_cfg *test, void *const buf, case TCP: tcp = (struct tcphdr){ .source = sport, - .dest = tuple->dst.port, + .dest = dport, + .syn = (test->flags == SYN), + .ack = (test->flags == ACK), }; - if (test->flags == SYN) - tcp.syn = true; - if (test->flags == ACK) - tcp.ack = true; p = mempcpy(p, &tcp, sizeof(tcp)); break; case UDP: udp = (struct udphdr){ .source = sport, - .dest = tuple->dst.port, + .dest = dport, }; p = mempcpy(p, &udp, sizeof(udp)); break; @@ -339,27 +303,26 @@ static void test_cls_redirect_common(struct bpf_program *prog) LIBBPF_OPTS(bpf_test_run_opts, tattr); int families[] = { AF_INET, AF_INET6 }; struct sockaddr_storage ss; - struct sockaddr *addr; socklen_t slen; int i, j, err, prog_fd; int servers[__NR_KIND][ARRAY_SIZE(families)] = {}; int conns[__NR_KIND][ARRAY_SIZE(families)] = {}; - struct tuple tuples[__NR_KIND][ARRAY_SIZE(families)]; + struct sockaddr_storage srcs[__NR_KIND][ARRAY_SIZE(families)]; + struct sockaddr_storage dsts[__NR_KIND][ARRAY_SIZE(families)]; - addr = (struct sockaddr *)&ss; for (i = 0; i < ARRAY_SIZE(families); i++) { slen = prepare_addr(&ss, families[i]); if (CHECK_FAIL(!slen)) goto cleanup; - if (CHECK_FAIL(!set_up_conn(addr, slen, SOCK_DGRAM, + if (CHECK_FAIL(!set_up_conn(&ss, slen, SOCK_DGRAM, &servers[UDP][i], &conns[UDP][i], - &tuples[UDP][i]))) + &srcs[UDP][i], &dsts[UDP][i]))) goto cleanup; - if (CHECK_FAIL(!set_up_conn(addr, slen, SOCK_STREAM, + if (CHECK_FAIL(!set_up_conn(&ss, slen, SOCK_STREAM, &servers[TCP][i], &conns[TCP][i], - &tuples[TCP][i]))) + &srcs[TCP][i], &dsts[TCP][i]))) goto cleanup; } @@ -368,11 +331,12 @@ static void test_cls_redirect_common(struct bpf_program *prog) struct test_cfg *test = &tests[i]; for (j = 0; j < ARRAY_SIZE(families); j++) { - struct tuple *tuple = &tuples[test->type][j]; + struct sockaddr_storage *src = &srcs[test->type][j]; + struct sockaddr_storage *dst = &dsts[test->type][j]; char input[256]; char tmp[256]; - test_str(tmp, sizeof(tmp), test, tuple->family); + test_str(tmp, sizeof(tmp), test, families[j]); if (!test__start_subtest(tmp)) continue; @@ -380,7 +344,7 @@ static void test_cls_redirect_common(struct bpf_program *prog) tattr.data_size_out = sizeof(tmp); tattr.data_in = input; - tattr.data_size_in = build_input(test, input, tuple); + tattr.data_size_in = build_input(test, input, src, dst); if (CHECK_FAIL(!tattr.data_size_in)) continue; diff --git a/tools/testing/selftests/bpf/prog_tests/file_reader.c b/tools/testing/selftests/bpf/prog_tests/file_reader.c new file mode 100644 index 00000000000000..5cde32b35da447 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/file_reader.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0 
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#include <test_progs.h> +#include <network_helpers.h> +#include "file_reader.skel.h" +#include "file_reader_fail.skel.h" +#include <dlfcn.h> +#include <sys/mman.h> + +const char *user_ptr = "hello world"; +char file_contents[256000]; + +void *get_executable_base_addr(void) +{ + Dl_info info; + + if (!dladdr((void *)&get_executable_base_addr, &info)) { + fprintf(stderr, "dladdr failed\n"); + return NULL; + } + + return info.dli_fbase; +} + +static int initialize_file_contents(void) +{ + int fd, page_sz = sysconf(_SC_PAGESIZE); + ssize_t n = 0, cur, off; + void *addr; + + fd = open("/proc/self/exe", O_RDONLY); + if (!ASSERT_OK_FD(fd, "Open /proc/self/exe\n")) + return 1; + + do { + cur = read(fd, file_contents + n, sizeof(file_contents) - n); + if (!ASSERT_GT(cur, 0, "read success")) + break; + n += cur; + } while (n < sizeof(file_contents)); + + close(fd); + + if (!ASSERT_EQ(n, sizeof(file_contents), "Read /proc/self/exe\n")) + return 1; + + addr = get_executable_base_addr(); + if (!ASSERT_NEQ(addr, NULL, "get executable address")) + return 1; + + /* page-align base file address */ + addr = (void *)((unsigned long)addr & ~(page_sz - 1)); + + /* + * Page out range 0..512K, use 0..256K for positive tests and + * 256K..512K for negative tests expecting page faults + */ + for (off = 0; off < sizeof(file_contents) * 2; off += page_sz) { + if (!ASSERT_OK(madvise(addr + off, page_sz, MADV_PAGEOUT), + "madvise pageout")) + return errno; + } + + return 0; +} + +static void run_test(const char *prog_name) +{ + struct file_reader *skel; + struct bpf_program *prog; + int err, fd; + + err = initialize_file_contents(); + if (!ASSERT_OK(err, "initialize file contents")) + return; + + skel = file_reader__open(); + if (!ASSERT_OK_PTR(skel, "file_reader__open")) + return; + + bpf_object__for_each_program(prog, skel->obj) { + bpf_program__set_autoload(prog, strcmp(bpf_program__name(prog), prog_name) == 0); + } + + memcpy(skel->bss->user_buf, file_contents, sizeof(file_contents)); + skel->bss->pid = getpid(); + + err = file_reader__load(skel); + if (!ASSERT_OK(err, "file_reader__load")) + goto cleanup; + + err = file_reader__attach(skel); + if (!ASSERT_OK(err, "file_reader__attach")) + goto cleanup; + + fd = open("/proc/self/exe", O_RDONLY); + if (fd >= 0) + close(fd); + + ASSERT_EQ(skel->bss->err, 0, "err"); + ASSERT_EQ(skel->bss->run_success, 1, "run_success"); +cleanup: + file_reader__destroy(skel); +} + +void test_file_reader(void) +{ + if (test__start_subtest("on_open_expect_fault")) + run_test("on_open_expect_fault"); + + if (test__start_subtest("on_open_validate_file_read")) + run_test("on_open_validate_file_read"); + + if (test__start_subtest("negative")) + RUN_TESTS(file_reader_fail); +} diff --git a/tools/testing/selftests/bpf/prog_tests/htab_update.c b/tools/testing/selftests/bpf/prog_tests/htab_update.c index 2bc85f4814f4f9..d0b405eb296627 100644 --- a/tools/testing/selftests/bpf/prog_tests/htab_update.c +++ b/tools/testing/selftests/bpf/prog_tests/htab_update.c @@ -15,17 +15,17 @@ struct htab_update_ctx { static void test_reenter_update(void) { struct htab_update *skel; - unsigned int key, value; + void *value = NULL; + unsigned int key, value_size; int err; skel = htab_update__open(); if (!ASSERT_OK_PTR(skel, "htab_update__open")) return; - /* lookup_elem_raw() may be inlined and find_kernel_btf_id() will return -ESRCH */ - bpf_program__set_autoload(skel->progs.lookup_elem_raw, true); + 
bpf_program__set_autoload(skel->progs.bpf_obj_free_fields, true); err = htab_update__load(skel); - if (!ASSERT_TRUE(!err || err == -ESRCH, "htab_update__load") || err) + if (!ASSERT_TRUE(!err, "htab_update__load") || err) goto out; skel->bss->pid = getpid(); @@ -33,14 +33,33 @@ static void test_reenter_update(void) if (!ASSERT_OK(err, "htab_update__attach")) goto out; - /* Will trigger the reentrancy of bpf_map_update_elem() */ + value_size = bpf_map__value_size(skel->maps.htab); + + value = calloc(1, value_size); + if (!ASSERT_OK_PTR(value, "calloc value")) + goto out; + /* + * First update: plain insert. This should NOT trigger the re-entrancy + * path, because there is no old element to free yet. + */ key = 0; - value = 0; - err = bpf_map_update_elem(bpf_map__fd(skel->maps.htab), &key, &value, 0); - if (!ASSERT_OK(err, "add element")) + err = bpf_map_update_elem(bpf_map__fd(skel->maps.htab), &key, value, BPF_ANY); + if (!ASSERT_OK(err, "first update (insert)")) + goto out; + + /* + * Second update: replace existing element with same key and trigger + * the reentrancy of bpf_map_update_elem(). + * check_and_free_fields() calls bpf_obj_free_fields() on the old + * value, which is where fentry program runs and performs a nested + * bpf_map_update_elem(), triggering -EDEADLK. + */ + memset(value, 0, value_size); + err = bpf_map_update_elem(bpf_map__fd(skel->maps.htab), &key, value, BPF_ANY); + if (!ASSERT_OK(err, "second update (replace)")) goto out; - ASSERT_EQ(skel->bss->update_err, -EBUSY, "no reentrancy"); + ASSERT_EQ(skel->bss->update_err, -EDEADLK, "no reentrancy"); out: htab_update__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/kmem_cache_iter.c b/tools/testing/selftests/bpf/prog_tests/kmem_cache_iter.c index 1de14b111931aa..6e35e13c20220e 100644 --- a/tools/testing/selftests/bpf/prog_tests/kmem_cache_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/kmem_cache_iter.c @@ -57,7 +57,8 @@ static void subtest_kmem_cache_iter_check_slabinfo(struct kmem_cache_iter *skel) if (!ASSERT_OK(ret, "kmem_cache_lookup")) break; - ASSERT_STREQ(r.name, name, "kmem_cache_name"); + ASSERT_STRNEQ(r.name, name, sizeof(r.name) - 1, + "kmem_cache_name"); ASSERT_EQ(r.obj_size, objsize, "kmem_cache_objsize"); seen++; diff --git a/tools/testing/selftests/bpf/prog_tests/perf_branches.c b/tools/testing/selftests/bpf/prog_tests/perf_branches.c index bc24f83339d642..0a7ef770c487c8 100644 --- a/tools/testing/selftests/bpf/prog_tests/perf_branches.c +++ b/tools/testing/selftests/bpf/prog_tests/perf_branches.c @@ -15,6 +15,10 @@ static void check_good_sample(struct test_perf_branches *skel) int pbe_size = sizeof(struct perf_branch_entry); int duration = 0; + if (CHECK(!skel->bss->run_cnt, "invalid run_cnt", + "checked sample validity before prog run")) + return; + if (CHECK(!skel->bss->valid, "output not valid", "no valid sample from prog")) return; @@ -45,6 +49,10 @@ static void check_bad_sample(struct test_perf_branches *skel) int written_stack = skel->bss->written_stack_out; int duration = 0; + if (CHECK(!skel->bss->run_cnt, "invalid run_cnt", + "checked sample validity before prog run")) + return; + if (CHECK(!skel->bss->valid, "output not valid", "no valid sample from prog")) return; @@ -83,8 +91,12 @@ static void test_perf_branches_common(int perf_fd, err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set), &cpu_set); if (CHECK(err, "set_affinity", "cpu #0, err %d\n", err)) goto out_destroy; - /* spin the loop for a while (random high number) */ - for (i = 0; i < 1000000; ++i) + + 
/* Spin the loop for a while by using a high iteration count, and by + * checking whether the specific run count marker has been explicitly + * incremented at least once by the backing perf_event BPF program. + */ + for (i = 0; i < 100000000 && !*(volatile int *)&skel->bss->run_cnt; ++i) ++j; test_perf_branches__detach(skel); @@ -116,11 +128,11 @@ static void test_perf_branches_hw(void) pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC); /* - * Some setups don't support branch records (virtual machines, !x86), - * so skip test in this case. + * Some setups don't support LBR (virtual machines, !x86, AMD Milan Zen + * 3 which only supports BRS), so skip test in this case. */ if (pfd < 0) { - if (errno == ENOENT || errno == EOPNOTSUPP) { + if (errno == ENOENT || errno == EOPNOTSUPP || errno == EINVAL) { printf("%s:SKIP:no PERF_SAMPLE_BRANCH_STACK\n", __func__); test__skip(); diff --git a/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c b/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c index c9f855e5da2409..246eb259c08a18 100644 --- a/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c +++ b/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c @@ -28,6 +28,7 @@ static void test_success(void) bpf_program__set_autoload(skel->progs.two_regions, true); bpf_program__set_autoload(skel->progs.non_sleepable_1, true); bpf_program__set_autoload(skel->progs.non_sleepable_2, true); + bpf_program__set_autoload(skel->progs.nested_rcu_region, true); bpf_program__set_autoload(skel->progs.task_trusted_non_rcuptr, true); bpf_program__set_autoload(skel->progs.rcu_read_lock_subprog, true); bpf_program__set_autoload(skel->progs.rcu_read_lock_global_subprog, true); @@ -78,7 +79,8 @@ static const char * const inproper_region_tests[] = { "non_sleepable_rcu_mismatch", "inproper_sleepable_helper", "inproper_sleepable_kfunc", - "nested_rcu_region", + "nested_rcu_region_unbalanced_1", + "nested_rcu_region_unbalanced_2", "rcu_read_lock_global_subprog_lock", "rcu_read_lock_global_subprog_unlock", "rcu_read_lock_sleepable_helper_global_subprog", diff --git a/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c b/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c index d6bd5e16e6372a..d2c0542716a848 100644 --- a/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c +++ b/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c @@ -44,3 +44,59 @@ void test_refcounted_kptr_wrong_owner(void) ASSERT_OK(opts.retval, "rbtree_wrong_owner_remove_fail_a2 retval"); refcounted_kptr__destroy(skel); } + +void test_percpu_hash_refcounted_kptr_refcount_leak(void) +{ + struct refcounted_kptr *skel; + int cpu_nr, fd, err, key = 0; + struct bpf_map *map; + size_t values_sz; + u64 *values; + LIBBPF_OPTS(bpf_test_run_opts, opts, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .repeat = 1, + ); + + cpu_nr = libbpf_num_possible_cpus(); + if (!ASSERT_GT(cpu_nr, 0, "libbpf_num_possible_cpus")) + return; + + values = calloc(cpu_nr, sizeof(u64)); + if (!ASSERT_OK_PTR(values, "calloc values")) + return; + + skel = refcounted_kptr__open_and_load(); + if (!ASSERT_OK_PTR(skel, "refcounted_kptr__open_and_load")) { + free(values); + return; + } + + values_sz = cpu_nr * sizeof(u64); + memset(values, 0, values_sz); + + map = skel->maps.percpu_hash; + err = bpf_map__update_elem(map, &key, sizeof(key), values, values_sz, 0); + if (!ASSERT_OK(err, "bpf_map__update_elem")) + goto out; + + fd = bpf_program__fd(skel->progs.percpu_hash_refcount_leak); + err = bpf_prog_test_run_opts(fd, &opts); + if 
(!ASSERT_OK(err, "bpf_prog_test_run_opts")) + goto out; + if (!ASSERT_EQ(opts.retval, 2, "opts.retval")) + goto out; + + err = bpf_map__update_elem(map, &key, sizeof(key), values, values_sz, 0); + if (!ASSERT_OK(err, "bpf_map__update_elem")) + goto out; + + fd = bpf_program__fd(skel->progs.check_percpu_hash_refcount); + err = bpf_prog_test_run_opts(fd, &opts); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + ASSERT_EQ(opts.retval, 1, "opts.retval"); + +out: + refcounted_kptr__destroy(skel); + free(values); +} diff --git a/tools/testing/selftests/bpf/prog_tests/res_spin_lock.c b/tools/testing/selftests/bpf/prog_tests/res_spin_lock.c index 8c6c2043a43275..f0a8c828f8f10b 100644 --- a/tools/testing/selftests/bpf/prog_tests/res_spin_lock.c +++ b/tools/testing/selftests/bpf/prog_tests/res_spin_lock.c @@ -110,8 +110,8 @@ void serial_test_res_spin_lock_stress(void) ASSERT_OK(load_module("bpf_test_rqspinlock.ko", false), "load module AA"); sleep(5); unload_module("bpf_test_rqspinlock", false); - - ASSERT_OK(load_module_params("bpf_test_rqspinlock.ko", "test_ab=1", false), "load module ABBA"); - sleep(5); - unload_module("bpf_test_rqspinlock", false); + /* + * Insert bpf_test_rqspinlock.ko manually with test_mode=[1|2] to test + * other cases (ABBA, ABBCCA). + */ } diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c index d1e4cb28a72c6b..64520684d2cb9b 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c @@ -17,6 +17,7 @@ #include "test_ringbuf_n.lskel.h" #include "test_ringbuf_map_key.lskel.h" #include "test_ringbuf_write.lskel.h" +#include "test_ringbuf_overwrite.lskel.h" #define EDONE 7777 @@ -497,6 +498,68 @@ cleanup: test_ringbuf_map_key_lskel__destroy(skel_map_key); } +static void ringbuf_overwrite_mode_subtest(void) +{ + unsigned long size, len1, len2, len3, len4, len5; + unsigned long expect_avail_data, expect_prod_pos, expect_over_pos; + struct test_ringbuf_overwrite_lskel *skel; + int page_size = getpagesize(); + int err; + + skel = test_ringbuf_overwrite_lskel__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + size = page_size; + len1 = page_size / 2; + len2 = page_size / 4; + len3 = size - len1 - len2 - BPF_RINGBUF_HDR_SZ * 3; + len4 = len3 - 8; + len5 = len3; /* retry with len3 */ + + skel->maps.ringbuf.max_entries = size; + skel->rodata->LEN1 = len1; + skel->rodata->LEN2 = len2; + skel->rodata->LEN3 = len3; + skel->rodata->LEN4 = len4; + skel->rodata->LEN5 = len5; + + skel->bss->pid = getpid(); + + err = test_ringbuf_overwrite_lskel__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + err = test_ringbuf_overwrite_lskel__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto cleanup; + + syscall(__NR_getpgid); + + ASSERT_EQ(skel->bss->reserve1_fail, 0, "reserve 1"); + ASSERT_EQ(skel->bss->reserve2_fail, 0, "reserve 2"); + ASSERT_EQ(skel->bss->reserve3_fail, 1, "reserve 3"); + ASSERT_EQ(skel->bss->reserve4_fail, 0, "reserve 4"); + ASSERT_EQ(skel->bss->reserve5_fail, 0, "reserve 5"); + + ASSERT_EQ(skel->bss->ring_size, size, "check_ring_size"); + + expect_avail_data = len2 + len4 + len5 + 3 * BPF_RINGBUF_HDR_SZ; + ASSERT_EQ(skel->bss->avail_data, expect_avail_data, "check_avail_size"); + + ASSERT_EQ(skel->bss->cons_pos, 0, "check_cons_pos"); + + expect_prod_pos = len1 + len2 + len4 + len5 + 4 * BPF_RINGBUF_HDR_SZ; + ASSERT_EQ(skel->bss->prod_pos, expect_prod_pos, "check_prod_pos"); + + expect_over_pos = len1 + BPF_RINGBUF_HDR_SZ; + 
ASSERT_EQ(skel->bss->over_pos, expect_over_pos, "check_over_pos"); + + test_ringbuf_overwrite_lskel__detach(skel); +cleanup: + test_ringbuf_overwrite_lskel__destroy(skel); +} + void test_ringbuf(void) { if (test__start_subtest("ringbuf")) @@ -507,4 +570,6 @@ void test_ringbuf(void) ringbuf_map_key_subtest(); if (test__start_subtest("ringbuf_write")) ringbuf_write_subtest(); + if (test__start_subtest("ringbuf_overwrite_mode")) + ringbuf_overwrite_mode_subtest(); } diff --git a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c index 036d4760d2c16c..3dbcc091f16c93 100644 --- a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c +++ b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c @@ -41,11 +41,7 @@ static struct bpf_object *obj; static __u32 index_zero; static int epfd; -static union sa46 { - struct sockaddr_in6 v6; - struct sockaddr_in v4; - sa_family_t family; -} srv_sa; +static struct sockaddr_storage srv_sa; #define RET_IF(condition, tag, format...) ({ \ if (CHECK_FAIL(condition)) { \ @@ -135,24 +131,24 @@ static int prepare_bpf_obj(void) return 0; } -static void sa46_init_loopback(union sa46 *sa, sa_family_t family) +static void ss_init_loopback(struct sockaddr_storage *sa, sa_family_t family) { memset(sa, 0, sizeof(*sa)); - sa->family = family; - if (sa->family == AF_INET6) - sa->v6.sin6_addr = in6addr_loopback; + sa->ss_family = family; + if (sa->ss_family == AF_INET6) + ((struct sockaddr_in6 *)sa)->sin6_addr = in6addr_loopback; else - sa->v4.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + ((struct sockaddr_in *)sa)->sin_addr.s_addr = htonl(INADDR_LOOPBACK); } -static void sa46_init_inany(union sa46 *sa, sa_family_t family) +static void ss_init_inany(struct sockaddr_storage *sa, sa_family_t family) { memset(sa, 0, sizeof(*sa)); - sa->family = family; - if (sa->family == AF_INET6) - sa->v6.sin6_addr = in6addr_any; + sa->ss_family = family; + if (sa->ss_family == AF_INET6) + ((struct sockaddr_in6 *)sa)->sin6_addr = in6addr_any; else - sa->v4.sin_addr.s_addr = INADDR_ANY; + ((struct sockaddr_in *)sa)->sin_addr.s_addr = INADDR_ANY; } static int read_int_sysctl(const char *sysctl) @@ -228,7 +224,7 @@ static void check_data(int type, sa_family_t family, const struct cmd *cmd, int cli_fd) { struct data_check expected = {}, result; - union sa46 cli_sa; + struct sockaddr_storage cli_sa; socklen_t addrlen; int err; @@ -251,26 +247,32 @@ static void check_data(int type, sa_family_t family, const struct cmd *cmd, } if (family == AF_INET6) { + struct sockaddr_in6 *srv_v6 = (struct sockaddr_in6 *)&srv_sa; + struct sockaddr_in6 *cli_v6 = (struct sockaddr_in6 *)&cli_sa; + expected.eth_protocol = htons(ETH_P_IPV6); - expected.bind_inany = !srv_sa.v6.sin6_addr.s6_addr32[3] && - !srv_sa.v6.sin6_addr.s6_addr32[2] && - !srv_sa.v6.sin6_addr.s6_addr32[1] && - !srv_sa.v6.sin6_addr.s6_addr32[0]; + expected.bind_inany = !srv_v6->sin6_addr.s6_addr32[3] && + !srv_v6->sin6_addr.s6_addr32[2] && + !srv_v6->sin6_addr.s6_addr32[1] && + !srv_v6->sin6_addr.s6_addr32[0]; - memcpy(&expected.skb_addrs[0], cli_sa.v6.sin6_addr.s6_addr32, - sizeof(cli_sa.v6.sin6_addr)); + memcpy(&expected.skb_addrs[0], cli_v6->sin6_addr.s6_addr32, + sizeof(cli_v6->sin6_addr)); memcpy(&expected.skb_addrs[4], &in6addr_loopback, sizeof(in6addr_loopback)); - expected.skb_ports[0] = cli_sa.v6.sin6_port; - expected.skb_ports[1] = srv_sa.v6.sin6_port; + expected.skb_ports[0] = cli_v6->sin6_port; + expected.skb_ports[1] = srv_v6->sin6_port; } else { + struct sockaddr_in 
*srv_v4 = (struct sockaddr_in *)&srv_sa; + struct sockaddr_in *cli_v4 = (struct sockaddr_in *)&cli_sa; + expected.eth_protocol = htons(ETH_P_IP); - expected.bind_inany = !srv_sa.v4.sin_addr.s_addr; + expected.bind_inany = !srv_v4->sin_addr.s_addr; - expected.skb_addrs[0] = cli_sa.v4.sin_addr.s_addr; + expected.skb_addrs[0] = cli_v4->sin_addr.s_addr; expected.skb_addrs[1] = htonl(INADDR_LOOPBACK); - expected.skb_ports[0] = cli_sa.v4.sin_port; - expected.skb_ports[1] = srv_sa.v4.sin_port; + expected.skb_ports[0] = cli_v4->sin_port; + expected.skb_ports[1] = srv_v4->sin_port; } if (memcmp(&result, &expected, offsetof(struct data_check, @@ -364,16 +366,15 @@ static void check_results(void) static int send_data(int type, sa_family_t family, void *data, size_t len, enum result expected) { - union sa46 cli_sa; + struct sockaddr_storage cli_sa; int fd, err; fd = socket(family, type, 0); RET_ERR(fd == -1, "socket()", "fd:%d errno:%d\n", fd, errno); - sa46_init_loopback(&cli_sa, family); + ss_init_loopback(&cli_sa, family); err = bind(fd, (struct sockaddr *)&cli_sa, sizeof(cli_sa)); RET_ERR(fd == -1, "bind(cli_sa)", "err:%d errno:%d\n", err, errno); - err = sendto(fd, data, len, MSG_FASTOPEN, (struct sockaddr *)&srv_sa, sizeof(srv_sa)); RET_ERR(err != len && expected >= PASS, @@ -589,9 +590,9 @@ static void prepare_sk_fds(int type, sa_family_t family, bool inany) socklen_t addrlen; if (inany) - sa46_init_inany(&srv_sa, family); + ss_init_inany(&srv_sa, family); else - sa46_init_loopback(&srv_sa, family); + ss_init_loopback(&srv_sa, family); addrlen = sizeof(srv_sa); /* diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c index 1702aa592c2c21..7ac4d5a488aa54 100644 --- a/tools/testing/selftests/bpf/prog_tests/send_signal.c +++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c @@ -206,6 +206,11 @@ destroy_skel: skel_open_load_failure: close(pipe_c2p[0]); close(pipe_p2c[1]); + /* + * Child is either about to exit cleanly or stuck in case of errors. + * Nudge it to exit. 
+ */ + kill(pid, SIGKILL); wait(NULL); } diff --git a/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c b/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c index 4d66fad3c8bdb9..0f3bf594e7a5eb 100644 --- a/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c +++ b/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c @@ -20,7 +20,9 @@ static const char * const test_cases[] = { "strcspn_str", "strcspn_reject", "strstr", + "strcasestr", "strnstr", + "strncasestr", }; void run_too_long_tests(void) diff --git a/tools/testing/selftests/bpf/prog_tests/test_lsm.c b/tools/testing/selftests/bpf/prog_tests/test_lsm.c index 2a27f3714f5cfd..bdc4fc06bc5a52 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_lsm.c +++ b/tools/testing/selftests/bpf/prog_tests/test_lsm.c @@ -139,7 +139,7 @@ static void test_lsm_tailcall(void) if (CHECK_FAIL(!err)) goto close_prog; - prog_fd = bpf_program__fd(skel->progs.lsm_file_alloc_security_prog); + prog_fd = bpf_program__fd(skel->progs.lsm_kernfs_init_security_prog); if (CHECK_FAIL(prog_fd < 0)) goto close_prog; diff --git a/tools/testing/selftests/bpf/prog_tests/test_tc_edt.c b/tools/testing/selftests/bpf/prog_tests/test_tc_edt.c new file mode 100644 index 00000000000000..462512fb191f1c --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_tc_edt.c @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause + +/* + * BPF-based flow shaping + * + * The test brings up two veth in two isolated namespaces, attach some flow + * shaping program onto it, and ensures that a manual speedtest maximum + * value matches the rate set in the BPF shapers. + */ + +#include <asm-generic/socket.h> +#include <stdio.h> +#include <unistd.h> +#include <fcntl.h> +#include <math.h> +#include <sys/time.h> +#include <sys/socket.h> +#include <bpf/libbpf.h> +#include <pthread.h> +#include "test_progs.h" +#include "network_helpers.h" +#include "test_tc_edt.skel.h" + +#define SERVER_NS "tc-edt-server-ns" +#define CLIENT_NS "tc-edt-client-ns" +#define IP4_ADDR_VETH1 "192.168.1.1" +#define IP4_ADDR_VETH2 "192.168.1.2" +#define IP4_ADDR_VETH2_HEX 0xC0A80102 + +#define TIMEOUT_MS 2000 +#define TEST_PORT 9000 +#define TARGET_RATE_MBPS 5.0 +#define TX_BYTES_COUNT (1 * 1000 * 1000) +#define RATE_ERROR_PERCENT 2.0 + +struct connection { + int server_listen_fd; + int server_conn_fd; + int client_conn_fd; +}; + +static int setup(struct test_tc_edt *skel) +{ + struct nstoken *nstoken_client, *nstoken_server; + int ret; + + if (!ASSERT_OK(make_netns(CLIENT_NS), "create client ns")) + goto fail; + if (!ASSERT_OK(make_netns(SERVER_NS), "create server ns")) + goto fail_delete_client_ns; + + nstoken_client = open_netns(CLIENT_NS); + if (!ASSERT_OK_PTR(nstoken_client, "open client ns")) + goto fail_delete_server_ns; + SYS(fail_close_client_ns, "ip link add veth1 type veth peer name %s", + "veth2 netns " SERVER_NS); + SYS(fail_close_client_ns, "ip -4 addr add " IP4_ADDR_VETH1 "/24 dev veth1"); + SYS(fail_close_client_ns, "ip link set veth1 up"); + + nstoken_server = open_netns(SERVER_NS); + if (!ASSERT_OK_PTR(nstoken_server, "enter server ns")) + goto fail_close_client_ns; + SYS(fail_close_server_ns, "ip -4 addr add " IP4_ADDR_VETH2 "/24 dev veth2"); + SYS(fail_close_server_ns, "ip link set veth2 up"); + SYS(fail_close_server_ns, "tc qdisc add dev veth2 root fq"); + ret = tc_prog_attach("veth2", -1, bpf_program__fd(skel->progs.tc_prog)); + if (!ASSERT_OK(ret, "attach bpf prog")) + goto fail_close_server_ns; + skel->bss->target_rate = TARGET_RATE_MBPS * 1000 * 1000; + 
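The hunk above only wires the target rate into the shaper's global; the BPF object test_tc_edt.bpf.c itself is not part of this excerpt. As a rough illustration of the earliest-departure-time pacing the test exercises, here is a minimal, hypothetical sketch that assumes a bytes-per-second rate and a single flow with no locking; it is not the selftest's actual program.

/* Hypothetical sketch (not the program under test): EDT pacing on tc egress.
 * Each packet is stamped with a departure time so that on average no more
 * than target_rate bytes leave per second.
 */
#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

#define NSEC_PER_SEC 1000000000ULL

__u64 target_rate;		/* bytes per second, set from userspace (assumed unit) */
__u64 next_tstamp;		/* single flow only; no locking for brevity */

SEC("tc")
int edt_shape_sketch(struct __sk_buff *skb)
{
	__u64 now = bpf_ktime_get_ns();
	__u64 delay, ts;

	if (!target_rate)
		return TC_ACT_OK;

	/* time this packet occupies the link at the target rate */
	delay = (__u64)skb->len * NSEC_PER_SEC / target_rate;
	ts = next_tstamp > now ? next_tstamp : now;

	skb->tstamp = ts;	/* earliest departure time, consumed by fq */
	next_tstamp = ts + delay;
	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";

The fq qdisc installed by setup() then holds each packet until its skb->tstamp, which is what caps the throughput measured in run_test().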
close_netns(nstoken_server); + close_netns(nstoken_client); + + return 0; + +fail_close_server_ns: + close_netns(nstoken_server); +fail_close_client_ns: + close_netns(nstoken_client); +fail_delete_server_ns: + remove_netns(SERVER_NS); +fail_delete_client_ns: + remove_netns(CLIENT_NS); +fail: + return -1; +} + +static void cleanup(void) +{ + remove_netns(CLIENT_NS); + remove_netns(SERVER_NS); +} + +static void run_test(void) +{ + int server_fd, client_fd, err; + double rate_mbps, rate_error; + struct nstoken *nstoken; + __u64 ts_start, ts_end; + + nstoken = open_netns(SERVER_NS); + if (!ASSERT_OK_PTR(nstoken, "open server ns")) + return; + server_fd = start_server(AF_INET, SOCK_STREAM, IP4_ADDR_VETH2, + TEST_PORT, TIMEOUT_MS); + if (!ASSERT_OK_FD(server_fd, "start server")) + return; + + close_netns(nstoken); + nstoken = open_netns(CLIENT_NS); + if (!ASSERT_OK_PTR(nstoken, "open client ns")) + return; + client_fd = connect_to_fd(server_fd, 0); + if (!ASSERT_OK_FD(client_fd, "connect client")) + return; + + ts_start = get_time_ns(); + err = send_recv_data(server_fd, client_fd, TX_BYTES_COUNT); + ts_end = get_time_ns(); + close_netns(nstoken); + ASSERT_OK(err, "send_recv_data"); + + rate_mbps = TX_BYTES_COUNT / ((ts_end - ts_start) / 1000.0); + rate_error = + fabs((rate_mbps - TARGET_RATE_MBPS) * 100.0 / TARGET_RATE_MBPS); + + ASSERT_LE(rate_error, RATE_ERROR_PERCENT, + "rate error is lower than threshold"); +} + +void test_tc_edt(void) +{ + struct test_tc_edt *skel; + + skel = test_tc_edt__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel open and load")) + return; + + if (!ASSERT_OK(setup(skel), "global setup")) + return; + + run_test(); + + cleanup(); + test_tc_edt__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/test_tc_tunnel.c b/tools/testing/selftests/bpf/prog_tests/test_tc_tunnel.c new file mode 100644 index 00000000000000..0fe0a8f624862d --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_tc_tunnel.c @@ -0,0 +1,714 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause + +/* + * End-to-end eBPF tunnel test suite + * The file tests BPF network tunnels implementation. 
For each tunnel + * type, the test validates that: + * - basic communication can first be established between the two veths + * - when adding a BPF-based encapsulation on client egress, it now fails + * to communicate with the server + * - when adding a kernel-based decapsulation on server ingress, client + * can now connect + * - when replacing the kernel-based decapsulation with a BPF-based one, + * the client can still connect + */ + +#include <stdio.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <bpf/libbpf.h> + +#include "test_progs.h" +#include "network_helpers.h" +#include "test_tc_tunnel.skel.h" + +#define SERVER_NS "tc-tunnel-server-ns" +#define CLIENT_NS "tc-tunnel-client-ns" +#define MAC_ADDR_VETH1 "00:11:22:33:44:55" +#define IP4_ADDR_VETH1 "192.168.1.1" +#define IP6_ADDR_VETH1 "fd::1" +#define MAC_ADDR_VETH2 "66:77:88:99:AA:BB" +#define IP4_ADDR_VETH2 "192.168.1.2" +#define IP6_ADDR_VETH2 "fd::2" + +#define TEST_NAME_MAX_LEN 64 +#define PROG_NAME_MAX_LEN 64 +#define TUNNEL_ARGS_MAX_LEN 128 +#define BUFFER_LEN 2000 +#define DEFAULT_TEST_DATA_SIZE 100 +#define GSO_TEST_DATA_SIZE BUFFER_LEN + +#define TIMEOUT_MS 1000 +#define TEST_PORT 8000 +#define UDP_PORT 5555 +#define MPLS_UDP_PORT 6635 +#define FOU_MPLS_PROTO 137 +#define VXLAN_ID 1 +#define VXLAN_PORT 8472 +#define MPLS_TABLE_ENTRIES_COUNT 65536 + +static char tx_buffer[BUFFER_LEN], rx_buffer[BUFFER_LEN]; + +struct subtest_cfg { + char *ebpf_tun_type; + char *iproute_tun_type; + char *mac_tun_type; + int ipproto; + void (*extra_decap_mod_args_cb)(struct subtest_cfg *cfg, char *dst); + bool tunnel_need_veth_mac; + bool configure_fou_rx_port; + char *tmode; + bool expect_kern_decap_failure; + bool configure_mpls; + bool test_gso; + char *tunnel_client_addr; + char *tunnel_server_addr; + char name[TEST_NAME_MAX_LEN]; + char *server_addr; + int client_egress_prog_fd; + int server_ingress_prog_fd; + char extra_decap_mod_args[TUNNEL_ARGS_MAX_LEN]; + int server_fd; +}; + +struct connection { + int client_fd; + int server_fd; +}; + +static int build_subtest_name(struct subtest_cfg *cfg, char *dst, size_t size) +{ + int ret; + + ret = snprintf(dst, size, "%s_%s", cfg->ebpf_tun_type, + cfg->mac_tun_type); + + return ret < 0 ? ret : 0; +} + +static int set_subtest_progs(struct subtest_cfg *cfg, struct test_tc_tunnel *skel) +{ + char prog_name[PROG_NAME_MAX_LEN]; + struct bpf_program *prog; + int ret; + + ret = snprintf(prog_name, PROG_NAME_MAX_LEN, "__encap_"); + if (ret < 0) + return ret; + ret = build_subtest_name(cfg, prog_name + ret, PROG_NAME_MAX_LEN - ret); + if (ret < 0) + return ret; + prog = bpf_object__find_program_by_name(skel->obj, prog_name); + if (!prog) + return -1; + + cfg->client_egress_prog_fd = bpf_program__fd(prog); + cfg->server_ingress_prog_fd = bpf_program__fd(skel->progs.decap_f); + return 0; +} + +static void set_subtest_addresses(struct subtest_cfg *cfg) +{ + if (cfg->ipproto == 6) + cfg->server_addr = IP6_ADDR_VETH2; + else + cfg->server_addr = IP4_ADDR_VETH2; + + /* Some specific tunnel types need specific addressing, it then + * has been already set in the configuration table. 
Otherwise, + * deduce the relevant addressing from the ipproto + */ + if (cfg->tunnel_client_addr && cfg->tunnel_server_addr) + return; + + if (cfg->ipproto == 6) { + cfg->tunnel_client_addr = IP6_ADDR_VETH1; + cfg->tunnel_server_addr = IP6_ADDR_VETH2; + } else { + cfg->tunnel_client_addr = IP4_ADDR_VETH1; + cfg->tunnel_server_addr = IP4_ADDR_VETH2; + } +} + +static int run_server(struct subtest_cfg *cfg) +{ + int family = cfg->ipproto == 6 ? AF_INET6 : AF_INET; + struct nstoken *nstoken; + struct network_helper_opts opts = { + .timeout_ms = TIMEOUT_MS + }; + + nstoken = open_netns(SERVER_NS); + if (!ASSERT_OK_PTR(nstoken, "open server ns")) + return -1; + + cfg->server_fd = start_server_str(family, SOCK_STREAM, cfg->server_addr, + TEST_PORT, &opts); + close_netns(nstoken); + if (!ASSERT_OK_FD(cfg->server_fd, "start server")) + return -1; + + return 0; +} + +static int check_server_rx_data(struct subtest_cfg *cfg, + struct connection *conn, int len) +{ + int err; + + memset(rx_buffer, 0, BUFFER_LEN); + err = recv(conn->server_fd, rx_buffer, len, 0); + if (!ASSERT_EQ(err, len, "check rx data len")) + return 1; + if (!ASSERT_MEMEQ(tx_buffer, rx_buffer, len, "check received data")) + return 1; + return 0; +} + +static struct connection *connect_client_to_server(struct subtest_cfg *cfg) +{ + struct network_helper_opts opts = {.timeout_ms = 500}; + int family = cfg->ipproto == 6 ? AF_INET6 : AF_INET; + struct connection *conn = NULL; + int client_fd, server_fd; + + conn = malloc(sizeof(struct connection)); + if (!conn) + return conn; + + client_fd = connect_to_addr_str(family, SOCK_STREAM, cfg->server_addr, + TEST_PORT, &opts); + + if (client_fd < 0) { + free(conn); + return NULL; + } + + server_fd = accept(cfg->server_fd, NULL, NULL); + if (server_fd < 0) { + close(client_fd); + free(conn); + return NULL; + } + + conn->server_fd = server_fd; + conn->client_fd = client_fd; + + return conn; +} + +static void disconnect_client_from_server(struct subtest_cfg *cfg, + struct connection *conn) +{ + close(conn->server_fd); + close(conn->client_fd); + free(conn); +} + +static int send_and_test_data(struct subtest_cfg *cfg, bool must_succeed) +{ + struct connection *conn; + int err, res = -1; + + conn = connect_client_to_server(cfg); + if (!must_succeed && !ASSERT_ERR_PTR(conn, "connection that must fail")) + goto end; + else if (!must_succeed) + return 0; + + if (!ASSERT_OK_PTR(conn, "connection that must succeed")) + return -1; + + err = send(conn->client_fd, tx_buffer, DEFAULT_TEST_DATA_SIZE, 0); + if (!ASSERT_EQ(err, DEFAULT_TEST_DATA_SIZE, "send data from client")) + goto end; + if (check_server_rx_data(cfg, conn, DEFAULT_TEST_DATA_SIZE)) + goto end; + + if (!cfg->test_gso) { + res = 0; + goto end; + } + + err = send(conn->client_fd, tx_buffer, GSO_TEST_DATA_SIZE, 0); + if (!ASSERT_EQ(err, GSO_TEST_DATA_SIZE, "send (large) data from client")) + goto end; + if (check_server_rx_data(cfg, conn, DEFAULT_TEST_DATA_SIZE)) + goto end; + + res = 0; +end: + disconnect_client_from_server(cfg, conn); + return res; +} + +static void vxlan_decap_mod_args_cb(struct subtest_cfg *cfg, char *dst) +{ + snprintf(dst, TUNNEL_ARGS_MAX_LEN, "id %d dstport %d udp6zerocsumrx", + VXLAN_ID, VXLAN_PORT); +} + +static void udp_decap_mod_args_cb(struct subtest_cfg *cfg, char *dst) +{ + bool is_mpls = !strcmp(cfg->mac_tun_type, "mpls"); + + snprintf(dst, TUNNEL_ARGS_MAX_LEN, + "encap fou encap-sport auto encap-dport %d", + is_mpls ? 
MPLS_UDP_PORT : UDP_PORT); +} + +static int configure_fou_rx_port(struct subtest_cfg *cfg, bool add) +{ + bool is_mpls = strcmp(cfg->mac_tun_type, "mpls") == 0; + int fou_proto; + + if (is_mpls) + fou_proto = FOU_MPLS_PROTO; + else + fou_proto = cfg->ipproto == 6 ? 41 : 4; + + SYS(fail, "ip fou %s port %d ipproto %d%s", add ? "add" : "del", + is_mpls ? MPLS_UDP_PORT : UDP_PORT, fou_proto, + cfg->ipproto == 6 ? " -6" : ""); + + return 0; +fail: + return 1; +} + +static int add_fou_rx_port(struct subtest_cfg *cfg) +{ + return configure_fou_rx_port(cfg, true); +} + +static int del_fou_rx_port(struct subtest_cfg *cfg) +{ + return configure_fou_rx_port(cfg, false); +} + +static int update_tunnel_intf_addr(struct subtest_cfg *cfg) +{ + SYS(fail, "ip link set dev testtun0 address " MAC_ADDR_VETH2); + return 0; +fail: + return -1; +} + +static int configure_kernel_for_mpls(struct subtest_cfg *cfg) +{ + SYS(fail, "sysctl -qw net.mpls.platform_labels=%d", + MPLS_TABLE_ENTRIES_COUNT); + SYS(fail, "ip -f mpls route add 1000 dev lo"); + SYS(fail, "ip link set lo up"); + SYS(fail, "sysctl -qw net.mpls.conf.testtun0.input=1"); + SYS(fail, "sysctl -qw net.ipv4.conf.lo.rp_filter=0"); + return 0; +fail: + return -1; +} + +static int configure_encapsulation(struct subtest_cfg *cfg) +{ + int ret; + + ret = tc_prog_attach("veth1", -1, cfg->client_egress_prog_fd); + + return ret; +} + +static int configure_kernel_decapsulation(struct subtest_cfg *cfg) +{ + struct nstoken *nstoken = open_netns(SERVER_NS); + int ret = -1; + + if (!ASSERT_OK_PTR(nstoken, "open server ns")) + return ret; + + if (cfg->configure_fou_rx_port && + !ASSERT_OK(add_fou_rx_port(cfg), "configure FOU RX port")) + goto fail; + SYS(fail, "ip link add name testtun0 type %s %s remote %s local %s %s", + cfg->iproute_tun_type, cfg->tmode ? 
cfg->tmode : "", + cfg->tunnel_client_addr, cfg->tunnel_server_addr, + cfg->extra_decap_mod_args); + if (cfg->tunnel_need_veth_mac && + !ASSERT_OK(update_tunnel_intf_addr(cfg), "update testtun0 mac")) + goto fail; + if (cfg->configure_mpls && + (!ASSERT_OK(configure_kernel_for_mpls(cfg), + "configure MPLS decap"))) + goto fail; + SYS(fail, "sysctl -qw net.ipv4.conf.all.rp_filter=0"); + SYS(fail, "sysctl -qw net.ipv4.conf.testtun0.rp_filter=0"); + SYS(fail, "ip link set dev testtun0 up"); + + ret = 0; +fail: + close_netns(nstoken); + return ret; +} + +static void remove_kernel_decapsulation(struct subtest_cfg *cfg) +{ + SYS_NOFAIL("ip link del testtun0"); + if (cfg->configure_mpls) + SYS_NOFAIL("ip -f mpls route del 1000 dev lo"); + if (cfg->configure_fou_rx_port) + del_fou_rx_port(cfg); +} + +static int configure_ebpf_decapsulation(struct subtest_cfg *cfg) +{ + struct nstoken *nstoken = open_netns(SERVER_NS); + int ret = -1; + + if (!ASSERT_OK_PTR(nstoken, "open server ns")) + return ret; + + if (!cfg->expect_kern_decap_failure) + SYS(fail, "ip link del testtun0"); + + if (!ASSERT_OK(tc_prog_attach("veth2", cfg->server_ingress_prog_fd, -1), + "attach_program")) + goto fail; + + ret = 0; +fail: + close_netns(nstoken); + return ret; +} + +static void run_test(struct subtest_cfg *cfg) +{ + struct nstoken *nstoken; + + if (!ASSERT_OK(run_server(cfg), "run server")) + return; + + nstoken = open_netns(CLIENT_NS); + if (!ASSERT_OK_PTR(nstoken, "open client ns")) + goto fail; + + /* Basic communication must work */ + if (!ASSERT_OK(send_and_test_data(cfg, true), "connect without any encap")) + goto fail; + + /* Attach encapsulation program to client */ + if (!ASSERT_OK(configure_encapsulation(cfg), "configure encapsulation")) + goto fail; + + /* If supported, insert kernel decap module, connection must succeed */ + if (!cfg->expect_kern_decap_failure) { + if (!ASSERT_OK(configure_kernel_decapsulation(cfg), + "configure kernel decapsulation")) + goto fail; + if (!ASSERT_OK(send_and_test_data(cfg, true), + "connect with encap prog and kern decap")) + goto fail; + } + + /* Replace kernel decapsulation with BPF decapsulation, test must pass */ + if (!ASSERT_OK(configure_ebpf_decapsulation(cfg), "configure ebpf decapsulation")) + goto fail; + ASSERT_OK(send_and_test_data(cfg, true), "connect with encap and decap progs"); + +fail: + close_netns(nstoken); + close(cfg->server_fd); +} + +static int setup(void) +{ + struct nstoken *nstoken_client, *nstoken_server; + int fd, err; + + fd = open("/dev/urandom", O_RDONLY); + if (!ASSERT_OK_FD(fd, "open urandom")) + goto fail; + err = read(fd, tx_buffer, BUFFER_LEN); + close(fd); + + if (!ASSERT_EQ(err, BUFFER_LEN, "read random bytes")) + goto fail; + + /* Configure the testing network */ + if (!ASSERT_OK(make_netns(CLIENT_NS), "create client ns") || + !ASSERT_OK(make_netns(SERVER_NS), "create server ns")) + goto fail; + + nstoken_client = open_netns(CLIENT_NS); + if (!ASSERT_OK_PTR(nstoken_client, "open client ns")) + goto fail_delete_ns; + SYS(fail_close_ns_client, "ip link add %s type veth peer name %s", + "veth1 mtu 1500 netns " CLIENT_NS " address " MAC_ADDR_VETH1, + "veth2 mtu 1500 netns " SERVER_NS " address " MAC_ADDR_VETH2); + SYS(fail_close_ns_client, "ethtool -K veth1 tso off"); + SYS(fail_close_ns_client, "ip link set veth1 up"); + nstoken_server = open_netns(SERVER_NS); + if (!ASSERT_OK_PTR(nstoken_server, "open server ns")) + goto fail_close_ns_client; + SYS(fail_close_ns_server, "ip link set veth2 up"); + + close_netns(nstoken_server); + 
close_netns(nstoken_client); + return 0; + +fail_close_ns_server: + close_netns(nstoken_server); +fail_close_ns_client: + close_netns(nstoken_client); +fail_delete_ns: + SYS_NOFAIL("ip netns del " CLIENT_NS); + SYS_NOFAIL("ip netns del " SERVER_NS); +fail: + return -1; +} + +static int subtest_setup(struct test_tc_tunnel *skel, struct subtest_cfg *cfg) +{ + struct nstoken *nstoken_client, *nstoken_server; + int ret = -1; + + set_subtest_addresses(cfg); + if (!ASSERT_OK(set_subtest_progs(cfg, skel), + "find subtest progs")) + goto fail; + if (cfg->extra_decap_mod_args_cb) + cfg->extra_decap_mod_args_cb(cfg, cfg->extra_decap_mod_args); + + nstoken_client = open_netns(CLIENT_NS); + if (!ASSERT_OK_PTR(nstoken_client, "open client ns")) + goto fail; + SYS(fail_close_client_ns, + "ip -4 addr add " IP4_ADDR_VETH1 "/24 dev veth1"); + SYS(fail_close_client_ns, "ip -4 route flush table main"); + SYS(fail_close_client_ns, + "ip -4 route add " IP4_ADDR_VETH2 " mtu 1450 dev veth1"); + SYS(fail_close_client_ns, + "ip -6 addr add " IP6_ADDR_VETH1 "/64 dev veth1 nodad"); + SYS(fail_close_client_ns, "ip -6 route flush table main"); + SYS(fail_close_client_ns, + "ip -6 route add " IP6_ADDR_VETH2 " mtu 1430 dev veth1"); + nstoken_server = open_netns(SERVER_NS); + if (!ASSERT_OK_PTR(nstoken_server, "open server ns")) + goto fail_close_client_ns; + SYS(fail_close_server_ns, + "ip -4 addr add " IP4_ADDR_VETH2 "/24 dev veth2"); + SYS(fail_close_server_ns, + "ip -6 addr add " IP6_ADDR_VETH2 "/64 dev veth2 nodad"); + + ret = 0; + +fail_close_server_ns: + close_netns(nstoken_server); +fail_close_client_ns: + close_netns(nstoken_client); +fail: + return ret; +} + + +static void subtest_cleanup(struct subtest_cfg *cfg) +{ + struct nstoken *nstoken; + + nstoken = open_netns(CLIENT_NS); + if (ASSERT_OK_PTR(nstoken, "open clien ns")) { + SYS_NOFAIL("tc qdisc delete dev veth1 parent ffff:fff1"); + SYS_NOFAIL("ip a flush veth1"); + close_netns(nstoken); + } + nstoken = open_netns(SERVER_NS); + if (ASSERT_OK_PTR(nstoken, "open clien ns")) { + SYS_NOFAIL("tc qdisc delete dev veth2 parent ffff:fff1"); + SYS_NOFAIL("ip a flush veth2"); + if (!cfg->expect_kern_decap_failure) + remove_kernel_decapsulation(cfg); + close_netns(nstoken); + } +} + +static void cleanup(void) +{ + remove_netns(CLIENT_NS); + remove_netns(SERVER_NS); +} + +static struct subtest_cfg subtests_cfg[] = { + { + .ebpf_tun_type = "ipip", + .mac_tun_type = "none", + .iproute_tun_type = "ipip", + .ipproto = 4, + }, + { + .ebpf_tun_type = "ipip6", + .mac_tun_type = "none", + .iproute_tun_type = "ip6tnl", + .ipproto = 4, + .tunnel_client_addr = IP6_ADDR_VETH1, + .tunnel_server_addr = IP6_ADDR_VETH2, + }, + { + .ebpf_tun_type = "ip6tnl", + .iproute_tun_type = "ip6tnl", + .mac_tun_type = "none", + .ipproto = 6, + }, + { + .mac_tun_type = "none", + .ebpf_tun_type = "sit", + .iproute_tun_type = "sit", + .ipproto = 6, + .tunnel_client_addr = IP4_ADDR_VETH1, + .tunnel_server_addr = IP4_ADDR_VETH2, + }, + { + .ebpf_tun_type = "vxlan", + .mac_tun_type = "eth", + .iproute_tun_type = "vxlan", + .ipproto = 4, + .extra_decap_mod_args_cb = vxlan_decap_mod_args_cb, + .tunnel_need_veth_mac = true + }, + { + .ebpf_tun_type = "ip6vxlan", + .mac_tun_type = "eth", + .iproute_tun_type = "vxlan", + .ipproto = 6, + .extra_decap_mod_args_cb = vxlan_decap_mod_args_cb, + .tunnel_need_veth_mac = true + }, + { + .ebpf_tun_type = "gre", + .mac_tun_type = "none", + .iproute_tun_type = "gre", + .ipproto = 4, + .test_gso = true + }, + { + .ebpf_tun_type = "gre", + .mac_tun_type = "eth", + 
.iproute_tun_type = "gretap", + .ipproto = 4, + .tunnel_need_veth_mac = true, + .test_gso = true + }, + { + .ebpf_tun_type = "gre", + .mac_tun_type = "mpls", + .iproute_tun_type = "gre", + .ipproto = 4, + .configure_mpls = true, + .test_gso = true + }, + { + .ebpf_tun_type = "ip6gre", + .mac_tun_type = "none", + .iproute_tun_type = "ip6gre", + .ipproto = 6, + .test_gso = true, + }, + { + .ebpf_tun_type = "ip6gre", + .mac_tun_type = "eth", + .iproute_tun_type = "ip6gretap", + .ipproto = 6, + .tunnel_need_veth_mac = true, + .test_gso = true + }, + { + .ebpf_tun_type = "ip6gre", + .mac_tun_type = "mpls", + .iproute_tun_type = "ip6gre", + .ipproto = 6, + .configure_mpls = true, + .test_gso = true + }, + { + .ebpf_tun_type = "udp", + .mac_tun_type = "none", + .iproute_tun_type = "ipip", + .ipproto = 4, + .extra_decap_mod_args_cb = udp_decap_mod_args_cb, + .configure_fou_rx_port = true, + .test_gso = true + }, + { + .ebpf_tun_type = "udp", + .mac_tun_type = "eth", + .iproute_tun_type = "ipip", + .ipproto = 4, + .extra_decap_mod_args_cb = udp_decap_mod_args_cb, + .configure_fou_rx_port = true, + .expect_kern_decap_failure = true, + .test_gso = true + }, + { + .ebpf_tun_type = "udp", + .mac_tun_type = "mpls", + .iproute_tun_type = "ipip", + .ipproto = 4, + .extra_decap_mod_args_cb = udp_decap_mod_args_cb, + .configure_fou_rx_port = true, + .tmode = "mode any ttl 255", + .configure_mpls = true, + .test_gso = true + }, + { + .ebpf_tun_type = "ip6udp", + .mac_tun_type = "none", + .iproute_tun_type = "ip6tnl", + .ipproto = 6, + .extra_decap_mod_args_cb = udp_decap_mod_args_cb, + .configure_fou_rx_port = true, + .test_gso = true + }, + { + .ebpf_tun_type = "ip6udp", + .mac_tun_type = "eth", + .iproute_tun_type = "ip6tnl", + .ipproto = 6, + .extra_decap_mod_args_cb = udp_decap_mod_args_cb, + .configure_fou_rx_port = true, + .expect_kern_decap_failure = true, + .test_gso = true + }, + { + .ebpf_tun_type = "ip6udp", + .mac_tun_type = "mpls", + .iproute_tun_type = "ip6tnl", + .ipproto = 6, + .extra_decap_mod_args_cb = udp_decap_mod_args_cb, + .configure_fou_rx_port = true, + .tmode = "mode any ttl 255", + .expect_kern_decap_failure = true, + .test_gso = true + }, +}; + +void test_tc_tunnel(void) +{ + struct test_tc_tunnel *skel; + struct subtest_cfg *cfg; + int i, ret; + + skel = test_tc_tunnel__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel open and load")) + return; + + if (!ASSERT_OK(setup(), "global setup")) + return; + + for (i = 0; i < ARRAY_SIZE(subtests_cfg); i++) { + cfg = &subtests_cfg[i]; + ret = build_subtest_name(cfg, cfg->name, TEST_NAME_MAX_LEN); + if (ret < 0 || !test__start_subtest(cfg->name)) + continue; + if (subtest_setup(skel, cfg) == 0) + run_test(cfg); + subtest_cleanup(cfg); + } + cleanup(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c index bae0e9de277d24..eb93099312720f 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c +++ b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c @@ -534,85 +534,6 @@ static void ping6_dev1(void) close_netns(nstoken); } -static int attach_tc_prog(int ifindex, int igr_fd, int egr_fd) -{ - DECLARE_LIBBPF_OPTS(bpf_tc_hook, hook, .ifindex = ifindex, - .attach_point = BPF_TC_INGRESS | BPF_TC_EGRESS); - DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts1, .handle = 1, - .priority = 1, .prog_fd = igr_fd); - DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts2, .handle = 1, - .priority = 1, .prog_fd = egr_fd); - int ret; - - ret = bpf_tc_hook_create(&hook); - if (!ASSERT_OK(ret, "create tc 
hook")) - return ret; - - if (igr_fd >= 0) { - hook.attach_point = BPF_TC_INGRESS; - ret = bpf_tc_attach(&hook, &opts1); - if (!ASSERT_OK(ret, "bpf_tc_attach")) { - bpf_tc_hook_destroy(&hook); - return ret; - } - } - - if (egr_fd >= 0) { - hook.attach_point = BPF_TC_EGRESS; - ret = bpf_tc_attach(&hook, &opts2); - if (!ASSERT_OK(ret, "bpf_tc_attach")) { - bpf_tc_hook_destroy(&hook); - return ret; - } - } - - return 0; -} - -static int generic_attach(const char *dev, int igr_fd, int egr_fd) -{ - int ifindex; - - if (!ASSERT_OK_FD(igr_fd, "check ingress fd")) - return -1; - if (!ASSERT_OK_FD(egr_fd, "check egress fd")) - return -1; - - ifindex = if_nametoindex(dev); - if (!ASSERT_NEQ(ifindex, 0, "get ifindex")) - return -1; - - return attach_tc_prog(ifindex, igr_fd, egr_fd); -} - -static int generic_attach_igr(const char *dev, int igr_fd) -{ - int ifindex; - - if (!ASSERT_OK_FD(igr_fd, "check ingress fd")) - return -1; - - ifindex = if_nametoindex(dev); - if (!ASSERT_NEQ(ifindex, 0, "get ifindex")) - return -1; - - return attach_tc_prog(ifindex, igr_fd, -1); -} - -static int generic_attach_egr(const char *dev, int egr_fd) -{ - int ifindex; - - if (!ASSERT_OK_FD(egr_fd, "check egress fd")) - return -1; - - ifindex = if_nametoindex(dev); - if (!ASSERT_NEQ(ifindex, 0, "get ifindex")) - return -1; - - return attach_tc_prog(ifindex, -1, egr_fd); -} - static void test_vxlan_tunnel(void) { struct test_tunnel_kern *skel = NULL; @@ -635,12 +556,12 @@ static void test_vxlan_tunnel(void) goto done; get_src_prog_fd = bpf_program__fd(skel->progs.vxlan_get_tunnel_src); set_src_prog_fd = bpf_program__fd(skel->progs.vxlan_set_tunnel_src); - if (generic_attach(VXLAN_TUNL_DEV1, get_src_prog_fd, set_src_prog_fd)) + if (tc_prog_attach(VXLAN_TUNL_DEV1, get_src_prog_fd, set_src_prog_fd)) goto done; /* load and attach bpf prog to veth dev tc hook point */ set_dst_prog_fd = bpf_program__fd(skel->progs.veth_set_outer_dst); - if (generic_attach_igr("veth1", set_dst_prog_fd)) + if (tc_prog_attach("veth1", set_dst_prog_fd, -1)) goto done; /* load and attach prog set_md to tunnel dev tc hook point at_ns0 */ @@ -648,7 +569,7 @@ static void test_vxlan_tunnel(void) if (!ASSERT_OK_PTR(nstoken, "setns src")) goto done; set_dst_prog_fd = bpf_program__fd(skel->progs.vxlan_set_tunnel_dst); - if (generic_attach_egr(VXLAN_TUNL_DEV0, set_dst_prog_fd)) + if (tc_prog_attach(VXLAN_TUNL_DEV0, -1, set_dst_prog_fd)) goto done; close_netns(nstoken); @@ -695,7 +616,7 @@ static void test_ip6vxlan_tunnel(void) goto done; get_src_prog_fd = bpf_program__fd(skel->progs.ip6vxlan_get_tunnel_src); set_src_prog_fd = bpf_program__fd(skel->progs.ip6vxlan_set_tunnel_src); - if (generic_attach(IP6VXLAN_TUNL_DEV1, get_src_prog_fd, set_src_prog_fd)) + if (tc_prog_attach(IP6VXLAN_TUNL_DEV1, get_src_prog_fd, set_src_prog_fd)) goto done; /* load and attach prog set_md to tunnel dev tc hook point at_ns0 */ @@ -703,7 +624,7 @@ static void test_ip6vxlan_tunnel(void) if (!ASSERT_OK_PTR(nstoken, "setns src")) goto done; set_dst_prog_fd = bpf_program__fd(skel->progs.ip6vxlan_set_tunnel_dst); - if (generic_attach_egr(IP6VXLAN_TUNL_DEV0, set_dst_prog_fd)) + if (tc_prog_attach(IP6VXLAN_TUNL_DEV0, -1, set_dst_prog_fd)) goto done; close_netns(nstoken); @@ -764,7 +685,7 @@ static void test_ipip_tunnel(enum ipip_encap encap) skel->progs.ipip_set_tunnel); } - if (generic_attach(IPIP_TUNL_DEV1, get_src_prog_fd, set_src_prog_fd)) + if (tc_prog_attach(IPIP_TUNL_DEV1, get_src_prog_fd, set_src_prog_fd)) goto done; ping_dev0(); @@ -797,7 +718,7 @@ static void 
test_xfrm_tunnel(void) /* attach tc prog to tunnel dev */ tc_prog_fd = bpf_program__fd(skel->progs.xfrm_get_state); - if (generic_attach_igr("veth1", tc_prog_fd)) + if (tc_prog_attach("veth1", tc_prog_fd, -1)) goto done; /* attach xdp prog to tunnel dev */ @@ -870,7 +791,7 @@ static void test_gre_tunnel(enum gre_test test) if (!ASSERT_OK(err, "add tunnel")) goto done; - if (generic_attach(GRE_TUNL_DEV1, get_fd, set_fd)) + if (tc_prog_attach(GRE_TUNL_DEV1, get_fd, set_fd)) goto done; ping_dev0(); @@ -911,7 +832,7 @@ static void test_ip6gre_tunnel(enum ip6gre_test test) set_fd = bpf_program__fd(skel->progs.ip6gretap_set_tunnel); get_fd = bpf_program__fd(skel->progs.ip6gretap_get_tunnel); - if (generic_attach(IP6GRE_TUNL_DEV1, get_fd, set_fd)) + if (tc_prog_attach(IP6GRE_TUNL_DEV1, get_fd, set_fd)) goto done; ping6_veth0(); @@ -954,7 +875,7 @@ static void test_erspan_tunnel(enum erspan_test test) set_fd = bpf_program__fd(skel->progs.erspan_set_tunnel); get_fd = bpf_program__fd(skel->progs.erspan_get_tunnel); - if (generic_attach(ERSPAN_TUNL_DEV1, get_fd, set_fd)) + if (tc_prog_attach(ERSPAN_TUNL_DEV1, get_fd, set_fd)) goto done; ping_dev0(); @@ -990,7 +911,7 @@ static void test_ip6erspan_tunnel(enum erspan_test test) set_fd = bpf_program__fd(skel->progs.ip4ip6erspan_set_tunnel); get_fd = bpf_program__fd(skel->progs.ip4ip6erspan_get_tunnel); - if (generic_attach(IP6ERSPAN_TUNL_DEV1, get_fd, set_fd)) + if (tc_prog_attach(IP6ERSPAN_TUNL_DEV1, get_fd, set_fd)) goto done; ping6_veth0(); @@ -1017,7 +938,7 @@ static void test_geneve_tunnel(void) set_fd = bpf_program__fd(skel->progs.geneve_set_tunnel); get_fd = bpf_program__fd(skel->progs.geneve_get_tunnel); - if (generic_attach(GENEVE_TUNL_DEV1, get_fd, set_fd)) + if (tc_prog_attach(GENEVE_TUNL_DEV1, get_fd, set_fd)) goto done; ping_dev0(); @@ -1044,7 +965,7 @@ static void test_ip6geneve_tunnel(void) set_fd = bpf_program__fd(skel->progs.ip6geneve_set_tunnel); get_fd = bpf_program__fd(skel->progs.ip6geneve_get_tunnel); - if (generic_attach(IP6GENEVE_TUNL_DEV1, get_fd, set_fd)) + if (tc_prog_attach(IP6GENEVE_TUNL_DEV1, get_fd, set_fd)) goto done; ping_dev0(); @@ -1083,7 +1004,7 @@ static void test_ip6tnl_tunnel(enum ip6tnl_test test) get_fd = bpf_program__fd(skel->progs.ip6ip6_get_tunnel); break; } - if (generic_attach(IP6TNL_TUNL_DEV1, get_fd, set_fd)) + if (tc_prog_attach(IP6TNL_TUNL_DEV1, get_fd, set_fd)) goto done; ping6_veth0(); diff --git a/tools/testing/selftests/bpf/prog_tests/test_xsk.c b/tools/testing/selftests/bpf/prog_tests/test_xsk.c new file mode 100644 index 00000000000000..5af28f359cfda4 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_xsk.c @@ -0,0 +1,2596 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <bpf/bpf.h> +#include <errno.h> +#include <linux/bitmap.h> +#include <linux/if_link.h> +#include <linux/mman.h> +#include <linux/netdev.h> +#include <poll.h> +#include <pthread.h> +#include <signal.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <unistd.h> + +#include "network_helpers.h" +#include "test_xsk.h" +#include "xsk_xdp_common.h" +#include "xsk_xdp_progs.skel.h" + +#define DEFAULT_BATCH_SIZE 64 +#define MIN_PKT_SIZE 64 +#define MAX_ETH_JUMBO_SIZE 9000 +#define MAX_INTERFACES 2 +#define MAX_TEARDOWN_ITER 10 +#define MAX_TX_BUDGET_DEFAULT 32 +#define PKT_DUMP_NB_TO_PRINT 16 +/* Just to align the data in the packet */ +#define PKT_HDR_SIZE (sizeof(struct ethhdr) + 2) +#define POLL_TMOUT 1000 +#define THREAD_TMOUT 3 +#define UMEM_HEADROOM_TEST_SIZE 128 
+#define XSK_DESC__INVALID_OPTION (0xffff) +#define XSK_UMEM__INVALID_FRAME_SIZE (MAX_ETH_JUMBO_SIZE + 1) +#define XSK_UMEM__LARGE_FRAME_SIZE (3 * 1024) +#define XSK_UMEM__MAX_FRAME_SIZE (4 * 1024) + +static const u8 g_mac[ETH_ALEN] = {0x55, 0x44, 0x33, 0x22, 0x11, 0x00}; + +bool opt_verbose; +pthread_barrier_t barr; +pthread_mutex_t pacing_mutex = PTHREAD_MUTEX_INITIALIZER; + +int pkts_in_flight; + +/* The payload is a word consisting of a packet sequence number in the upper + * 16-bits and a intra packet data sequence number in the lower 16 bits. So the 3rd packet's + * 5th word of data will contain the number (2<<16) | 4 as they are numbered from 0. + */ +static void write_payload(void *dest, u32 pkt_nb, u32 start, u32 size) +{ + u32 *ptr = (u32 *)dest, i; + + start /= sizeof(*ptr); + size /= sizeof(*ptr); + for (i = 0; i < size; i++) + ptr[i] = htonl(pkt_nb << 16 | (i + start)); +} + +static void gen_eth_hdr(struct xsk_socket_info *xsk, struct ethhdr *eth_hdr) +{ + memcpy(eth_hdr->h_dest, xsk->dst_mac, ETH_ALEN); + memcpy(eth_hdr->h_source, xsk->src_mac, ETH_ALEN); + eth_hdr->h_proto = htons(ETH_P_LOOPBACK); +} + +static bool is_umem_valid(struct ifobject *ifobj) +{ + return !!ifobj->umem->umem; +} + +static u32 mode_to_xdp_flags(enum test_mode mode) +{ + return (mode == TEST_MODE_SKB) ? XDP_FLAGS_SKB_MODE : XDP_FLAGS_DRV_MODE; +} + +static u64 umem_size(struct xsk_umem_info *umem) +{ + return umem->num_frames * umem->frame_size; +} + +int xsk_configure_umem(struct ifobject *ifobj, struct xsk_umem_info *umem, void *buffer, + u64 size) +{ + struct xsk_umem_config cfg = { + .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, + .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, + .frame_size = umem->frame_size, + .frame_headroom = umem->frame_headroom, + .flags = XSK_UMEM__DEFAULT_FLAGS + }; + int ret; + + if (umem->fill_size) + cfg.fill_size = umem->fill_size; + + if (umem->comp_size) + cfg.comp_size = umem->comp_size; + + if (umem->unaligned_mode) + cfg.flags |= XDP_UMEM_UNALIGNED_CHUNK_FLAG; + + ret = xsk_umem__create(&umem->umem, buffer, size, + &umem->fq, &umem->cq, &cfg); + if (ret) + return ret; + + umem->buffer = buffer; + if (ifobj->shared_umem && ifobj->rx_on) { + umem->base_addr = umem_size(umem); + umem->next_buffer = umem_size(umem); + } + + return 0; +} + +static u64 umem_alloc_buffer(struct xsk_umem_info *umem) +{ + u64 addr; + + addr = umem->next_buffer; + umem->next_buffer += umem->frame_size; + if (umem->next_buffer >= umem->base_addr + umem_size(umem)) + umem->next_buffer = umem->base_addr; + + return addr; +} + +static void umem_reset_alloc(struct xsk_umem_info *umem) +{ + umem->next_buffer = 0; +} + +static int enable_busy_poll(struct xsk_socket_info *xsk) +{ + int sock_opt; + + sock_opt = 1; + if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_PREFER_BUSY_POLL, + (void *)&sock_opt, sizeof(sock_opt)) < 0) + return -errno; + + sock_opt = 20; + if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_BUSY_POLL, + (void *)&sock_opt, sizeof(sock_opt)) < 0) + return -errno; + + sock_opt = xsk->batch_size; + if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_BUSY_POLL_BUDGET, + (void *)&sock_opt, sizeof(sock_opt)) < 0) + return -errno; + + return 0; +} + +int xsk_configure_socket(struct xsk_socket_info *xsk, struct xsk_umem_info *umem, + struct ifobject *ifobject, bool shared) +{ + struct xsk_socket_config cfg = {}; + struct xsk_ring_cons *rxr; + struct xsk_ring_prod *txr; + + xsk->umem = umem; + cfg.rx_size = xsk->rxqsize; + cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; 
+ cfg.bind_flags = ifobject->bind_flags; + if (shared) + cfg.bind_flags |= XDP_SHARED_UMEM; + if (ifobject->mtu > MAX_ETH_PKT_SIZE) + cfg.bind_flags |= XDP_USE_SG; + if (umem->comp_size) + cfg.tx_size = umem->comp_size; + if (umem->fill_size) + cfg.rx_size = umem->fill_size; + + txr = ifobject->tx_on ? &xsk->tx : NULL; + rxr = ifobject->rx_on ? &xsk->rx : NULL; + return xsk_socket__create(&xsk->xsk, ifobject->ifindex, 0, umem->umem, rxr, txr, &cfg); +} + +#define MAX_SKB_FRAGS_PATH "/proc/sys/net/core/max_skb_frags" +static unsigned int get_max_skb_frags(void) +{ + unsigned int max_skb_frags = 0; + FILE *file; + + file = fopen(MAX_SKB_FRAGS_PATH, "r"); + if (!file) { + ksft_print_msg("Error opening %s\n", MAX_SKB_FRAGS_PATH); + return 0; + } + + if (fscanf(file, "%u", &max_skb_frags) != 1) + ksft_print_msg("Error reading %s\n", MAX_SKB_FRAGS_PATH); + + fclose(file); + return max_skb_frags; +} + +static int set_ring_size(struct ifobject *ifobj) +{ + int ret; + u32 ctr = 0; + + while (ctr++ < SOCK_RECONF_CTR) { + ret = set_hw_ring_size(ifobj->ifname, &ifobj->ring); + if (!ret) + break; + + /* Retry if it fails */ + if (ctr >= SOCK_RECONF_CTR || errno != EBUSY) + return -errno; + + usleep(USLEEP_MAX); + } + + return ret; +} + +int hw_ring_size_reset(struct ifobject *ifobj) +{ + ifobj->ring.tx_pending = ifobj->set_ring.default_tx; + ifobj->ring.rx_pending = ifobj->set_ring.default_rx; + return set_ring_size(ifobj); +} + +static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx, + struct ifobject *ifobj_rx) +{ + u32 i, j; + + for (i = 0; i < MAX_INTERFACES; i++) { + struct ifobject *ifobj = i ? ifobj_rx : ifobj_tx; + + ifobj->xsk = &ifobj->xsk_arr[0]; + ifobj->use_poll = false; + ifobj->use_fill_ring = true; + ifobj->release_rx = true; + ifobj->validation_func = NULL; + ifobj->use_metadata = false; + + if (i == 0) { + ifobj->rx_on = false; + ifobj->tx_on = true; + } else { + ifobj->rx_on = true; + ifobj->tx_on = false; + } + + memset(ifobj->umem, 0, sizeof(*ifobj->umem)); + ifobj->umem->num_frames = DEFAULT_UMEM_BUFFERS; + ifobj->umem->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE; + + for (j = 0; j < MAX_SOCKETS; j++) { + memset(&ifobj->xsk_arr[j], 0, sizeof(ifobj->xsk_arr[j])); + ifobj->xsk_arr[j].rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS; + ifobj->xsk_arr[j].batch_size = DEFAULT_BATCH_SIZE; + if (i == 0) + ifobj->xsk_arr[j].pkt_stream = test->tx_pkt_stream_default; + else + ifobj->xsk_arr[j].pkt_stream = test->rx_pkt_stream_default; + + memcpy(ifobj->xsk_arr[j].src_mac, g_mac, ETH_ALEN); + memcpy(ifobj->xsk_arr[j].dst_mac, g_mac, ETH_ALEN); + ifobj->xsk_arr[j].src_mac[5] += ((j * 2) + 0); + ifobj->xsk_arr[j].dst_mac[5] += ((j * 2) + 1); + } + } + + if (ifobj_tx->hw_ring_size_supp) + hw_ring_size_reset(ifobj_tx); + + test->ifobj_tx = ifobj_tx; + test->ifobj_rx = ifobj_rx; + test->current_step = 0; + test->total_steps = 1; + test->nb_sockets = 1; + test->fail = false; + test->set_ring = false; + test->adjust_tail = false; + test->adjust_tail_support = false; + test->mtu = MAX_ETH_PKT_SIZE; + test->xdp_prog_rx = ifobj_rx->xdp_progs->progs.xsk_def_prog; + test->xskmap_rx = ifobj_rx->xdp_progs->maps.xsk; + test->xdp_prog_tx = ifobj_tx->xdp_progs->progs.xsk_def_prog; + test->xskmap_tx = ifobj_tx->xdp_progs->maps.xsk; +} + +void test_init(struct test_spec *test, struct ifobject *ifobj_tx, + struct ifobject *ifobj_rx, enum test_mode mode, + const struct test_spec *test_to_run) +{ + struct pkt_stream *tx_pkt_stream; + struct pkt_stream *rx_pkt_stream; + u32 i; + + tx_pkt_stream = 
test->tx_pkt_stream_default; + rx_pkt_stream = test->rx_pkt_stream_default; + memset(test, 0, sizeof(*test)); + test->tx_pkt_stream_default = tx_pkt_stream; + test->rx_pkt_stream_default = rx_pkt_stream; + + for (i = 0; i < MAX_INTERFACES; i++) { + struct ifobject *ifobj = i ? ifobj_rx : ifobj_tx; + + ifobj->bind_flags = XDP_USE_NEED_WAKEUP; + if (mode == TEST_MODE_ZC) + ifobj->bind_flags |= XDP_ZEROCOPY; + else + ifobj->bind_flags |= XDP_COPY; + } + + memcpy(test->name, test_to_run->name, MAX_TEST_NAME_SIZE); + test->test_func = test_to_run->test_func; + test->mode = mode; + __test_spec_init(test, ifobj_tx, ifobj_rx); +} + +static void test_spec_reset(struct test_spec *test) +{ + __test_spec_init(test, test->ifobj_tx, test->ifobj_rx); +} + +static void test_spec_set_xdp_prog(struct test_spec *test, struct bpf_program *xdp_prog_rx, + struct bpf_program *xdp_prog_tx, struct bpf_map *xskmap_rx, + struct bpf_map *xskmap_tx) +{ + test->xdp_prog_rx = xdp_prog_rx; + test->xdp_prog_tx = xdp_prog_tx; + test->xskmap_rx = xskmap_rx; + test->xskmap_tx = xskmap_tx; +} + +static int test_spec_set_mtu(struct test_spec *test, int mtu) +{ + int err; + + if (test->ifobj_rx->mtu != mtu) { + err = xsk_set_mtu(test->ifobj_rx->ifindex, mtu); + if (err) + return err; + test->ifobj_rx->mtu = mtu; + } + if (test->ifobj_tx->mtu != mtu) { + err = xsk_set_mtu(test->ifobj_tx->ifindex, mtu); + if (err) + return err; + test->ifobj_tx->mtu = mtu; + } + + return 0; +} + +void pkt_stream_reset(struct pkt_stream *pkt_stream) +{ + if (pkt_stream) { + pkt_stream->current_pkt_nb = 0; + pkt_stream->nb_rx_pkts = 0; + } +} + +static struct pkt *pkt_stream_get_next_tx_pkt(struct pkt_stream *pkt_stream) +{ + if (pkt_stream->current_pkt_nb >= pkt_stream->nb_pkts) + return NULL; + + return &pkt_stream->pkts[pkt_stream->current_pkt_nb++]; +} + +static struct pkt *pkt_stream_get_next_rx_pkt(struct pkt_stream *pkt_stream, u32 *pkts_sent) +{ + while (pkt_stream->current_pkt_nb < pkt_stream->nb_pkts) { + (*pkts_sent)++; + if (pkt_stream->pkts[pkt_stream->current_pkt_nb].valid) + return &pkt_stream->pkts[pkt_stream->current_pkt_nb++]; + pkt_stream->current_pkt_nb++; + } + return NULL; +} + +void pkt_stream_delete(struct pkt_stream *pkt_stream) +{ + free(pkt_stream->pkts); + free(pkt_stream); +} + +void pkt_stream_restore_default(struct test_spec *test) +{ + struct pkt_stream *tx_pkt_stream = test->ifobj_tx->xsk->pkt_stream; + struct pkt_stream *rx_pkt_stream = test->ifobj_rx->xsk->pkt_stream; + + if (tx_pkt_stream != test->tx_pkt_stream_default) { + pkt_stream_delete(test->ifobj_tx->xsk->pkt_stream); + test->ifobj_tx->xsk->pkt_stream = test->tx_pkt_stream_default; + } + + if (rx_pkt_stream != test->rx_pkt_stream_default) { + pkt_stream_delete(test->ifobj_rx->xsk->pkt_stream); + test->ifobj_rx->xsk->pkt_stream = test->rx_pkt_stream_default; + } +} + +static struct pkt_stream *__pkt_stream_alloc(u32 nb_pkts) +{ + struct pkt_stream *pkt_stream; + + pkt_stream = calloc(1, sizeof(*pkt_stream)); + if (!pkt_stream) + return NULL; + + pkt_stream->pkts = calloc(nb_pkts, sizeof(*pkt_stream->pkts)); + if (!pkt_stream->pkts) { + free(pkt_stream); + return NULL; + } + + pkt_stream->nb_pkts = nb_pkts; + return pkt_stream; +} + +static u32 pkt_nb_frags(u32 frame_size, struct pkt_stream *pkt_stream, struct pkt *pkt) +{ + u32 nb_frags = 1, next_frag; + + if (!pkt) + return 1; + + if (!pkt_stream->verbatim) { + if (!pkt->valid || !pkt->len) + return 1; + return ceil_u32(pkt->len, frame_size); + } + + /* Search for the end of the packet in verbatim mode */ 
+ if (!pkt_continues(pkt->options)) + return nb_frags; + + next_frag = pkt_stream->current_pkt_nb; + pkt++; + while (next_frag++ < pkt_stream->nb_pkts) { + nb_frags++; + if (!pkt_continues(pkt->options) || !pkt->valid) + break; + pkt++; + } + return nb_frags; +} + +static bool set_pkt_valid(int offset, u32 len) +{ + return len <= MAX_ETH_JUMBO_SIZE; +} + +static void pkt_set(struct pkt_stream *pkt_stream, struct pkt *pkt, int offset, u32 len) +{ + pkt->offset = offset; + pkt->len = len; + pkt->valid = set_pkt_valid(offset, len); +} + +static void pkt_stream_pkt_set(struct pkt_stream *pkt_stream, struct pkt *pkt, int offset, u32 len) +{ + bool prev_pkt_valid = pkt->valid; + + pkt_set(pkt_stream, pkt, offset, len); + pkt_stream->nb_valid_entries += pkt->valid - prev_pkt_valid; +} + +static u32 pkt_get_buffer_len(struct xsk_umem_info *umem, u32 len) +{ + return ceil_u32(len, umem->frame_size) * umem->frame_size; +} + +static struct pkt_stream *__pkt_stream_generate(u32 nb_pkts, u32 pkt_len, u32 nb_start, u32 nb_off) +{ + struct pkt_stream *pkt_stream; + u32 i; + + pkt_stream = __pkt_stream_alloc(nb_pkts); + if (!pkt_stream) + return NULL; + + pkt_stream->nb_pkts = nb_pkts; + pkt_stream->max_pkt_len = pkt_len; + for (i = 0; i < nb_pkts; i++) { + struct pkt *pkt = &pkt_stream->pkts[i]; + + pkt_stream_pkt_set(pkt_stream, pkt, 0, pkt_len); + pkt->pkt_nb = nb_start + i * nb_off; + } + + return pkt_stream; +} + +struct pkt_stream *pkt_stream_generate(u32 nb_pkts, u32 pkt_len) +{ + return __pkt_stream_generate(nb_pkts, pkt_len, 0, 1); +} + +static struct pkt_stream *pkt_stream_clone(struct pkt_stream *pkt_stream) +{ + return pkt_stream_generate(pkt_stream->nb_pkts, pkt_stream->pkts[0].len); +} + +static int pkt_stream_replace_ifobject(struct ifobject *ifobj, u32 nb_pkts, u32 pkt_len) +{ + ifobj->xsk->pkt_stream = pkt_stream_generate(nb_pkts, pkt_len); + + if (!ifobj->xsk->pkt_stream) + return -ENOMEM; + + return 0; +} + +static int pkt_stream_replace(struct test_spec *test, u32 nb_pkts, u32 pkt_len) +{ + int ret; + + ret = pkt_stream_replace_ifobject(test->ifobj_tx, nb_pkts, pkt_len); + if (ret) + return ret; + + return pkt_stream_replace_ifobject(test->ifobj_rx, nb_pkts, pkt_len); +} + +static int __pkt_stream_replace_half(struct ifobject *ifobj, u32 pkt_len, + int offset) +{ + struct pkt_stream *pkt_stream; + u32 i; + + pkt_stream = pkt_stream_clone(ifobj->xsk->pkt_stream); + if (!pkt_stream) + return -ENOMEM; + + for (i = 1; i < ifobj->xsk->pkt_stream->nb_pkts; i += 2) + pkt_stream_pkt_set(pkt_stream, &pkt_stream->pkts[i], offset, pkt_len); + + ifobj->xsk->pkt_stream = pkt_stream; + + return 0; +} + +static int pkt_stream_replace_half(struct test_spec *test, u32 pkt_len, int offset) +{ + int ret = __pkt_stream_replace_half(test->ifobj_tx, pkt_len, offset); + + if (ret) + return ret; + + return __pkt_stream_replace_half(test->ifobj_rx, pkt_len, offset); +} + +static int pkt_stream_receive_half(struct test_spec *test) +{ + struct pkt_stream *pkt_stream = test->ifobj_tx->xsk->pkt_stream; + u32 i; + + if (test->ifobj_rx->xsk->pkt_stream != test->rx_pkt_stream_default) + /* Packet stream has already been replaced so we have to release this one. 
+ * The newly created one will be freed by the restore_default() at the + * end of the test + */ + pkt_stream_delete(test->ifobj_rx->xsk->pkt_stream); + + test->ifobj_rx->xsk->pkt_stream = pkt_stream_generate(pkt_stream->nb_pkts, + pkt_stream->pkts[0].len); + if (!test->ifobj_rx->xsk->pkt_stream) + return -ENOMEM; + + pkt_stream = test->ifobj_rx->xsk->pkt_stream; + for (i = 1; i < pkt_stream->nb_pkts; i += 2) + pkt_stream->pkts[i].valid = false; + + pkt_stream->nb_valid_entries /= 2; + + return 0; +} + +static int pkt_stream_even_odd_sequence(struct test_spec *test) +{ + struct pkt_stream *pkt_stream; + u32 i; + + for (i = 0; i < test->nb_sockets; i++) { + pkt_stream = test->ifobj_tx->xsk_arr[i].pkt_stream; + pkt_stream = __pkt_stream_generate(pkt_stream->nb_pkts / 2, + pkt_stream->pkts[0].len, i, 2); + if (!pkt_stream) + return -ENOMEM; + test->ifobj_tx->xsk_arr[i].pkt_stream = pkt_stream; + + pkt_stream = test->ifobj_rx->xsk_arr[i].pkt_stream; + pkt_stream = __pkt_stream_generate(pkt_stream->nb_pkts / 2, + pkt_stream->pkts[0].len, i, 2); + if (!pkt_stream) + return -ENOMEM; + test->ifobj_rx->xsk_arr[i].pkt_stream = pkt_stream; + } + + return 0; +} + +static void release_even_odd_sequence(struct test_spec *test) +{ + struct pkt_stream *later_free_tx = test->ifobj_tx->xsk->pkt_stream; + struct pkt_stream *later_free_rx = test->ifobj_rx->xsk->pkt_stream; + int i; + + for (i = 0; i < test->nb_sockets; i++) { + /* later_free_{rx/tx} will be freed by restore_default() */ + if (test->ifobj_tx->xsk_arr[i].pkt_stream != later_free_tx) + pkt_stream_delete(test->ifobj_tx->xsk_arr[i].pkt_stream); + if (test->ifobj_rx->xsk_arr[i].pkt_stream != later_free_rx) + pkt_stream_delete(test->ifobj_rx->xsk_arr[i].pkt_stream); + } + +} + +static u64 pkt_get_addr(struct pkt *pkt, struct xsk_umem_info *umem) +{ + if (!pkt->valid) + return pkt->offset; + return pkt->offset + umem_alloc_buffer(umem); +} + +static void pkt_stream_cancel(struct pkt_stream *pkt_stream) +{ + pkt_stream->current_pkt_nb--; +} + +static void pkt_generate(struct xsk_socket_info *xsk, struct xsk_umem_info *umem, u64 addr, u32 len, + u32 pkt_nb, u32 bytes_written) +{ + void *data = xsk_umem__get_data(umem->buffer, addr); + + if (len < MIN_PKT_SIZE) + return; + + if (!bytes_written) { + gen_eth_hdr(xsk, data); + + len -= PKT_HDR_SIZE; + data += PKT_HDR_SIZE; + } else { + bytes_written -= PKT_HDR_SIZE; + } + + write_payload(data, pkt_nb, bytes_written, len); +} + +static struct pkt_stream *__pkt_stream_generate_custom(struct ifobject *ifobj, struct pkt *frames, + u32 nb_frames, bool verbatim) +{ + u32 i, len = 0, pkt_nb = 0, payload = 0; + struct pkt_stream *pkt_stream; + + pkt_stream = __pkt_stream_alloc(nb_frames); + if (!pkt_stream) + return NULL; + + for (i = 0; i < nb_frames; i++) { + struct pkt *pkt = &pkt_stream->pkts[pkt_nb]; + struct pkt *frame = &frames[i]; + + pkt->offset = frame->offset; + if (verbatim) { + *pkt = *frame; + pkt->pkt_nb = payload; + if (!frame->valid || !pkt_continues(frame->options)) + payload++; + } else { + if (frame->valid) + len += frame->len; + if (frame->valid && pkt_continues(frame->options)) + continue; + + pkt->pkt_nb = pkt_nb; + pkt->len = len; + pkt->valid = frame->valid; + pkt->options = 0; + + len = 0; + } + + print_verbose("offset: %d len: %u valid: %u options: %u pkt_nb: %u\n", + pkt->offset, pkt->len, pkt->valid, pkt->options, pkt->pkt_nb); + + if (pkt->valid && pkt->len > pkt_stream->max_pkt_len) + pkt_stream->max_pkt_len = pkt->len; + + if (pkt->valid) + pkt_stream->nb_valid_entries++; + + 
pkt_nb++; + } + + pkt_stream->nb_pkts = pkt_nb; + pkt_stream->verbatim = verbatim; + return pkt_stream; +} + +static int pkt_stream_generate_custom(struct test_spec *test, struct pkt *pkts, u32 nb_pkts) +{ + struct pkt_stream *pkt_stream; + + pkt_stream = __pkt_stream_generate_custom(test->ifobj_tx, pkts, nb_pkts, true); + if (!pkt_stream) + return -ENOMEM; + test->ifobj_tx->xsk->pkt_stream = pkt_stream; + + pkt_stream = __pkt_stream_generate_custom(test->ifobj_rx, pkts, nb_pkts, false); + if (!pkt_stream) + return -ENOMEM; + test->ifobj_rx->xsk->pkt_stream = pkt_stream; + + return 0; +} + +static void pkt_print_data(u32 *data, u32 cnt) +{ + u32 i; + + for (i = 0; i < cnt; i++) { + u32 seqnum, pkt_nb; + + seqnum = ntohl(*data) & 0xffff; + pkt_nb = ntohl(*data) >> 16; + ksft_print_msg("%u:%u ", pkt_nb, seqnum); + data++; + } +} + +static void pkt_dump(void *pkt, u32 len, bool eth_header) +{ + struct ethhdr *ethhdr = pkt; + u32 i, *data; + + if (eth_header) { + /*extract L2 frame */ + ksft_print_msg("DEBUG>> L2: dst mac: "); + for (i = 0; i < ETH_ALEN; i++) + ksft_print_msg("%02X", ethhdr->h_dest[i]); + + ksft_print_msg("\nDEBUG>> L2: src mac: "); + for (i = 0; i < ETH_ALEN; i++) + ksft_print_msg("%02X", ethhdr->h_source[i]); + + data = pkt + PKT_HDR_SIZE; + } else { + data = pkt; + } + + /*extract L5 frame */ + ksft_print_msg("\nDEBUG>> L5: seqnum: "); + pkt_print_data(data, PKT_DUMP_NB_TO_PRINT); + ksft_print_msg("...."); + if (len > PKT_DUMP_NB_TO_PRINT * sizeof(u32)) { + ksft_print_msg("\n.... "); + pkt_print_data(data + len / sizeof(u32) - PKT_DUMP_NB_TO_PRINT, + PKT_DUMP_NB_TO_PRINT); + } + ksft_print_msg("\n---------------------------------------\n"); +} + +static bool is_offset_correct(struct xsk_umem_info *umem, struct pkt *pkt, u64 addr) +{ + u32 headroom = umem->unaligned_mode ? 0 : umem->frame_headroom; + u32 offset = addr % umem->frame_size, expected_offset; + int pkt_offset = pkt->valid ? pkt->offset : 0; + + if (!umem->unaligned_mode) + pkt_offset = 0; + + expected_offset = (pkt_offset + headroom + XDP_PACKET_HEADROOM) % umem->frame_size; + + if (offset == expected_offset) + return true; + + ksft_print_msg("[%s] expected [%u], got [%u]\n", __func__, expected_offset, offset); + return false; +} + +static bool is_metadata_correct(struct pkt *pkt, void *buffer, u64 addr) +{ + void *data = xsk_umem__get_data(buffer, addr); + struct xdp_info *meta = data - sizeof(struct xdp_info); + + if (meta->count != pkt->pkt_nb) { + ksft_print_msg("[%s] expected meta_count [%d], got meta_count [%llu]\n", + __func__, pkt->pkt_nb, + (unsigned long long)meta->count); + return false; + } + + return true; +} + +static int is_adjust_tail_supported(struct xsk_xdp_progs *skel_rx, bool *supported) +{ + struct bpf_map *data_map; + int adjust_value = 0; + int key = 0; + int ret; + + data_map = bpf_object__find_map_by_name(skel_rx->obj, "xsk_xdp_.bss"); + if (!data_map || !bpf_map__is_internal(data_map)) { + ksft_print_msg("Error: could not find bss section of XDP program\n"); + return -EINVAL; + } + + ret = bpf_map_lookup_elem(bpf_map__fd(data_map), &key, &adjust_value); + if (ret) { + ksft_print_msg("Error: bpf_map_lookup_elem failed with error %d\n", ret); + return ret; + } + + /* Set the 'adjust_value' variable to -EOPNOTSUPP in the XDP program if the adjust_tail + * helper is not supported. Skip the adjust_tail test case in this scenario. 
+ */ + *supported = adjust_value != -EOPNOTSUPP; + + return 0; +} + +static bool is_frag_valid(struct xsk_umem_info *umem, u64 addr, u32 len, u32 expected_pkt_nb, + u32 bytes_processed) +{ + u32 seqnum, pkt_nb, *pkt_data, words_to_end, expected_seqnum; + void *data = xsk_umem__get_data(umem->buffer, addr); + + addr -= umem->base_addr; + + if (addr >= umem->num_frames * umem->frame_size || + addr + len > umem->num_frames * umem->frame_size) { + ksft_print_msg("Frag invalid addr: %llx len: %u\n", + (unsigned long long)addr, len); + return false; + } + if (!umem->unaligned_mode && addr % umem->frame_size + len > umem->frame_size) { + ksft_print_msg("Frag crosses frame boundary addr: %llx len: %u\n", + (unsigned long long)addr, len); + return false; + } + + pkt_data = data; + if (!bytes_processed) { + pkt_data += PKT_HDR_SIZE / sizeof(*pkt_data); + len -= PKT_HDR_SIZE; + } else { + bytes_processed -= PKT_HDR_SIZE; + } + + expected_seqnum = bytes_processed / sizeof(*pkt_data); + seqnum = ntohl(*pkt_data) & 0xffff; + pkt_nb = ntohl(*pkt_data) >> 16; + + if (expected_pkt_nb != pkt_nb) { + ksft_print_msg("[%s] expected pkt_nb [%u], got pkt_nb [%u]\n", + __func__, expected_pkt_nb, pkt_nb); + goto error; + } + if (expected_seqnum != seqnum) { + ksft_print_msg("[%s] expected seqnum at start [%u], got seqnum [%u]\n", + __func__, expected_seqnum, seqnum); + goto error; + } + + words_to_end = len / sizeof(*pkt_data) - 1; + pkt_data += words_to_end; + seqnum = ntohl(*pkt_data) & 0xffff; + expected_seqnum += words_to_end; + if (expected_seqnum != seqnum) { + ksft_print_msg("[%s] expected seqnum at end [%u], got seqnum [%u]\n", + __func__, expected_seqnum, seqnum); + goto error; + } + + return true; + +error: + pkt_dump(data, len, !bytes_processed); + return false; +} + +static bool is_pkt_valid(struct pkt *pkt, void *buffer, u64 addr, u32 len) +{ + if (pkt->len != len) { + ksft_print_msg("[%s] expected packet length [%d], got length [%d]\n", + __func__, pkt->len, len); + pkt_dump(xsk_umem__get_data(buffer, addr), len, true); + return false; + } + + return true; +} + +static u32 load_value(u32 *counter) +{ + return __atomic_load_n(counter, __ATOMIC_ACQUIRE); +} + +static bool kick_tx_with_check(struct xsk_socket_info *xsk, int *ret) +{ + u32 max_budget = MAX_TX_BUDGET_DEFAULT; + u32 cons, ready_to_send; + int delta; + + cons = load_value(xsk->tx.consumer); + ready_to_send = load_value(xsk->tx.producer) - cons; + *ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); + + delta = load_value(xsk->tx.consumer) - cons; + /* By default, xsk should consume exact @max_budget descs at one + * send in this case where hitting the max budget limit in while + * loop is triggered in __xsk_generic_xmit(). Please make sure that + * the number of descs to be sent is larger than @max_budget, or + * else the tx.consumer will be updated in xskq_cons_peek_desc() + * in time which hides the issue we try to verify. 
+ */ + if (ready_to_send > max_budget && delta != max_budget) + return false; + + return true; +} + +int kick_tx(struct xsk_socket_info *xsk) +{ + int ret; + + if (xsk->check_consumer) { + if (!kick_tx_with_check(xsk, &ret)) + return TEST_FAILURE; + } else { + ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); + } + if (ret >= 0) + return TEST_PASS; + if (errno == ENOBUFS || errno == EAGAIN || errno == EBUSY || errno == ENETDOWN) { + usleep(100); + return TEST_PASS; + } + return TEST_FAILURE; +} + +int kick_rx(struct xsk_socket_info *xsk) +{ + int ret; + + ret = recvfrom(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, NULL); + if (ret < 0) + return TEST_FAILURE; + + return TEST_PASS; +} + +static int complete_pkts(struct xsk_socket_info *xsk, int batch_size) +{ + unsigned int rcvd; + u32 idx; + int ret; + + if (xsk_ring_prod__needs_wakeup(&xsk->tx)) { + ret = kick_tx(xsk); + if (ret) + return TEST_FAILURE; + } + + rcvd = xsk_ring_cons__peek(&xsk->umem->cq, batch_size, &idx); + if (rcvd) { + if (rcvd > xsk->outstanding_tx) { + u64 addr = *xsk_ring_cons__comp_addr(&xsk->umem->cq, idx + rcvd - 1); + + ksft_print_msg("[%s] Too many packets completed\n", __func__); + ksft_print_msg("Last completion address: %llx\n", + (unsigned long long)addr); + return TEST_FAILURE; + } + + xsk_ring_cons__release(&xsk->umem->cq, rcvd); + xsk->outstanding_tx -= rcvd; + } + + return TEST_PASS; +} + +static int __receive_pkts(struct test_spec *test, struct xsk_socket_info *xsk) +{ + u32 frags_processed = 0, nb_frags = 0, pkt_len = 0; + u32 idx_rx = 0, idx_fq = 0, rcvd, pkts_sent = 0; + struct pkt_stream *pkt_stream = xsk->pkt_stream; + struct ifobject *ifobj = test->ifobj_rx; + struct xsk_umem_info *umem = xsk->umem; + struct pollfd fds = { }; + struct pkt *pkt; + u64 first_addr = 0; + int ret; + + fds.fd = xsk_socket__fd(xsk->xsk); + fds.events = POLLIN; + + ret = kick_rx(xsk); + if (ret) + return TEST_FAILURE; + + if (ifobj->use_poll) { + ret = poll(&fds, 1, POLL_TMOUT); + if (ret < 0) + return TEST_FAILURE; + + if (!ret) { + if (!is_umem_valid(test->ifobj_tx)) + return TEST_PASS; + + ksft_print_msg("ERROR: [%s] Poll timed out\n", __func__); + return TEST_CONTINUE; + } + + if (!(fds.revents & POLLIN)) + return TEST_CONTINUE; + } + + rcvd = xsk_ring_cons__peek(&xsk->rx, xsk->batch_size, &idx_rx); + if (!rcvd) + return TEST_CONTINUE; + + if (ifobj->use_fill_ring) { + ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq); + while (ret != rcvd) { + if (xsk_ring_prod__needs_wakeup(&umem->fq)) { + ret = poll(&fds, 1, POLL_TMOUT); + if (ret < 0) + return TEST_FAILURE; + } + ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq); + } + } + + while (frags_processed < rcvd) { + const struct xdp_desc *desc = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++); + u64 addr = desc->addr, orig; + + orig = xsk_umem__extract_addr(addr); + addr = xsk_umem__add_offset_to_addr(addr); + + if (!nb_frags) { + pkt = pkt_stream_get_next_rx_pkt(pkt_stream, &pkts_sent); + if (!pkt) { + ksft_print_msg("[%s] received too many packets addr: %lx len %u\n", + __func__, addr, desc->len); + return TEST_FAILURE; + } + } + + print_verbose("Rx: addr: %lx len: %u options: %u pkt_nb: %u valid: %u\n", + addr, desc->len, desc->options, pkt->pkt_nb, pkt->valid); + + if (!is_frag_valid(umem, addr, desc->len, pkt->pkt_nb, pkt_len) || + !is_offset_correct(umem, pkt, addr) || (ifobj->use_metadata && + !is_metadata_correct(pkt, umem->buffer, addr))) + return TEST_FAILURE; + + if (!nb_frags++) + first_addr = addr; + frags_processed++; + 
pkt_len += desc->len; + if (ifobj->use_fill_ring) + *xsk_ring_prod__fill_addr(&umem->fq, idx_fq++) = orig; + + if (pkt_continues(desc->options)) + continue; + + /* The complete packet has been received */ + if (!is_pkt_valid(pkt, umem->buffer, first_addr, pkt_len) || + !is_offset_correct(umem, pkt, addr)) + return TEST_FAILURE; + + pkt_stream->nb_rx_pkts++; + nb_frags = 0; + pkt_len = 0; + } + + if (nb_frags) { + /* In the middle of a packet. Start over from beginning of packet. */ + idx_rx -= nb_frags; + xsk_ring_cons__cancel(&xsk->rx, nb_frags); + if (ifobj->use_fill_ring) { + idx_fq -= nb_frags; + xsk_ring_prod__cancel(&umem->fq, nb_frags); + } + frags_processed -= nb_frags; + } + + if (ifobj->use_fill_ring) + xsk_ring_prod__submit(&umem->fq, frags_processed); + if (ifobj->release_rx) + xsk_ring_cons__release(&xsk->rx, frags_processed); + + pthread_mutex_lock(&pacing_mutex); + pkts_in_flight -= pkts_sent; + pthread_mutex_unlock(&pacing_mutex); + pkts_sent = 0; + + return TEST_CONTINUE; +} + +bool all_packets_received(struct test_spec *test, struct xsk_socket_info *xsk, u32 sock_num, + unsigned long *bitmap) +{ + struct pkt_stream *pkt_stream = xsk->pkt_stream; + + if (!pkt_stream) { + __set_bit(sock_num, bitmap); + return false; + } + + if (pkt_stream->nb_rx_pkts == pkt_stream->nb_valid_entries) { + __set_bit(sock_num, bitmap); + if (bitmap_full(bitmap, test->nb_sockets)) + return true; + } + + return false; +} + +static int receive_pkts(struct test_spec *test) +{ + struct timeval tv_end, tv_now, tv_timeout = {THREAD_TMOUT, 0}; + DECLARE_BITMAP(bitmap, test->nb_sockets); + struct xsk_socket_info *xsk; + u32 sock_num = 0; + int res, ret; + + bitmap_zero(bitmap, test->nb_sockets); + + ret = gettimeofday(&tv_now, NULL); + if (ret) + return TEST_FAILURE; + + timeradd(&tv_now, &tv_timeout, &tv_end); + + while (1) { + xsk = &test->ifobj_rx->xsk_arr[sock_num]; + + if ((all_packets_received(test, xsk, sock_num, bitmap))) + break; + + res = __receive_pkts(test, xsk); + if (!(res == TEST_PASS || res == TEST_CONTINUE)) + return res; + + ret = gettimeofday(&tv_now, NULL); + if (ret) + return TEST_FAILURE; + + if (timercmp(&tv_now, &tv_end, >)) { + ksft_print_msg("ERROR: [%s] Receive loop timed out\n", __func__); + return TEST_FAILURE; + } + sock_num = (sock_num + 1) % test->nb_sockets; + } + + return TEST_PASS; +} + +static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, bool timeout) +{ + u32 i, idx = 0, valid_pkts = 0, valid_frags = 0, buffer_len; + struct pkt_stream *pkt_stream = xsk->pkt_stream; + struct xsk_umem_info *umem = ifobject->umem; + bool use_poll = ifobject->use_poll; + struct pollfd fds = { }; + int ret; + + buffer_len = pkt_get_buffer_len(umem, pkt_stream->max_pkt_len); + /* pkts_in_flight might be negative if many invalid packets are sent */ + if (pkts_in_flight >= (int)((umem_size(umem) - xsk->batch_size * buffer_len) / + buffer_len)) { + ret = kick_tx(xsk); + if (ret) + return TEST_FAILURE; + return TEST_CONTINUE; + } + + fds.fd = xsk_socket__fd(xsk->xsk); + fds.events = POLLOUT; + + while (xsk_ring_prod__reserve(&xsk->tx, xsk->batch_size, &idx) < xsk->batch_size) { + if (use_poll) { + ret = poll(&fds, 1, POLL_TMOUT); + if (timeout) { + if (ret < 0) { + ksft_print_msg("ERROR: [%s] Poll error %d\n", + __func__, errno); + return TEST_FAILURE; + } + if (ret == 0) + return TEST_PASS; + break; + } + if (ret <= 0) { + ksft_print_msg("ERROR: [%s] Poll error %d\n", + __func__, errno); + return TEST_FAILURE; + } + } + + complete_pkts(xsk, xsk->batch_size); + } + + 
for (i = 0; i < xsk->batch_size; i++) { + struct pkt *pkt = pkt_stream_get_next_tx_pkt(pkt_stream); + u32 nb_frags_left, nb_frags, bytes_written = 0; + + if (!pkt) + break; + + nb_frags = pkt_nb_frags(umem->frame_size, pkt_stream, pkt); + if (nb_frags > xsk->batch_size - i) { + pkt_stream_cancel(pkt_stream); + xsk_ring_prod__cancel(&xsk->tx, xsk->batch_size - i); + break; + } + nb_frags_left = nb_frags; + + while (nb_frags_left--) { + struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx + i); + + tx_desc->addr = pkt_get_addr(pkt, ifobject->umem); + if (pkt_stream->verbatim) { + tx_desc->len = pkt->len; + tx_desc->options = pkt->options; + } else if (nb_frags_left) { + tx_desc->len = umem->frame_size; + tx_desc->options = XDP_PKT_CONTD; + } else { + tx_desc->len = pkt->len - bytes_written; + tx_desc->options = 0; + } + if (pkt->valid) + pkt_generate(xsk, umem, tx_desc->addr, tx_desc->len, pkt->pkt_nb, + bytes_written); + bytes_written += tx_desc->len; + + print_verbose("Tx addr: %llx len: %u options: %u pkt_nb: %u\n", + tx_desc->addr, tx_desc->len, tx_desc->options, pkt->pkt_nb); + + if (nb_frags_left) { + i++; + if (pkt_stream->verbatim) + pkt = pkt_stream_get_next_tx_pkt(pkt_stream); + } + } + + if (pkt && pkt->valid) { + valid_pkts++; + valid_frags += nb_frags; + } + } + + pthread_mutex_lock(&pacing_mutex); + pkts_in_flight += valid_pkts; + pthread_mutex_unlock(&pacing_mutex); + + xsk_ring_prod__submit(&xsk->tx, i); + xsk->outstanding_tx += valid_frags; + + if (use_poll) { + ret = poll(&fds, 1, POLL_TMOUT); + if (ret <= 0) { + if (ret == 0 && timeout) + return TEST_PASS; + + ksft_print_msg("ERROR: [%s] Poll error %d\n", __func__, ret); + return TEST_FAILURE; + } + } + + if (!timeout) { + if (complete_pkts(xsk, i)) + return TEST_FAILURE; + + usleep(10); + return TEST_PASS; + } + + return TEST_CONTINUE; +} + +static int wait_for_tx_completion(struct xsk_socket_info *xsk) +{ + struct timeval tv_end, tv_now, tv_timeout = {THREAD_TMOUT, 0}; + int ret; + + ret = gettimeofday(&tv_now, NULL); + if (ret) + return TEST_FAILURE; + timeradd(&tv_now, &tv_timeout, &tv_end); + + while (xsk->outstanding_tx) { + ret = gettimeofday(&tv_now, NULL); + if (ret) + return TEST_FAILURE; + if (timercmp(&tv_now, &tv_end, >)) { + ksft_print_msg("ERROR: [%s] Transmission loop timed out\n", __func__); + return TEST_FAILURE; + } + + complete_pkts(xsk, xsk->batch_size); + } + + return TEST_PASS; +} + +bool all_packets_sent(struct test_spec *test, unsigned long *bitmap) +{ + return bitmap_full(bitmap, test->nb_sockets); +} + +static int send_pkts(struct test_spec *test, struct ifobject *ifobject) +{ + bool timeout = !is_umem_valid(test->ifobj_rx); + DECLARE_BITMAP(bitmap, test->nb_sockets); + u32 i, ret; + + bitmap_zero(bitmap, test->nb_sockets); + + while (!(all_packets_sent(test, bitmap))) { + for (i = 0; i < test->nb_sockets; i++) { + struct pkt_stream *pkt_stream; + + pkt_stream = ifobject->xsk_arr[i].pkt_stream; + if (!pkt_stream || pkt_stream->current_pkt_nb >= pkt_stream->nb_pkts) { + __set_bit(i, bitmap); + continue; + } + ret = __send_pkts(ifobject, &ifobject->xsk_arr[i], timeout); + if (ret == TEST_CONTINUE && !test->fail) + continue; + + if ((ret || test->fail) && !timeout) + return TEST_FAILURE; + + if (ret == TEST_PASS && timeout) + return ret; + + ret = wait_for_tx_completion(&ifobject->xsk_arr[i]); + if (ret) + return TEST_FAILURE; + } + } + + return TEST_PASS; +} + +static int get_xsk_stats(struct xsk_socket *xsk, struct xdp_statistics *stats) +{ + int fd = xsk_socket__fd(xsk), err; + 
socklen_t optlen, expected_len; + + optlen = sizeof(*stats); + err = getsockopt(fd, SOL_XDP, XDP_STATISTICS, stats, &optlen); + if (err) { + ksft_print_msg("[%s] getsockopt(XDP_STATISTICS) error %u %s\n", + __func__, -err, strerror(-err)); + return TEST_FAILURE; + } + + expected_len = sizeof(struct xdp_statistics); + if (optlen != expected_len) { + ksft_print_msg("[%s] getsockopt optlen error. Expected: %u got: %u\n", + __func__, expected_len, optlen); + return TEST_FAILURE; + } + + return TEST_PASS; +} + +static int validate_rx_dropped(struct ifobject *ifobject) +{ + struct xsk_socket *xsk = ifobject->xsk->xsk; + struct xdp_statistics stats; + int err; + + err = kick_rx(ifobject->xsk); + if (err) + return TEST_FAILURE; + + err = get_xsk_stats(xsk, &stats); + if (err) + return TEST_FAILURE; + + /* The receiver calls getsockopt after receiving the last (valid) + * packet which is not the final packet sent in this test (valid and + * invalid packets are sent in alternating fashion with the final + * packet being invalid). Since the last packet may or may not have + * been dropped already, both outcomes must be allowed. + */ + if (stats.rx_dropped == ifobject->xsk->pkt_stream->nb_pkts / 2 || + stats.rx_dropped == ifobject->xsk->pkt_stream->nb_pkts / 2 - 1) + return TEST_PASS; + + return TEST_FAILURE; +} + +static int validate_rx_full(struct ifobject *ifobject) +{ + struct xsk_socket *xsk = ifobject->xsk->xsk; + struct xdp_statistics stats; + int err; + + usleep(1000); + err = kick_rx(ifobject->xsk); + if (err) + return TEST_FAILURE; + + err = get_xsk_stats(xsk, &stats); + if (err) + return TEST_FAILURE; + + if (stats.rx_ring_full) + return TEST_PASS; + + return TEST_FAILURE; +} + +static int validate_fill_empty(struct ifobject *ifobject) +{ + struct xsk_socket *xsk = ifobject->xsk->xsk; + struct xdp_statistics stats; + int err; + + usleep(1000); + err = kick_rx(ifobject->xsk); + if (err) + return TEST_FAILURE; + + err = get_xsk_stats(xsk, &stats); + if (err) + return TEST_FAILURE; + + if (stats.rx_fill_ring_empty_descs) + return TEST_PASS; + + return TEST_FAILURE; +} + +static int validate_tx_invalid_descs(struct ifobject *ifobject) +{ + struct xsk_socket *xsk = ifobject->xsk->xsk; + int fd = xsk_socket__fd(xsk); + struct xdp_statistics stats; + socklen_t optlen; + int err; + + optlen = sizeof(stats); + err = getsockopt(fd, SOL_XDP, XDP_STATISTICS, &stats, &optlen); + if (err) { + ksft_print_msg("[%s] getsockopt(XDP_STATISTICS) error %u %s\n", + __func__, -err, strerror(-err)); + return TEST_FAILURE; + } + + if (stats.tx_invalid_descs != ifobject->xsk->pkt_stream->nb_pkts / 2) { + ksft_print_msg("[%s] tx_invalid_descs incorrect. Got [%llu] expected [%u]\n", + __func__, + (unsigned long long)stats.tx_invalid_descs, + ifobject->xsk->pkt_stream->nb_pkts); + return TEST_FAILURE; + } + + return TEST_PASS; +} + +static int xsk_configure(struct test_spec *test, struct ifobject *ifobject, + struct xsk_umem_info *umem, bool tx) +{ + int i, ret; + + for (i = 0; i < test->nb_sockets; i++) { + bool shared = (ifobject->shared_umem && tx) ? 
true : !!i; + u32 ctr = 0; + + while (ctr++ < SOCK_RECONF_CTR) { + ret = xsk_configure_socket(&ifobject->xsk_arr[i], umem, + ifobject, shared); + if (!ret) + break; + + /* Retry if it fails as xsk_socket__create() is asynchronous */ + if (ctr >= SOCK_RECONF_CTR) + return ret; + usleep(USLEEP_MAX); + } + if (ifobject->busy_poll) { + ret = enable_busy_poll(&ifobject->xsk_arr[i]); + if (ret) + return ret; + } + } + + return 0; +} + +static int thread_common_ops_tx(struct test_spec *test, struct ifobject *ifobject) +{ + int ret = xsk_configure(test, ifobject, test->ifobj_rx->umem, true); + + if (ret) + return ret; + ifobject->xsk = &ifobject->xsk_arr[0]; + ifobject->xskmap = test->ifobj_rx->xskmap; + memcpy(ifobject->umem, test->ifobj_rx->umem, sizeof(struct xsk_umem_info)); + ifobject->umem->base_addr = 0; + + return 0; +} + +static int xsk_populate_fill_ring(struct xsk_umem_info *umem, struct pkt_stream *pkt_stream, + bool fill_up) +{ + u32 rx_frame_size = umem->frame_size - XDP_PACKET_HEADROOM; + u32 idx = 0, filled = 0, buffers_to_fill, nb_pkts; + int ret; + + if (umem->num_frames < XSK_RING_PROD__DEFAULT_NUM_DESCS) + buffers_to_fill = umem->num_frames; + else + buffers_to_fill = umem->fill_size; + + ret = xsk_ring_prod__reserve(&umem->fq, buffers_to_fill, &idx); + if (ret != buffers_to_fill) + return -ENOSPC; + + while (filled < buffers_to_fill) { + struct pkt *pkt = pkt_stream_get_next_rx_pkt(pkt_stream, &nb_pkts); + u64 addr; + u32 i; + + for (i = 0; i < pkt_nb_frags(rx_frame_size, pkt_stream, pkt); i++) { + if (!pkt) { + if (!fill_up) + break; + addr = filled * umem->frame_size + umem->base_addr; + } else if (pkt->offset >= 0) { + addr = pkt->offset % umem->frame_size + umem_alloc_buffer(umem); + } else { + addr = pkt->offset + umem_alloc_buffer(umem); + } + + *xsk_ring_prod__fill_addr(&umem->fq, idx++) = addr; + if (++filled >= buffers_to_fill) + break; + } + } + xsk_ring_prod__submit(&umem->fq, filled); + xsk_ring_prod__cancel(&umem->fq, buffers_to_fill - filled); + + pkt_stream_reset(pkt_stream); + umem_reset_alloc(umem); + + return 0; +} + +static int thread_common_ops(struct test_spec *test, struct ifobject *ifobject) +{ + LIBBPF_OPTS(bpf_xdp_query_opts, opts); + int mmap_flags; + u64 umem_sz; + void *bufs; + int ret; + u32 i; + + umem_sz = ifobject->umem->num_frames * ifobject->umem->frame_size; + mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE; + + if (ifobject->umem->unaligned_mode) + mmap_flags |= MAP_HUGETLB | MAP_HUGE_2MB; + + if (ifobject->shared_umem) + umem_sz *= 2; + + bufs = mmap(NULL, umem_sz, PROT_READ | PROT_WRITE, mmap_flags, -1, 0); + if (bufs == MAP_FAILED) + return -errno; + + ret = xsk_configure_umem(ifobject, ifobject->umem, bufs, umem_sz); + if (ret) + return ret; + + ret = xsk_configure(test, ifobject, ifobject->umem, false); + if (ret) + return ret; + + ifobject->xsk = &ifobject->xsk_arr[0]; + + if (!ifobject->rx_on) + return 0; + + ret = xsk_populate_fill_ring(ifobject->umem, ifobject->xsk->pkt_stream, + ifobject->use_fill_ring); + if (ret) + return ret; + + for (i = 0; i < test->nb_sockets; i++) { + ifobject->xsk = &ifobject->xsk_arr[i]; + ret = xsk_update_xskmap(ifobject->xskmap, ifobject->xsk->xsk, i); + if (ret) + return ret; + } + + return 0; +} + +void *worker_testapp_validate_tx(void *arg) +{ + struct test_spec *test = (struct test_spec *)arg; + struct ifobject *ifobject = test->ifobj_tx; + int err; + + if (test->current_step == 1) { + if (!ifobject->shared_umem) { + if (thread_common_ops(test, ifobject)) { + test->fail = true; + 
pthread_exit(NULL); + } + } else { + if (thread_common_ops_tx(test, ifobject)) { + test->fail = true; + pthread_exit(NULL); + } + } + } + + err = send_pkts(test, ifobject); + + if (!err && ifobject->validation_func) + err = ifobject->validation_func(ifobject); + if (err) + test->fail = true; + + pthread_exit(NULL); +} + +void *worker_testapp_validate_rx(void *arg) +{ + struct test_spec *test = (struct test_spec *)arg; + struct ifobject *ifobject = test->ifobj_rx; + int err; + + if (test->current_step == 1) { + err = thread_common_ops(test, ifobject); + } else { + xsk_clear_xskmap(ifobject->xskmap); + err = xsk_update_xskmap(ifobject->xskmap, ifobject->xsk->xsk, 0); + if (err) + ksft_print_msg("Error: Failed to update xskmap, error %s\n", + strerror(-err)); + } + + pthread_barrier_wait(&barr); + + /* We leave only now in case of error to avoid getting stuck in the barrier */ + if (err) { + test->fail = true; + pthread_exit(NULL); + } + + err = receive_pkts(test); + + if (!err && ifobject->validation_func) + err = ifobject->validation_func(ifobject); + + if (err) { + if (!test->adjust_tail) { + test->fail = true; + } else { + bool supported; + + if (is_adjust_tail_supported(ifobject->xdp_progs, &supported)) + test->fail = true; + else if (!supported) + test->adjust_tail_support = false; + else + test->fail = true; + } + } + + pthread_exit(NULL); +} + +static void testapp_clean_xsk_umem(struct ifobject *ifobj) +{ + u64 umem_sz = ifobj->umem->num_frames * ifobj->umem->frame_size; + + if (ifobj->shared_umem) + umem_sz *= 2; + + umem_sz = ceil_u64(umem_sz, HUGEPAGE_SIZE) * HUGEPAGE_SIZE; + xsk_umem__delete(ifobj->umem->umem); + munmap(ifobj->umem->buffer, umem_sz); +} + +static void handler(int signum) +{ + pthread_exit(NULL); +} + +static bool xdp_prog_changed_rx(struct test_spec *test) +{ + struct ifobject *ifobj = test->ifobj_rx; + + return ifobj->xdp_prog != test->xdp_prog_rx || ifobj->mode != test->mode; +} + +static bool xdp_prog_changed_tx(struct test_spec *test) +{ + struct ifobject *ifobj = test->ifobj_tx; + + return ifobj->xdp_prog != test->xdp_prog_tx || ifobj->mode != test->mode; +} + +static int xsk_reattach_xdp(struct ifobject *ifobj, struct bpf_program *xdp_prog, + struct bpf_map *xskmap, enum test_mode mode) +{ + int err; + + xsk_detach_xdp_program(ifobj->ifindex, mode_to_xdp_flags(ifobj->mode)); + err = xsk_attach_xdp_program(xdp_prog, ifobj->ifindex, mode_to_xdp_flags(mode)); + if (err) { + ksft_print_msg("Error attaching XDP program\n"); + return err; + } + + if (ifobj->mode != mode && (mode == TEST_MODE_DRV || mode == TEST_MODE_ZC)) + if (!xsk_is_in_mode(ifobj->ifindex, XDP_FLAGS_DRV_MODE)) { + ksft_print_msg("ERROR: XDP prog not in DRV mode\n"); + return -EINVAL; + } + + ifobj->xdp_prog = xdp_prog; + ifobj->xskmap = xskmap; + ifobj->mode = mode; + + return 0; +} + +static int xsk_attach_xdp_progs(struct test_spec *test, struct ifobject *ifobj_rx, + struct ifobject *ifobj_tx) +{ + int err = 0; + + if (xdp_prog_changed_rx(test)) { + err = xsk_reattach_xdp(ifobj_rx, test->xdp_prog_rx, test->xskmap_rx, test->mode); + if (err) + return err; + } + + if (!ifobj_tx || ifobj_tx->shared_umem) + return 0; + + if (xdp_prog_changed_tx(test)) + err = xsk_reattach_xdp(ifobj_tx, test->xdp_prog_tx, test->xskmap_tx, test->mode); + + return err; +} + +static void clean_sockets(struct test_spec *test, struct ifobject *ifobj) +{ + u32 i; + + if (!ifobj || !test) + return; + + for (i = 0; i < test->nb_sockets; i++) + xsk_socket__delete(ifobj->xsk_arr[i].xsk); +} + +static void clean_umem(struct 
test_spec *test, struct ifobject *ifobj1, struct ifobject *ifobj2) +{ + if (!ifobj1) + return; + + testapp_clean_xsk_umem(ifobj1); + if (ifobj2 && !ifobj2->shared_umem) + testapp_clean_xsk_umem(ifobj2); +} + +static int __testapp_validate_traffic(struct test_spec *test, struct ifobject *ifobj1, + struct ifobject *ifobj2) +{ + pthread_t t0, t1; + int err; + + if (test->mtu > MAX_ETH_PKT_SIZE) { + if (test->mode == TEST_MODE_ZC && (!ifobj1->multi_buff_zc_supp || + (ifobj2 && !ifobj2->multi_buff_zc_supp))) { + ksft_print_msg("Multi buffer for zero-copy not supported.\n"); + return TEST_SKIP; + } + if (test->mode != TEST_MODE_ZC && (!ifobj1->multi_buff_supp || + (ifobj2 && !ifobj2->multi_buff_supp))) { + ksft_print_msg("Multi buffer not supported.\n"); + return TEST_SKIP; + } + } + err = test_spec_set_mtu(test, test->mtu); + if (err) { + ksft_print_msg("Error, could not set mtu.\n"); + return TEST_FAILURE; + } + + if (ifobj2) { + if (pthread_barrier_init(&barr, NULL, 2)) + return TEST_FAILURE; + pkt_stream_reset(ifobj2->xsk->pkt_stream); + } + + test->current_step++; + pkt_stream_reset(ifobj1->xsk->pkt_stream); + pkts_in_flight = 0; + + signal(SIGUSR1, handler); + /*Spawn RX thread */ + pthread_create(&t0, NULL, ifobj1->func_ptr, test); + + if (ifobj2) { + pthread_barrier_wait(&barr); + if (pthread_barrier_destroy(&barr)) { + pthread_kill(t0, SIGUSR1); + clean_sockets(test, ifobj1); + clean_umem(test, ifobj1, NULL); + return TEST_FAILURE; + } + + /*Spawn TX thread */ + pthread_create(&t1, NULL, ifobj2->func_ptr, test); + + pthread_join(t1, NULL); + } + + if (!ifobj2) + pthread_kill(t0, SIGUSR1); + else + pthread_join(t0, NULL); + + if (test->total_steps == test->current_step || test->fail) { + clean_sockets(test, ifobj1); + clean_sockets(test, ifobj2); + clean_umem(test, ifobj1, ifobj2); + } + + if (test->fail) + return TEST_FAILURE; + + return TEST_PASS; +} + +static int testapp_validate_traffic(struct test_spec *test) +{ + struct ifobject *ifobj_rx = test->ifobj_rx; + struct ifobject *ifobj_tx = test->ifobj_tx; + + if ((ifobj_rx->umem->unaligned_mode && !ifobj_rx->unaligned_supp) || + (ifobj_tx->umem->unaligned_mode && !ifobj_tx->unaligned_supp)) { + ksft_print_msg("No huge pages present.\n"); + return TEST_SKIP; + } + + if (test->set_ring) { + if (ifobj_tx->hw_ring_size_supp) { + if (set_ring_size(ifobj_tx)) { + ksft_print_msg("Failed to change HW ring size.\n"); + return TEST_FAILURE; + } + } else { + ksft_print_msg("Changing HW ring size not supported.\n"); + return TEST_SKIP; + } + } + + if (xsk_attach_xdp_progs(test, ifobj_rx, ifobj_tx)) + return TEST_FAILURE; + return __testapp_validate_traffic(test, ifobj_rx, ifobj_tx); +} + +static int testapp_validate_traffic_single_thread(struct test_spec *test, struct ifobject *ifobj) +{ + return __testapp_validate_traffic(test, ifobj, NULL); +} + +int testapp_teardown(struct test_spec *test) +{ + int i; + + for (i = 0; i < MAX_TEARDOWN_ITER; i++) { + if (testapp_validate_traffic(test)) + return TEST_FAILURE; + test_spec_reset(test); + } + + return TEST_PASS; +} + +static void swap_directions(struct ifobject **ifobj1, struct ifobject **ifobj2) +{ + thread_func_t tmp_func_ptr = (*ifobj1)->func_ptr; + struct ifobject *tmp_ifobj = (*ifobj1); + + (*ifobj1)->func_ptr = (*ifobj2)->func_ptr; + (*ifobj2)->func_ptr = tmp_func_ptr; + + *ifobj1 = *ifobj2; + *ifobj2 = tmp_ifobj; +} + +int testapp_bidirectional(struct test_spec *test) +{ + int res; + + test->ifobj_tx->rx_on = true; + test->ifobj_rx->tx_on = true; + test->total_steps = 2; + if 
(testapp_validate_traffic(test)) + return TEST_FAILURE; + + print_verbose("Switching Tx/Rx direction\n"); + swap_directions(&test->ifobj_rx, &test->ifobj_tx); + res = __testapp_validate_traffic(test, test->ifobj_rx, test->ifobj_tx); + + swap_directions(&test->ifobj_rx, &test->ifobj_tx); + return res; +} + +static int swap_xsk_resources(struct test_spec *test) +{ + int ret; + + test->ifobj_tx->xsk_arr[0].pkt_stream = NULL; + test->ifobj_rx->xsk_arr[0].pkt_stream = NULL; + test->ifobj_tx->xsk_arr[1].pkt_stream = test->tx_pkt_stream_default; + test->ifobj_rx->xsk_arr[1].pkt_stream = test->rx_pkt_stream_default; + test->ifobj_tx->xsk = &test->ifobj_tx->xsk_arr[1]; + test->ifobj_rx->xsk = &test->ifobj_rx->xsk_arr[1]; + + ret = xsk_update_xskmap(test->ifobj_rx->xskmap, test->ifobj_rx->xsk->xsk, 0); + if (ret) + return TEST_FAILURE; + + return TEST_PASS; +} + +int testapp_xdp_prog_cleanup(struct test_spec *test) +{ + test->total_steps = 2; + test->nb_sockets = 2; + if (testapp_validate_traffic(test)) + return TEST_FAILURE; + + if (swap_xsk_resources(test)) { + clean_sockets(test, test->ifobj_rx); + clean_sockets(test, test->ifobj_tx); + clean_umem(test, test->ifobj_rx, test->ifobj_tx); + return TEST_FAILURE; + } + + return testapp_validate_traffic(test); +} + +int testapp_headroom(struct test_spec *test) +{ + test->ifobj_rx->umem->frame_headroom = UMEM_HEADROOM_TEST_SIZE; + return testapp_validate_traffic(test); +} + +int testapp_stats_rx_dropped(struct test_spec *test) +{ + if (test->mode == TEST_MODE_ZC) { + ksft_print_msg("Can not run RX_DROPPED test for ZC mode\n"); + return TEST_SKIP; + } + + if (pkt_stream_replace_half(test, MIN_PKT_SIZE * 4, 0)) + return TEST_FAILURE; + test->ifobj_rx->umem->frame_headroom = test->ifobj_rx->umem->frame_size - + XDP_PACKET_HEADROOM - MIN_PKT_SIZE * 3; + if (pkt_stream_receive_half(test)) + return TEST_FAILURE; + test->ifobj_rx->validation_func = validate_rx_dropped; + return testapp_validate_traffic(test); +} + +int testapp_stats_tx_invalid_descs(struct test_spec *test) +{ + if (pkt_stream_replace_half(test, XSK_UMEM__INVALID_FRAME_SIZE, 0)) + return TEST_FAILURE; + test->ifobj_tx->validation_func = validate_tx_invalid_descs; + return testapp_validate_traffic(test); +} + +int testapp_stats_rx_full(struct test_spec *test) +{ + if (pkt_stream_replace(test, DEFAULT_UMEM_BUFFERS + DEFAULT_UMEM_BUFFERS / 2, MIN_PKT_SIZE)) + return TEST_FAILURE; + test->ifobj_rx->xsk->pkt_stream = pkt_stream_generate(DEFAULT_UMEM_BUFFERS, MIN_PKT_SIZE); + + test->ifobj_rx->xsk->rxqsize = DEFAULT_UMEM_BUFFERS; + test->ifobj_rx->release_rx = false; + test->ifobj_rx->validation_func = validate_rx_full; + return testapp_validate_traffic(test); +} + +int testapp_stats_fill_empty(struct test_spec *test) +{ + if (pkt_stream_replace(test, DEFAULT_UMEM_BUFFERS + DEFAULT_UMEM_BUFFERS / 2, MIN_PKT_SIZE)) + return TEST_FAILURE; + test->ifobj_rx->xsk->pkt_stream = pkt_stream_generate(DEFAULT_UMEM_BUFFERS, MIN_PKT_SIZE); + + test->ifobj_rx->use_fill_ring = false; + test->ifobj_rx->validation_func = validate_fill_empty; + return testapp_validate_traffic(test); +} + +int testapp_send_receive_unaligned(struct test_spec *test) +{ + test->ifobj_tx->umem->unaligned_mode = true; + test->ifobj_rx->umem->unaligned_mode = true; + /* Let half of the packets straddle a 4K buffer boundary */ + if (pkt_stream_replace_half(test, MIN_PKT_SIZE, -MIN_PKT_SIZE / 2)) + return TEST_FAILURE; + + return testapp_validate_traffic(test); +} + +int testapp_send_receive_unaligned_mb(struct test_spec *test) +{ + test->mtu 
= MAX_ETH_JUMBO_SIZE; + test->ifobj_tx->umem->unaligned_mode = true; + test->ifobj_rx->umem->unaligned_mode = true; + if (pkt_stream_replace(test, DEFAULT_PKT_CNT, MAX_ETH_JUMBO_SIZE)) + return TEST_FAILURE; + return testapp_validate_traffic(test); +} + +int testapp_single_pkt(struct test_spec *test) +{ + struct pkt pkts[] = {{0, MIN_PKT_SIZE, 0, true}}; + + if (pkt_stream_generate_custom(test, pkts, ARRAY_SIZE(pkts))) + return TEST_FAILURE; + return testapp_validate_traffic(test); +} + +int testapp_send_receive_mb(struct test_spec *test) +{ + test->mtu = MAX_ETH_JUMBO_SIZE; + if (pkt_stream_replace(test, DEFAULT_PKT_CNT, MAX_ETH_JUMBO_SIZE)) + return TEST_FAILURE; + + return testapp_validate_traffic(test); +} + +int testapp_invalid_desc_mb(struct test_spec *test) +{ + struct xsk_umem_info *umem = test->ifobj_tx->umem; + u64 umem_size = umem->num_frames * umem->frame_size; + struct pkt pkts[] = { + /* Valid packet for synch to start with */ + {0, MIN_PKT_SIZE, 0, true, 0}, + /* Zero frame len is not legal */ + {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, + {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, + {0, 0, 0, false, 0}, + /* Invalid address in the second frame */ + {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, + {umem_size, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, + /* Invalid len in the middle */ + {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, + {0, XSK_UMEM__INVALID_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, + /* Invalid options in the middle */ + {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, + {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XSK_DESC__INVALID_OPTION}, + /* Transmit 2 frags, receive 3 */ + {0, XSK_UMEM__MAX_FRAME_SIZE, 0, true, XDP_PKT_CONTD}, + {0, XSK_UMEM__MAX_FRAME_SIZE, 0, true, 0}, + /* Middle frame crosses chunk boundary with small length */ + {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, + {-MIN_PKT_SIZE / 2, MIN_PKT_SIZE, 0, false, 0}, + /* Valid packet for synch so that something is received */ + {0, MIN_PKT_SIZE, 0, true, 0}}; + + if (umem->unaligned_mode) { + /* Crossing a chunk boundary allowed */ + pkts[12].valid = true; + pkts[13].valid = true; + } + + test->mtu = MAX_ETH_JUMBO_SIZE; + if (pkt_stream_generate_custom(test, pkts, ARRAY_SIZE(pkts))) + return TEST_FAILURE; + return testapp_validate_traffic(test); +} + +int testapp_invalid_desc(struct test_spec *test) +{ + struct xsk_umem_info *umem = test->ifobj_tx->umem; + u64 umem_size = umem->num_frames * umem->frame_size; + struct pkt pkts[] = { + /* Zero packet address allowed */ + {0, MIN_PKT_SIZE, 0, true}, + /* Allowed packet */ + {0, MIN_PKT_SIZE, 0, true}, + /* Straddling the start of umem */ + {-2, MIN_PKT_SIZE, 0, false}, + /* Packet too large */ + {0, XSK_UMEM__INVALID_FRAME_SIZE, 0, false}, + /* Up to end of umem allowed */ + {umem_size - MIN_PKT_SIZE - 2 * umem->frame_size, MIN_PKT_SIZE, 0, true}, + /* After umem ends */ + {umem_size, MIN_PKT_SIZE, 0, false}, + /* Straddle the end of umem */ + {umem_size - MIN_PKT_SIZE / 2, MIN_PKT_SIZE, 0, false}, + /* Straddle a 4K boundary */ + {0x1000 - MIN_PKT_SIZE / 2, MIN_PKT_SIZE, 0, false}, + /* Straddle a 2K boundary */ + {0x800 - MIN_PKT_SIZE / 2, MIN_PKT_SIZE, 0, true}, + /* Valid packet for synch so that something is received */ + {0, MIN_PKT_SIZE, 0, true}}; + + if (umem->unaligned_mode) { + /* Crossing a page boundary allowed */ + pkts[7].valid = true; + } + if (umem->frame_size == XSK_UMEM__DEFAULT_FRAME_SIZE / 2) { + /* Crossing a 2K frame size boundary not allowed */ 
+ pkts[8].valid = false; + } + + if (test->ifobj_tx->shared_umem) { + pkts[4].offset += umem_size; + pkts[5].offset += umem_size; + pkts[6].offset += umem_size; + } + + if (pkt_stream_generate_custom(test, pkts, ARRAY_SIZE(pkts))) + return TEST_FAILURE; + return testapp_validate_traffic(test); +} + +int testapp_xdp_drop(struct test_spec *test) +{ + struct xsk_xdp_progs *skel_rx = test->ifobj_rx->xdp_progs; + struct xsk_xdp_progs *skel_tx = test->ifobj_tx->xdp_progs; + + test_spec_set_xdp_prog(test, skel_rx->progs.xsk_xdp_drop, skel_tx->progs.xsk_xdp_drop, + skel_rx->maps.xsk, skel_tx->maps.xsk); + + if (pkt_stream_receive_half(test)) + return TEST_FAILURE; + return testapp_validate_traffic(test); +} + +int testapp_xdp_metadata_copy(struct test_spec *test) +{ + struct xsk_xdp_progs *skel_rx = test->ifobj_rx->xdp_progs; + struct xsk_xdp_progs *skel_tx = test->ifobj_tx->xdp_progs; + + test_spec_set_xdp_prog(test, skel_rx->progs.xsk_xdp_populate_metadata, + skel_tx->progs.xsk_xdp_populate_metadata, + skel_rx->maps.xsk, skel_tx->maps.xsk); + test->ifobj_rx->use_metadata = true; + + skel_rx->bss->count = 0; + + return testapp_validate_traffic(test); +} + +int testapp_xdp_shared_umem(struct test_spec *test) +{ + struct xsk_xdp_progs *skel_rx = test->ifobj_rx->xdp_progs; + struct xsk_xdp_progs *skel_tx = test->ifobj_tx->xdp_progs; + int ret; + + test->total_steps = 1; + test->nb_sockets = 2; + + test_spec_set_xdp_prog(test, skel_rx->progs.xsk_xdp_shared_umem, + skel_tx->progs.xsk_xdp_shared_umem, + skel_rx->maps.xsk, skel_tx->maps.xsk); + + if (pkt_stream_even_odd_sequence(test)) + return TEST_FAILURE; + + ret = testapp_validate_traffic(test); + + release_even_odd_sequence(test); + + return ret; +} + +int testapp_poll_txq_tmout(struct test_spec *test) +{ + test->ifobj_tx->use_poll = true; + /* create invalid frame by set umem frame_size and pkt length equal to 2048 */ + test->ifobj_tx->umem->frame_size = 2048; + if (pkt_stream_replace(test, 2 * DEFAULT_PKT_CNT, 2048)) + return TEST_FAILURE; + return testapp_validate_traffic_single_thread(test, test->ifobj_tx); +} + +int testapp_poll_rxq_tmout(struct test_spec *test) +{ + test->ifobj_rx->use_poll = true; + return testapp_validate_traffic_single_thread(test, test->ifobj_rx); +} + +int testapp_too_many_frags(struct test_spec *test) +{ + struct pkt *pkts; + u32 max_frags, i; + int ret = TEST_FAILURE; + + if (test->mode == TEST_MODE_ZC) { + max_frags = test->ifobj_tx->xdp_zc_max_segs; + } else { + max_frags = get_max_skb_frags(); + if (!max_frags) { + ksft_print_msg("Can't get MAX_SKB_FRAGS from system, using default (17)\n"); + max_frags = 17; + } + max_frags += 1; + } + + pkts = calloc(2 * max_frags + 2, sizeof(struct pkt)); + if (!pkts) + return TEST_FAILURE; + + test->mtu = MAX_ETH_JUMBO_SIZE; + + /* Valid packet for synch */ + pkts[0].len = MIN_PKT_SIZE; + pkts[0].valid = true; + + /* One valid packet with the max amount of frags */ + for (i = 1; i < max_frags + 1; i++) { + pkts[i].len = MIN_PKT_SIZE; + pkts[i].options = XDP_PKT_CONTD; + pkts[i].valid = true; + } + pkts[max_frags].options = 0; + + /* An invalid packet with the max amount of frags but signals packet + * continues on the last frag + */ + for (i = max_frags + 1; i < 2 * max_frags + 1; i++) { + pkts[i].len = MIN_PKT_SIZE; + pkts[i].options = XDP_PKT_CONTD; + pkts[i].valid = false; + } + + /* Valid packet for synch */ + pkts[2 * max_frags + 1].len = MIN_PKT_SIZE; + pkts[2 * max_frags + 1].valid = true; + + if (pkt_stream_generate_custom(test, pkts, 2 * max_frags + 2)) { + free(pkts); + 
return TEST_FAILURE; + } + + ret = testapp_validate_traffic(test); + free(pkts); + return ret; +} + +static int xsk_load_xdp_programs(struct ifobject *ifobj) +{ + ifobj->xdp_progs = xsk_xdp_progs__open_and_load(); + if (libbpf_get_error(ifobj->xdp_progs)) + return libbpf_get_error(ifobj->xdp_progs); + + return 0; +} + +/* Simple test */ +static bool hugepages_present(void) +{ + size_t mmap_sz = 2 * DEFAULT_UMEM_BUFFERS * XSK_UMEM__DEFAULT_FRAME_SIZE; + void *bufs; + + bufs = mmap(NULL, mmap_sz, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, MAP_HUGE_2MB); + if (bufs == MAP_FAILED) + return false; + + mmap_sz = ceil_u64(mmap_sz, HUGEPAGE_SIZE) * HUGEPAGE_SIZE; + munmap(bufs, mmap_sz); + return true; +} + +int init_iface(struct ifobject *ifobj, thread_func_t func_ptr) +{ + LIBBPF_OPTS(bpf_xdp_query_opts, query_opts); + int err; + + ifobj->func_ptr = func_ptr; + + err = xsk_load_xdp_programs(ifobj); + if (err) { + ksft_print_msg("Error loading XDP program\n"); + return err; + } + + if (hugepages_present()) + ifobj->unaligned_supp = true; + + err = bpf_xdp_query(ifobj->ifindex, XDP_FLAGS_DRV_MODE, &query_opts); + if (err) { + ksft_print_msg("Error querying XDP capabilities\n"); + return err; + } + if (query_opts.feature_flags & NETDEV_XDP_ACT_RX_SG) + ifobj->multi_buff_supp = true; + if (query_opts.feature_flags & NETDEV_XDP_ACT_XSK_ZEROCOPY) { + if (query_opts.xdp_zc_max_segs > 1) { + ifobj->multi_buff_zc_supp = true; + ifobj->xdp_zc_max_segs = query_opts.xdp_zc_max_segs; + } else { + ifobj->xdp_zc_max_segs = 0; + } + } + + return 0; +} + +int testapp_send_receive(struct test_spec *test) +{ + return testapp_validate_traffic(test); +} + +int testapp_send_receive_2k_frame(struct test_spec *test) +{ + test->ifobj_tx->umem->frame_size = 2048; + test->ifobj_rx->umem->frame_size = 2048; + if (pkt_stream_replace(test, DEFAULT_PKT_CNT, MIN_PKT_SIZE)) + return TEST_FAILURE; + return testapp_validate_traffic(test); +} + +int testapp_poll_rx(struct test_spec *test) +{ + test->ifobj_rx->use_poll = true; + return testapp_validate_traffic(test); +} + +int testapp_poll_tx(struct test_spec *test) +{ + test->ifobj_tx->use_poll = true; + return testapp_validate_traffic(test); +} + +int testapp_aligned_inv_desc(struct test_spec *test) +{ + return testapp_invalid_desc(test); +} + +int testapp_aligned_inv_desc_2k_frame(struct test_spec *test) +{ + test->ifobj_tx->umem->frame_size = 2048; + test->ifobj_rx->umem->frame_size = 2048; + return testapp_invalid_desc(test); +} + +int testapp_unaligned_inv_desc(struct test_spec *test) +{ + test->ifobj_tx->umem->unaligned_mode = true; + test->ifobj_rx->umem->unaligned_mode = true; + return testapp_invalid_desc(test); +} + +int testapp_unaligned_inv_desc_4001_frame(struct test_spec *test) +{ + u64 page_size, umem_size; + + /* Odd frame size so the UMEM doesn't end near a page boundary. */ + test->ifobj_tx->umem->frame_size = 4001; + test->ifobj_rx->umem->frame_size = 4001; + test->ifobj_tx->umem->unaligned_mode = true; + test->ifobj_rx->umem->unaligned_mode = true; + /* This test exists to test descriptors that straddle the end of + * the UMEM but not a page.
+ */ + page_size = sysconf(_SC_PAGESIZE); + umem_size = test->ifobj_tx->umem->num_frames * test->ifobj_tx->umem->frame_size; + assert(umem_size % page_size > MIN_PKT_SIZE); + assert(umem_size % page_size < page_size - MIN_PKT_SIZE); + + return testapp_invalid_desc(test); +} + +int testapp_aligned_inv_desc_mb(struct test_spec *test) +{ + return testapp_invalid_desc_mb(test); +} + +int testapp_unaligned_inv_desc_mb(struct test_spec *test) +{ + test->ifobj_tx->umem->unaligned_mode = true; + test->ifobj_rx->umem->unaligned_mode = true; + return testapp_invalid_desc_mb(test); +} + +int testapp_xdp_metadata(struct test_spec *test) +{ + return testapp_xdp_metadata_copy(test); +} + +int testapp_xdp_metadata_mb(struct test_spec *test) +{ + test->mtu = MAX_ETH_JUMBO_SIZE; + return testapp_xdp_metadata_copy(test); +} + +int testapp_hw_sw_min_ring_size(struct test_spec *test) +{ + int ret; + + test->set_ring = true; + test->total_steps = 2; + test->ifobj_tx->ring.tx_pending = DEFAULT_BATCH_SIZE; + test->ifobj_tx->ring.rx_pending = DEFAULT_BATCH_SIZE * 2; + test->ifobj_tx->xsk->batch_size = 1; + test->ifobj_rx->xsk->batch_size = 1; + ret = testapp_validate_traffic(test); + if (ret) + return ret; + + /* Set batch size to hw_ring_size - 1 */ + test->ifobj_tx->xsk->batch_size = DEFAULT_BATCH_SIZE - 1; + test->ifobj_rx->xsk->batch_size = DEFAULT_BATCH_SIZE - 1; + return testapp_validate_traffic(test); +} + +int testapp_hw_sw_max_ring_size(struct test_spec *test) +{ + u32 max_descs = XSK_RING_PROD__DEFAULT_NUM_DESCS * 4; + int ret; + + test->set_ring = true; + test->total_steps = 2; + test->ifobj_tx->ring.tx_pending = test->ifobj_tx->ring.tx_max_pending; + test->ifobj_tx->ring.rx_pending = test->ifobj_tx->ring.rx_max_pending; + test->ifobj_rx->umem->num_frames = max_descs; + test->ifobj_rx->umem->fill_size = max_descs; + test->ifobj_rx->umem->comp_size = max_descs; + test->ifobj_tx->xsk->batch_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; + test->ifobj_rx->xsk->batch_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; + + ret = testapp_validate_traffic(test); + if (ret) + return ret; + + /* Set batch_size to 8152 for testing, as the ice HW ignores the 3 lowest bits when + * updating the Rx HW tail register. 
+ */ + test->ifobj_tx->xsk->batch_size = test->ifobj_tx->ring.tx_max_pending - 8; + test->ifobj_rx->xsk->batch_size = test->ifobj_tx->ring.tx_max_pending - 8; + if (pkt_stream_replace(test, max_descs, MIN_PKT_SIZE)) { + clean_sockets(test, test->ifobj_tx); + clean_sockets(test, test->ifobj_rx); + clean_umem(test, test->ifobj_rx, test->ifobj_tx); + return TEST_FAILURE; + } + + return testapp_validate_traffic(test); +} + +static int testapp_xdp_adjust_tail(struct test_spec *test, int adjust_value) +{ + struct xsk_xdp_progs *skel_rx = test->ifobj_rx->xdp_progs; + struct xsk_xdp_progs *skel_tx = test->ifobj_tx->xdp_progs; + + test_spec_set_xdp_prog(test, skel_rx->progs.xsk_xdp_adjust_tail, + skel_tx->progs.xsk_xdp_adjust_tail, + skel_rx->maps.xsk, skel_tx->maps.xsk); + + skel_rx->bss->adjust_value = adjust_value; + + return testapp_validate_traffic(test); +} + +static int testapp_adjust_tail(struct test_spec *test, u32 value, u32 pkt_len) +{ + int ret; + + test->adjust_tail_support = true; + test->adjust_tail = true; + test->total_steps = 1; + + ret = pkt_stream_replace_ifobject(test->ifobj_tx, DEFAULT_BATCH_SIZE, pkt_len); + if (ret) + return TEST_FAILURE; + + ret = pkt_stream_replace_ifobject(test->ifobj_rx, DEFAULT_BATCH_SIZE, pkt_len + value); + if (ret) + return TEST_FAILURE; + + ret = testapp_xdp_adjust_tail(test, value); + if (ret) + return ret; + + if (!test->adjust_tail_support) { + ksft_print_msg("%s %sResize pkt with bpf_xdp_adjust_tail() not supported\n", + mode_string(test), busy_poll_string(test)); + return TEST_SKIP; + } + + return 0; +} + +int testapp_adjust_tail_shrink(struct test_spec *test) +{ + /* Shrink by 4 bytes for testing purpose */ + return testapp_adjust_tail(test, -4, MIN_PKT_SIZE * 2); +} + +int testapp_adjust_tail_shrink_mb(struct test_spec *test) +{ + test->mtu = MAX_ETH_JUMBO_SIZE; + /* Shrink by the frag size */ + return testapp_adjust_tail(test, -XSK_UMEM__MAX_FRAME_SIZE, XSK_UMEM__LARGE_FRAME_SIZE * 2); +} + +int testapp_adjust_tail_grow(struct test_spec *test) +{ + /* Grow by 4 bytes for testing purpose */ + return testapp_adjust_tail(test, 4, MIN_PKT_SIZE * 2); +} + +int testapp_adjust_tail_grow_mb(struct test_spec *test) +{ + test->mtu = MAX_ETH_JUMBO_SIZE; + /* Grow by (frag_size - last_frag_Size) - 1 to stay inside the last fragment */ + return testapp_adjust_tail(test, (XSK_UMEM__MAX_FRAME_SIZE / 2) - 1, + XSK_UMEM__LARGE_FRAME_SIZE * 2); +} + +int testapp_tx_queue_consumer(struct test_spec *test) +{ + int nr_packets; + + if (test->mode == TEST_MODE_ZC) { + ksft_print_msg("Can not run TX_QUEUE_CONSUMER test for ZC mode\n"); + return TEST_SKIP; + } + + nr_packets = MAX_TX_BUDGET_DEFAULT + 1; + if (pkt_stream_replace(test, nr_packets, MIN_PKT_SIZE)) + return TEST_FAILURE; + test->ifobj_tx->xsk->batch_size = nr_packets; + test->ifobj_tx->xsk->check_consumer = true; + + return testapp_validate_traffic(test); +} + +struct ifobject *ifobject_create(void) +{ + struct ifobject *ifobj; + + ifobj = calloc(1, sizeof(struct ifobject)); + if (!ifobj) + return NULL; + + ifobj->xsk_arr = calloc(MAX_SOCKETS, sizeof(*ifobj->xsk_arr)); + if (!ifobj->xsk_arr) + goto out_xsk_arr; + + ifobj->umem = calloc(1, sizeof(*ifobj->umem)); + if (!ifobj->umem) + goto out_umem; + + return ifobj; + +out_umem: + free(ifobj->xsk_arr); +out_xsk_arr: + free(ifobj); + return NULL; +} + +void ifobject_delete(struct ifobject *ifobj) +{ + free(ifobj->umem); + free(ifobj->xsk_arr); + free(ifobj); +} diff --git a/tools/testing/selftests/bpf/prog_tests/test_xsk.h 
b/tools/testing/selftests/bpf/prog_tests/test_xsk.h new file mode 100644 index 00000000000000..8fc78a057de0be --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_xsk.h @@ -0,0 +1,298 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef TEST_XSK_H_ +#define TEST_XSK_H_ + +#include <linux/ethtool.h> +#include <linux/if_xdp.h> + +#include "../kselftest.h" +#include "xsk.h" + +#ifndef SO_PREFER_BUSY_POLL +#define SO_PREFER_BUSY_POLL 69 +#endif + +#ifndef SO_BUSY_POLL_BUDGET +#define SO_BUSY_POLL_BUDGET 70 +#endif + +#define TEST_PASS 0 +#define TEST_FAILURE -1 +#define TEST_CONTINUE 1 +#define TEST_SKIP 2 + +#define DEFAULT_PKT_CNT (4 * 1024) +#define DEFAULT_UMEM_BUFFERS (DEFAULT_PKT_CNT / 4) +#define HUGEPAGE_SIZE (2 * 1024 * 1024) +#define MIN_PKT_SIZE 64 +#define MAX_ETH_PKT_SIZE 1518 +#define MAX_INTERFACE_NAME_CHARS 16 +#define MAX_TEST_NAME_SIZE 48 +#define SOCK_RECONF_CTR 10 +#define USLEEP_MAX 10000 + +extern bool opt_verbose; +#define print_verbose(x...) do { if (opt_verbose) ksft_print_msg(x); } while (0) + + +static inline u32 ceil_u32(u32 a, u32 b) +{ + return (a + b - 1) / b; +} + +static inline u64 ceil_u64(u64 a, u64 b) +{ + return (a + b - 1) / b; +} + +/* Simple test */ +enum test_mode { + TEST_MODE_SKB, + TEST_MODE_DRV, + TEST_MODE_ZC, + TEST_MODE_ALL +}; + +struct ifobject; +struct test_spec; +typedef int (*validation_func_t)(struct ifobject *ifobj); +typedef void *(*thread_func_t)(void *arg); +typedef int (*test_func_t)(struct test_spec *test); + +struct xsk_socket_info { + struct xsk_ring_cons rx; + struct xsk_ring_prod tx; + struct xsk_umem_info *umem; + struct xsk_socket *xsk; + struct pkt_stream *pkt_stream; + u32 outstanding_tx; + u32 rxqsize; + u32 batch_size; + u8 dst_mac[ETH_ALEN]; + u8 src_mac[ETH_ALEN]; + bool check_consumer; +}; + +int kick_rx(struct xsk_socket_info *xsk); +int kick_tx(struct xsk_socket_info *xsk); + +struct xsk_umem_info { + struct xsk_ring_prod fq; + struct xsk_ring_cons cq; + struct xsk_umem *umem; + u64 next_buffer; + u32 num_frames; + u32 frame_headroom; + void *buffer; + u32 frame_size; + u32 base_addr; + u32 fill_size; + u32 comp_size; + bool unaligned_mode; +}; + +struct set_hw_ring { + u32 default_tx; + u32 default_rx; +}; + +int hw_ring_size_reset(struct ifobject *ifobj); + +struct ifobject { + char ifname[MAX_INTERFACE_NAME_CHARS]; + struct xsk_socket_info *xsk; + struct xsk_socket_info *xsk_arr; + struct xsk_umem_info *umem; + thread_func_t func_ptr; + validation_func_t validation_func; + struct xsk_xdp_progs *xdp_progs; + struct bpf_map *xskmap; + struct bpf_program *xdp_prog; + struct ethtool_ringparam ring; + struct set_hw_ring set_ring; + enum test_mode mode; + int ifindex; + int mtu; + u32 bind_flags; + u32 xdp_zc_max_segs; + bool tx_on; + bool rx_on; + bool use_poll; + bool busy_poll; + bool use_fill_ring; + bool release_rx; + bool shared_umem; + bool use_metadata; + bool unaligned_supp; + bool multi_buff_supp; + bool multi_buff_zc_supp; + bool hw_ring_size_supp; +}; +struct ifobject *ifobject_create(void); +void ifobject_delete(struct ifobject *ifobj); +int init_iface(struct ifobject *ifobj, thread_func_t func_ptr); + +int xsk_configure_umem(struct ifobject *ifobj, struct xsk_umem_info *umem, void *buffer, u64 size); +int xsk_configure_socket(struct xsk_socket_info *xsk, struct xsk_umem_info *umem, + struct ifobject *ifobject, bool shared); + + +struct pkt { + int offset; + u32 len; + u32 pkt_nb; + bool valid; + u16 options; +}; + +struct pkt_stream { + u32 nb_pkts; + u32 current_pkt_nb; + struct pkt *pkts; + u32 
max_pkt_len; + u32 nb_rx_pkts; + u32 nb_valid_entries; + bool verbatim; +}; + +static inline bool pkt_continues(u32 options) +{ + return options & XDP_PKT_CONTD; +} + +struct pkt_stream *pkt_stream_generate(u32 nb_pkts, u32 pkt_len); +void pkt_stream_delete(struct pkt_stream *pkt_stream); +void pkt_stream_reset(struct pkt_stream *pkt_stream); +void pkt_stream_restore_default(struct test_spec *test); + +struct test_spec { + struct ifobject *ifobj_tx; + struct ifobject *ifobj_rx; + struct pkt_stream *tx_pkt_stream_default; + struct pkt_stream *rx_pkt_stream_default; + struct bpf_program *xdp_prog_rx; + struct bpf_program *xdp_prog_tx; + struct bpf_map *xskmap_rx; + struct bpf_map *xskmap_tx; + test_func_t test_func; + int mtu; + u16 total_steps; + u16 current_step; + u16 nb_sockets; + bool fail; + bool set_ring; + bool adjust_tail; + bool adjust_tail_support; + enum test_mode mode; + char name[MAX_TEST_NAME_SIZE]; +}; + +#define busy_poll_string(test) (test)->ifobj_tx->busy_poll ? "BUSY-POLL " : "" +static inline char *mode_string(struct test_spec *test) +{ + switch (test->mode) { + case TEST_MODE_SKB: + return "SKB"; + case TEST_MODE_DRV: + return "DRV"; + case TEST_MODE_ZC: + return "ZC"; + default: + return "BOGUS"; + } +} + +void test_init(struct test_spec *test, struct ifobject *ifobj_tx, + struct ifobject *ifobj_rx, enum test_mode mode, + const struct test_spec *test_to_run); + +int testapp_adjust_tail_grow(struct test_spec *test); +int testapp_adjust_tail_grow_mb(struct test_spec *test); +int testapp_adjust_tail_shrink(struct test_spec *test); +int testapp_adjust_tail_shrink_mb(struct test_spec *test); +int testapp_aligned_inv_desc(struct test_spec *test); +int testapp_aligned_inv_desc_2k_frame(struct test_spec *test); +int testapp_aligned_inv_desc_mb(struct test_spec *test); +int testapp_bidirectional(struct test_spec *test); +int testapp_headroom(struct test_spec *test); +int testapp_hw_sw_max_ring_size(struct test_spec *test); +int testapp_hw_sw_min_ring_size(struct test_spec *test); +int testapp_poll_rx(struct test_spec *test); +int testapp_poll_rxq_tmout(struct test_spec *test); +int testapp_poll_tx(struct test_spec *test); +int testapp_poll_txq_tmout(struct test_spec *test); +int testapp_send_receive(struct test_spec *test); +int testapp_send_receive_2k_frame(struct test_spec *test); +int testapp_send_receive_mb(struct test_spec *test); +int testapp_send_receive_unaligned(struct test_spec *test); +int testapp_send_receive_unaligned_mb(struct test_spec *test); +int testapp_single_pkt(struct test_spec *test); +int testapp_stats_fill_empty(struct test_spec *test); +int testapp_stats_rx_dropped(struct test_spec *test); +int testapp_stats_tx_invalid_descs(struct test_spec *test); +int testapp_stats_rx_full(struct test_spec *test); +int testapp_teardown(struct test_spec *test); +int testapp_too_many_frags(struct test_spec *test); +int testapp_tx_queue_consumer(struct test_spec *test); +int testapp_unaligned_inv_desc(struct test_spec *test); +int testapp_unaligned_inv_desc_4001_frame(struct test_spec *test); +int testapp_unaligned_inv_desc_mb(struct test_spec *test); +int testapp_xdp_drop(struct test_spec *test); +int testapp_xdp_metadata(struct test_spec *test); +int testapp_xdp_metadata_mb(struct test_spec *test); +int testapp_xdp_prog_cleanup(struct test_spec *test); +int testapp_xdp_shared_umem(struct test_spec *test); + +void *worker_testapp_validate_rx(void *arg); +void *worker_testapp_validate_tx(void *arg); + +static const struct test_spec tests[] = { + {.name = "SEND_RECEIVE", 
.test_func = testapp_send_receive}, + {.name = "SEND_RECEIVE_2K_FRAME", .test_func = testapp_send_receive_2k_frame}, + {.name = "SEND_RECEIVE_SINGLE_PKT", .test_func = testapp_single_pkt}, + {.name = "POLL_RX", .test_func = testapp_poll_rx}, + {.name = "POLL_TX", .test_func = testapp_poll_tx}, + {.name = "POLL_RXQ_FULL", .test_func = testapp_poll_rxq_tmout}, + {.name = "POLL_TXQ_FULL", .test_func = testapp_poll_txq_tmout}, + {.name = "ALIGNED_INV_DESC", .test_func = testapp_aligned_inv_desc}, + {.name = "ALIGNED_INV_DESC_2K_FRAME_SIZE", .test_func = testapp_aligned_inv_desc_2k_frame}, + {.name = "UMEM_HEADROOM", .test_func = testapp_headroom}, + {.name = "BIDIRECTIONAL", .test_func = testapp_bidirectional}, + {.name = "STAT_RX_DROPPED", .test_func = testapp_stats_rx_dropped}, + {.name = "STAT_TX_INVALID", .test_func = testapp_stats_tx_invalid_descs}, + {.name = "STAT_RX_FULL", .test_func = testapp_stats_rx_full}, + {.name = "STAT_FILL_EMPTY", .test_func = testapp_stats_fill_empty}, + {.name = "XDP_PROG_CLEANUP", .test_func = testapp_xdp_prog_cleanup}, + {.name = "XDP_DROP_HALF", .test_func = testapp_xdp_drop}, + {.name = "XDP_SHARED_UMEM", .test_func = testapp_xdp_shared_umem}, + {.name = "XDP_METADATA_COPY", .test_func = testapp_xdp_metadata}, + {.name = "XDP_METADATA_COPY_MULTI_BUFF", .test_func = testapp_xdp_metadata_mb}, + {.name = "ALIGNED_INV_DESC_MULTI_BUFF", .test_func = testapp_aligned_inv_desc_mb}, + {.name = "TOO_MANY_FRAGS", .test_func = testapp_too_many_frags}, + {.name = "XDP_ADJUST_TAIL_SHRINK", .test_func = testapp_adjust_tail_shrink}, + {.name = "TX_QUEUE_CONSUMER", .test_func = testapp_tx_queue_consumer}, + }; + +static const struct test_spec ci_skip_tests[] = { + /* Flaky tests */ + {.name = "XDP_ADJUST_TAIL_SHRINK_MULTI_BUFF", .test_func = testapp_adjust_tail_shrink_mb}, + {.name = "XDP_ADJUST_TAIL_GROW", .test_func = testapp_adjust_tail_grow}, + {.name = "XDP_ADJUST_TAIL_GROW_MULTI_BUFF", .test_func = testapp_adjust_tail_grow_mb}, + {.name = "SEND_RECEIVE_9K_PACKETS", .test_func = testapp_send_receive_mb}, + /* Tests with huge page dependency */ + {.name = "SEND_RECEIVE_UNALIGNED", .test_func = testapp_send_receive_unaligned}, + {.name = "UNALIGNED_INV_DESC", .test_func = testapp_unaligned_inv_desc}, + {.name = "UNALIGNED_INV_DESC_4001_FRAME_SIZE", + .test_func = testapp_unaligned_inv_desc_4001_frame}, + {.name = "SEND_RECEIVE_UNALIGNED_9K_PACKETS", + .test_func = testapp_send_receive_unaligned_mb}, + {.name = "UNALIGNED_INV_DESC_MULTI_BUFF", .test_func = testapp_unaligned_inv_desc_mb}, + /* Test with HW ring size dependency */ + {.name = "HW_SW_MIN_RING_SIZE", .test_func = testapp_hw_sw_min_ring_size}, + {.name = "HW_SW_MAX_RING_SIZE", .test_func = testapp_hw_sw_max_ring_size}, + /* Too long test */ + {.name = "TEARDOWN", .test_func = testapp_teardown}, +}; + + +#endif /* TEST_XSK_H_ */ diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 28e81161e6fca9..4b4b081b46ccc8 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -7,6 +7,7 @@ #include "verifier_arena.skel.h" #include "verifier_arena_large.skel.h" #include "verifier_array_access.skel.h" +#include "verifier_async_cb_context.skel.h" #include "verifier_basic_stack.skel.h" #include "verifier_bitfield_write.skel.h" #include "verifier_bounds.skel.h" @@ -34,6 +35,7 @@ #include "verifier_global_subprogs.skel.h" #include "verifier_global_ptr_args.skel.h" #include 
"verifier_gotol.skel.h" +#include "verifier_gotox.skel.h" #include "verifier_helper_access_var_len.skel.h" #include "verifier_helper_packet_access.skel.h" #include "verifier_helper_restricted.skel.h" @@ -172,6 +174,7 @@ void test_verifier_div_overflow(void) { RUN(verifier_div_overflow); } void test_verifier_global_subprogs(void) { RUN(verifier_global_subprogs); } void test_verifier_global_ptr_args(void) { RUN(verifier_global_ptr_args); } void test_verifier_gotol(void) { RUN(verifier_gotol); } +void test_verifier_gotox(void) { RUN(verifier_gotox); } void test_verifier_helper_access_var_len(void) { RUN(verifier_helper_access_var_len); } void test_verifier_helper_packet_access(void) { RUN(verifier_helper_packet_access); } void test_verifier_helper_restricted(void) { RUN(verifier_helper_restricted); } @@ -280,6 +283,7 @@ void test_verifier_array_access(void) verifier_array_access__elf_bytes, init_array_access_maps); } +void test_verifier_async_cb_context(void) { RUN(verifier_async_cb_context); } static int init_value_ptr_arith_maps(struct bpf_object *obj) { diff --git a/tools/testing/selftests/bpf/prog_tests/wq.c b/tools/testing/selftests/bpf/prog_tests/wq.c index 99e438fe12acd5..15c67d23128b24 100644 --- a/tools/testing/selftests/bpf/prog_tests/wq.c +++ b/tools/testing/selftests/bpf/prog_tests/wq.c @@ -38,3 +38,59 @@ void serial_test_failures_wq(void) { RUN_TESTS(wq_failures); } + +static void test_failure_map_no_btf(void) +{ + struct wq *skel = NULL; + char log[8192]; + const struct bpf_insn *insns; + size_t insn_cnt; + int ret, err, map_fd; + LIBBPF_OPTS(bpf_prog_load_opts, opts, .log_size = sizeof(log), .log_buf = log, + .log_level = 2); + + skel = wq__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + err = bpf_object__prepare(skel->obj); + if (!ASSERT_OK(err, "skel__prepare")) + goto out; + + map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "map_no_btf", sizeof(__u32), sizeof(__u64), 100, + NULL); + if (!ASSERT_GT(map_fd, -1, "map create")) + goto out; + + err = bpf_map__reuse_fd(skel->maps.array, map_fd); + if (!ASSERT_OK(err, "map reuse fd")) { + close(map_fd); + goto out; + } + + insns = bpf_program__insns(skel->progs.test_map_no_btf); + if (!ASSERT_OK_PTR(insns, "insns ptr")) + goto out; + + insn_cnt = bpf_program__insn_cnt(skel->progs.test_map_no_btf); + if (!ASSERT_GT(insn_cnt, 0u, "insn cnt")) + goto out; + + ret = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", insns, insn_cnt, &opts); + if (!ASSERT_LT(ret, 0, "prog load failed")) { + if (ret > 0) + close(ret); + goto out; + } + + ASSERT_HAS_SUBSTR(log, "map 'map_no_btf' has to have BTF in order to use bpf_wq", + "log complains no map BTF"); +out: + wq__destroy(skel); +} + +void test_wq_custom(void) +{ + if (test__start_subtest("test_failure_map_no_btf")) + test_failure_map_no_btf(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/xsk.c b/tools/testing/selftests/bpf/prog_tests/xsk.c new file mode 100644 index 00000000000000..dd4c35c0e428b4 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/xsk.c @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <net/if.h> +#include <stdarg.h> + +#include "network_helpers.h" +#include "test_progs.h" +#include "test_xsk.h" +#include "xsk_xdp_progs.skel.h" + +#define VETH_RX "veth0" +#define VETH_TX "veth1" +#define MTU 1500 + +int setup_veth(bool busy_poll) +{ + SYS(fail, + "ip link add %s numtxqueues 4 numrxqueues 4 type veth peer name %s numtxqueues 4 numrxqueues 4", + VETH_RX, VETH_TX); + SYS(fail, "sysctl -wq net.ipv6.conf.%s.disable_ipv6=1", VETH_RX); 
+ SYS(fail, "sysctl -wq net.ipv6.conf.%s.disable_ipv6=1", VETH_TX); + + if (busy_poll) { + SYS(fail, "echo 2 > /sys/class/net/%s/napi_defer_hard_irqs", VETH_RX); + SYS(fail, "echo 200000 > /sys/class/net/%s/gro_flush_timeout", VETH_RX); + SYS(fail, "echo 2 > /sys/class/net/%s/napi_defer_hard_irqs", VETH_TX); + SYS(fail, "echo 200000 > /sys/class/net/%s/gro_flush_timeout", VETH_TX); + } + + SYS(fail, "ip link set %s mtu %d", VETH_RX, MTU); + SYS(fail, "ip link set %s mtu %d", VETH_TX, MTU); + SYS(fail, "ip link set %s up", VETH_RX); + SYS(fail, "ip link set %s up", VETH_TX); + + return 0; + +fail: + return -1; +} + +void delete_veth(void) +{ + SYS_NOFAIL("ip link del %s", VETH_RX); + SYS_NOFAIL("ip link del %s", VETH_TX); +} + +int configure_ifobj(struct ifobject *tx, struct ifobject *rx) +{ + rx->ifindex = if_nametoindex(VETH_RX); + if (!ASSERT_OK_FD(rx->ifindex, "get RX ifindex")) + return -1; + + tx->ifindex = if_nametoindex(VETH_TX); + if (!ASSERT_OK_FD(tx->ifindex, "get TX ifindex")) + return -1; + + tx->shared_umem = false; + rx->shared_umem = false; + + + return 0; +} + +static void test_xsk(const struct test_spec *test_to_run, enum test_mode mode) +{ + struct ifobject *ifobj_tx, *ifobj_rx; + struct test_spec test; + int ret; + + ifobj_tx = ifobject_create(); + if (!ASSERT_OK_PTR(ifobj_tx, "create ifobj_tx")) + return; + + ifobj_rx = ifobject_create(); + if (!ASSERT_OK_PTR(ifobj_rx, "create ifobj_rx")) + goto delete_tx; + + if (!ASSERT_OK(configure_ifobj(ifobj_tx, ifobj_rx), "conigure ifobj")) + goto delete_rx; + + ret = get_hw_ring_size(ifobj_tx->ifname, &ifobj_tx->ring); + if (!ret) { + ifobj_tx->hw_ring_size_supp = true; + ifobj_tx->set_ring.default_tx = ifobj_tx->ring.tx_pending; + ifobj_tx->set_ring.default_rx = ifobj_tx->ring.rx_pending; + } + + if (!ASSERT_OK(init_iface(ifobj_rx, worker_testapp_validate_rx), "init RX")) + goto delete_rx; + if (!ASSERT_OK(init_iface(ifobj_tx, worker_testapp_validate_tx), "init TX")) + goto delete_rx; + + test_init(&test, ifobj_tx, ifobj_rx, 0, &tests[0]); + + test.tx_pkt_stream_default = pkt_stream_generate(DEFAULT_PKT_CNT, MIN_PKT_SIZE); + if (!ASSERT_OK_PTR(test.tx_pkt_stream_default, "TX pkt generation")) + goto delete_rx; + test.rx_pkt_stream_default = pkt_stream_generate(DEFAULT_PKT_CNT, MIN_PKT_SIZE); + if (!ASSERT_OK_PTR(test.rx_pkt_stream_default, "RX pkt generation")) + goto delete_rx; + + + test_init(&test, ifobj_tx, ifobj_rx, mode, test_to_run); + ret = test.test_func(&test); + if (ret != TEST_SKIP) + ASSERT_OK(ret, "Run test"); + pkt_stream_restore_default(&test); + + if (ifobj_tx->hw_ring_size_supp) + hw_ring_size_reset(ifobj_tx); + + pkt_stream_delete(test.tx_pkt_stream_default); + pkt_stream_delete(test.rx_pkt_stream_default); + xsk_xdp_progs__destroy(ifobj_tx->xdp_progs); + xsk_xdp_progs__destroy(ifobj_rx->xdp_progs); + +delete_rx: + ifobject_delete(ifobj_rx); +delete_tx: + ifobject_delete(ifobj_tx); +} + +void test_ns_xsk_skb(void) +{ + int i; + + if (!ASSERT_OK(setup_veth(false), "setup veth")) + return; + + for (i = 0; i < ARRAY_SIZE(tests); i++) { + if (test__start_subtest(tests[i].name)) + test_xsk(&tests[i], TEST_MODE_SKB); + } + + delete_veth(); +} + +void test_ns_xsk_drv(void) +{ + int i; + + if (!ASSERT_OK(setup_veth(false), "setup veth")) + return; + + for (i = 0; i < ARRAY_SIZE(tests); i++) { + if (test__start_subtest(tests[i].name)) + test_xsk(&tests[i], TEST_MODE_DRV); + } + + delete_veth(); +} + diff --git a/tools/testing/selftests/bpf/progs/arena_strsearch.c b/tools/testing/selftests/bpf/progs/arena_strsearch.c 
new file mode 100644 index 00000000000000..ef6b76658f7f1c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/arena_strsearch.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ +#include <vmlinux.h> +#include "bpf_experimental.h" + +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, 100); /* number of pages */ +} arena SEC(".maps"); + +#include "bpf_arena_strsearch.h" + +struct glob_test { + char const __arena *pat, *str; + bool expected; +}; + +static bool test(char const __arena *pat, char const __arena *str, bool expected) +{ + bool match = glob_match(pat, str); + bool success = match == expected; + + /* bpf_printk("glob_match %s %s res %d ok %d", pat, str, match, success); */ + return success; +} + +/* + * The tests are all jammed together in one array to make it simpler + * to place that array in the .init.rodata section. The obvious + * "array of structures containing char *" has no way to force the + * pointed-to strings to be in a particular section. + * + * Anyway, a test consists of: + * 1. Expected glob_match result: '1' or '0'. + * 2. Pattern to match: null-terminated string + * 3. String to match against: null-terminated string + * + * The list of tests is terminated with a final '\0' instead of + * a glob_match result character. + */ +static const char __arena glob_tests[] = + /* Some basic tests */ + "1" "a\0" "a\0" + "0" "a\0" "b\0" + "0" "a\0" "aa\0" + "0" "a\0" "\0" + "1" "\0" "\0" + "0" "\0" "a\0" + /* Simple character class tests */ + "1" "[a]\0" "a\0" + "0" "[a]\0" "b\0" + "0" "[!a]\0" "a\0" + "1" "[!a]\0" "b\0" + "1" "[ab]\0" "a\0" + "1" "[ab]\0" "b\0" + "0" "[ab]\0" "c\0" + "1" "[!ab]\0" "c\0" + "1" "[a-c]\0" "b\0" + "0" "[a-c]\0" "d\0" + /* Corner cases in character class parsing */ + "1" "[a-c-e-g]\0" "-\0" + "0" "[a-c-e-g]\0" "d\0" + "1" "[a-c-e-g]\0" "f\0" + "1" "[]a-ceg-ik[]\0" "a\0" + "1" "[]a-ceg-ik[]\0" "]\0" + "1" "[]a-ceg-ik[]\0" "[\0" + "1" "[]a-ceg-ik[]\0" "h\0" + "0" "[]a-ceg-ik[]\0" "f\0" + "0" "[!]a-ceg-ik[]\0" "h\0" + "0" "[!]a-ceg-ik[]\0" "]\0" + "1" "[!]a-ceg-ik[]\0" "f\0" + /* Simple wild cards */ + "1" "?\0" "a\0" + "0" "?\0" "aa\0" + "0" "??\0" "a\0" + "1" "?x?\0" "axb\0" + "0" "?x?\0" "abx\0" + "0" "?x?\0" "xab\0" + /* Asterisk wild cards (backtracking) */ + "0" "*??\0" "a\0" + "1" "*??\0" "ab\0" + "1" "*??\0" "abc\0" + "1" "*??\0" "abcd\0" + "0" "??*\0" "a\0" + "1" "??*\0" "ab\0" + "1" "??*\0" "abc\0" + "1" "??*\0" "abcd\0" + "0" "?*?\0" "a\0" + "1" "?*?\0" "ab\0" + "1" "?*?\0" "abc\0" + "1" "?*?\0" "abcd\0" + "1" "*b\0" "b\0" + "1" "*b\0" "ab\0" + "0" "*b\0" "ba\0" + "1" "*b\0" "bb\0" + "1" "*b\0" "abb\0" + "1" "*b\0" "bab\0" + "1" "*bc\0" "abbc\0" + "1" "*bc\0" "bc\0" + "1" "*bc\0" "bbc\0" + "1" "*bc\0" "bcbc\0" + /* Multiple asterisks (complex backtracking) */ + "1" "*ac*\0" "abacadaeafag\0" + "1" "*ac*ae*ag*\0" "abacadaeafag\0" + "1" "*a*b*[bc]*[ef]*g*\0" "abacadaeafag\0" + "0" "*a*b*[ef]*[cd]*g*\0" "abacadaeafag\0" + "1" "*abcd*\0" "abcabcabcabcdefg\0" + "1" "*ab*cd*\0" "abcabcabcabcdefg\0" + "1" "*abcd*abcdef*\0" "abcabcdabcdeabcdefg\0" + "0" "*abcd*\0" "abcabcabcabcefg\0" + "0" "*ab*cd*\0" "abcabcabcabcefg\0"; + +bool skip = false; + +SEC("syscall") +int arena_strsearch(void *ctx) +{ + unsigned successes = 0; + unsigned n = 0; + char const __arena *p = glob_tests; + + /* + * Tests are jammed together in a string. 
The first byte is '1' + * or '0' to indicate the expected outcome, or '\0' to indicate the + * end of the tests. Then come two null-terminated strings: the + * pattern and the string to match it against. + */ + while (*p) { + bool expected = *p++ & 1; + char const __arena *pat = p; + + cond_break; + p += bpf_arena_strlen(p) + 1; + successes += test(pat, p, expected); + p += bpf_arena_strlen(p) + 1; + n++; + } + + n -= successes; + /* bpf_printk("glob: %u self-tests passed, %u failed\n", successes, n); */ + + return n ? -1 : 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c index 4e51785e7606e7..9af19dfe4e80b0 100644 --- a/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c +++ b/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c @@ -22,10 +22,6 @@ #define TCP_PACING_CA_RATIO (120) #define TCP_REORDERING (12) -#define min(a, b) ((a) < (b) ? (a) : (b)) -#define max(a, b) ((a) > (b) ? (a) : (b)) -#define after(seq2, seq1) before(seq1, seq2) - extern void cubictcp_init(struct sock *sk) __ksym; extern void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym; extern __u32 cubictcp_recalc_ssthresh(struct sock *sk) __ksym; @@ -34,11 +30,6 @@ extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym; extern void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) __ksym; extern void cubictcp_cong_avoid(struct sock *sk, __u32 ack, __u32 acked) __ksym; -static bool before(__u32 seq1, __u32 seq2) -{ - return (__s32)(seq1-seq2) < 0; -} - static __u64 div64_u64(__u64 dividend, __u64 divisor) { return dividend / divisor; diff --git a/tools/testing/selftests/bpf/progs/bpf_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cubic.c index f089faa97ae6eb..46fb2b37d3a706 100644 --- a/tools/testing/selftests/bpf/progs/bpf_cubic.c +++ b/tools/testing/selftests/bpf/progs/bpf_cubic.c @@ -20,13 +20,6 @@ char _license[] SEC("license") = "GPL"; #define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi) -#define min(a, b) ((a) < (b) ? (a) : (b)) -#define max(a, b) ((a) > (b) ? (a) : (b)) -static bool before(__u32 seq1, __u32 seq2) -{ - return (__s32)(seq1-seq2) < 0; -} -#define after(seq2, seq1) before(seq1, seq2) extern __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked) __ksym; extern void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) __ksym; diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c index 32c511bcd60b3a..1cc83140849f49 100644 --- a/tools/testing/selftests/bpf/progs/bpf_dctcp.c +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c @@ -13,16 +13,10 @@ #ifndef EBUSY #define EBUSY 16 #endif -#define min(a, b) ((a) < (b) ? (a) : (b)) -#define max(a, b) ((a) > (b) ? (a) : (b)) #define min_not_zero(x, y) ({ \ typeof(x) __x = (x); \ typeof(y) __y = (y); \ __x == 0 ? __y : ((__y == 0) ? 
__x : min(__x, __y)); }) -static bool before(__u32 seq1, __u32 seq2) -{ - return (__s32)(seq1-seq2) < 0; -} char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bpf_gotox.c b/tools/testing/selftests/bpf/progs/bpf_gotox.c new file mode 100644 index 00000000000000..216c71b94c644d --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_gotox.c @@ -0,0 +1,448 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> +#include "bpf_misc.h" + +__u64 in_user; +__u64 ret_user; + +int pid; + +/* + * Skip all the tests if compiler doesn't support indirect jumps. + * + * If tests are skipped, then all functions below are compiled as + * dummy, such that the skeleton looks the same, and the userspace + * program can avoid any checks rather than if data->skip is set. + */ +#ifdef __BPF_FEATURE_GOTOX +__u64 skip SEC(".data") = 0; +#else +__u64 skip = 1; +#endif + +struct simple_ctx { + __u64 x; +}; + +#ifdef __BPF_FEATURE_GOTOX +__u64 some_var; + +/* + * This function adds code which will be replaced by a different + * number of instructions by the verifier. This adds additional + * stress on testing the insn_array maps corresponding to indirect jumps. + */ +static __always_inline void adjust_insns(__u64 x) +{ + some_var ^= x + bpf_jiffies64(); +} + +SEC("syscall") +int one_switch(struct simple_ctx *ctx) +{ + switch (ctx->x) { + case 0: + adjust_insns(ctx->x + 1); + ret_user = 2; + break; + case 1: + adjust_insns(ctx->x + 7); + ret_user = 3; + break; + case 2: + adjust_insns(ctx->x + 9); + ret_user = 4; + break; + case 3: + adjust_insns(ctx->x + 11); + ret_user = 5; + break; + case 4: + adjust_insns(ctx->x + 17); + ret_user = 7; + break; + default: + adjust_insns(ctx->x + 177); + ret_user = 19; + break; + } + + return 0; +} + +SEC("syscall") +int one_switch_non_zero_sec_off(struct simple_ctx *ctx) +{ + switch (ctx->x) { + case 0: + adjust_insns(ctx->x + 1); + ret_user = 2; + break; + case 1: + adjust_insns(ctx->x + 7); + ret_user = 3; + break; + case 2: + adjust_insns(ctx->x + 9); + ret_user = 4; + break; + case 3: + adjust_insns(ctx->x + 11); + ret_user = 5; + break; + case 4: + adjust_insns(ctx->x + 17); + ret_user = 7; + break; + default: + adjust_insns(ctx->x + 177); + ret_user = 19; + break; + } + + return 0; +} + +SEC("fentry/" SYS_PREFIX "sys_nanosleep") +int simple_test_other_sec(struct pt_regs *ctx) +{ + __u64 x = in_user; + + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + switch (x) { + case 0: + adjust_insns(x + 1); + ret_user = 2; + break; + case 1: + adjust_insns(x + 7); + ret_user = 3; + break; + case 2: + adjust_insns(x + 9); + ret_user = 4; + break; + case 3: + adjust_insns(x + 11); + ret_user = 5; + break; + case 4: + adjust_insns(x + 17); + ret_user = 7; + break; + default: + adjust_insns(x + 177); + ret_user = 19; + break; + } + + return 0; +} + +SEC("syscall") +int two_switches(struct simple_ctx *ctx) +{ + switch (ctx->x) { + case 0: + adjust_insns(ctx->x + 1); + ret_user = 2; + break; + case 1: + adjust_insns(ctx->x + 7); + ret_user = 3; + break; + case 2: + adjust_insns(ctx->x + 9); + ret_user = 4; + break; + case 3: + adjust_insns(ctx->x + 11); + ret_user = 5; + break; + case 4: + adjust_insns(ctx->x + 17); + ret_user = 7; + break; + default: + adjust_insns(ctx->x + 177); + ret_user = 19; + break; + } + + switch (ctx->x + !!ret_user) { + case 1: + adjust_insns(ctx->x + 7); + ret_user = 103; + break; + case 2: + adjust_insns(ctx->x + 
9); + ret_user = 104; + break; + case 3: + adjust_insns(ctx->x + 11); + ret_user = 107; + break; + case 4: + adjust_insns(ctx->x + 11); + ret_user = 205; + break; + case 5: + adjust_insns(ctx->x + 11); + ret_user = 115; + break; + default: + adjust_insns(ctx->x + 177); + ret_user = 1019; + break; + } + + return 0; +} + +SEC("syscall") +int big_jump_table(struct simple_ctx *ctx __attribute__((unused))) +{ + const void *const jt[256] = { + [0 ... 255] = &&default_label, + [0] = &&l0, + [11] = &&l11, + [27] = &&l27, + [31] = &&l31, + }; + + goto *jt[ctx->x & 0xff]; + +l0: + adjust_insns(ctx->x + 1); + ret_user = 2; + return 0; + +l11: + adjust_insns(ctx->x + 7); + ret_user = 3; + return 0; + +l27: + adjust_insns(ctx->x + 9); + ret_user = 4; + return 0; + +l31: + adjust_insns(ctx->x + 11); + ret_user = 5; + return 0; + +default_label: + adjust_insns(ctx->x + 177); + ret_user = 19; + return 0; +} + +SEC("syscall") +int one_jump_two_maps(struct simple_ctx *ctx __attribute__((unused))) +{ + __label__ l1, l2, l3, l4; + void *jt1[2] = { &&l1, &&l2 }; + void *jt2[2] = { &&l3, &&l4 }; + unsigned int a = ctx->x % 2; + unsigned int b = (ctx->x / 2) % 2; + volatile int ret = 0; + + if (!(a < 2 && b < 2)) + return 19; + + if (ctx->x % 2) + goto *jt1[a]; + else + goto *jt2[b]; + + l1: ret += 1; + l2: ret += 3; + l3: ret += 5; + l4: ret += 7; + + ret_user = ret; + return ret; +} + +SEC("syscall") +int one_map_two_jumps(struct simple_ctx *ctx __attribute__((unused))) +{ + __label__ l1, l2, l3; + void *jt[3] = { &&l1, &&l2, &&l3 }; + unsigned int a = (ctx->x >> 2) & 1; + unsigned int b = (ctx->x >> 3) & 1; + volatile int ret = 0; + + if (ctx->x % 2) + goto *jt[a]; + + if (ctx->x % 3) + goto *jt[a + b]; + + l1: ret += 3; + l2: ret += 5; + l3: ret += 7; + + ret_user = ret; + return ret; +} + +/* Just to introduce some non-zero offsets in .text */ +static __noinline int f0(volatile struct simple_ctx *ctx __arg_ctx) +{ + if (ctx) + return 1; + else + return 13; +} + +SEC("syscall") int f1(struct simple_ctx *ctx) +{ + ret_user = 0; + return f0(ctx); +} + +static __noinline int __static_global(__u64 x) +{ + switch (x) { + case 0: + adjust_insns(x + 1); + ret_user = 2; + break; + case 1: + adjust_insns(x + 7); + ret_user = 3; + break; + case 2: + adjust_insns(x + 9); + ret_user = 4; + break; + case 3: + adjust_insns(x + 11); + ret_user = 5; + break; + case 4: + adjust_insns(x + 17); + ret_user = 7; + break; + default: + adjust_insns(x + 177); + ret_user = 19; + break; + } + + return 0; +} + +SEC("syscall") +int use_static_global1(struct simple_ctx *ctx) +{ + ret_user = 0; + return __static_global(ctx->x); +} + +SEC("syscall") +int use_static_global2(struct simple_ctx *ctx) +{ + ret_user = 0; + adjust_insns(ctx->x + 1); + return __static_global(ctx->x); +} + +SEC("fentry/" SYS_PREFIX "sys_nanosleep") +int use_static_global_other_sec(void *ctx) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + return __static_global(in_user); +} + +__noinline int __nonstatic_global(__u64 x) +{ + switch (x) { + case 0: + adjust_insns(x + 1); + ret_user = 2; + break; + case 1: + adjust_insns(x + 7); + ret_user = 3; + break; + case 2: + adjust_insns(x + 9); + ret_user = 4; + break; + case 3: + adjust_insns(x + 11); + ret_user = 5; + break; + case 4: + adjust_insns(x + 17); + ret_user = 7; + break; + default: + adjust_insns(x + 177); + ret_user = 19; + break; + } + + return 0; +} + +SEC("syscall") +int use_nonstatic_global1(struct simple_ctx *ctx) +{ + ret_user = 0; + return __nonstatic_global(ctx->x); +} + 
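/*
 * Editor's note (hedged sketch, not part of this object file): the
 * SEC("syscall") programs here are meant to be driven through
 * BPF_PROG_TEST_RUN, passing struct simple_ctx via ctx_in and reading
 * the result back from ret_user; userspace first checks the skip flag
 * exposed above. Skeleton and field names are illustrative assumptions:
 *
 *	struct simple_ctx uctx = { .x = 3 };
 *	LIBBPF_OPTS(bpf_test_run_opts, topts,
 *		    .ctx_in = &uctx, .ctx_size_in = sizeof(uctx));
 *
 *	if (!skel->data->skip &&
 *	    !bpf_prog_test_run_opts(bpf_program__fd(skel->progs.one_switch),
 *				    &topts))
 *		assert(skel->bss->ret_user == 5);   // x == 3 hits case 3
 */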
+SEC("syscall") +int use_nonstatic_global2(struct simple_ctx *ctx) +{ + ret_user = 0; + adjust_insns(ctx->x + 1); + return __nonstatic_global(ctx->x); +} + +SEC("fentry/" SYS_PREFIX "sys_nanosleep") +int use_nonstatic_global_other_sec(void *ctx) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + return __nonstatic_global(in_user); +} + +#else /* __BPF_FEATURE_GOTOX */ + +#define SKIP_TEST(TEST_NAME) \ + SEC("syscall") int TEST_NAME(void *ctx) \ + { \ + return 0; \ + } + +SKIP_TEST(one_switch); +SKIP_TEST(one_switch_non_zero_sec_off); +SKIP_TEST(simple_test_other_sec); +SKIP_TEST(two_switches); +SKIP_TEST(big_jump_table); +SKIP_TEST(one_jump_two_maps); +SKIP_TEST(one_map_two_jumps); +SKIP_TEST(use_static_global1); +SKIP_TEST(use_static_global2); +SKIP_TEST(use_static_global_other_sec); +SKIP_TEST(use_nonstatic_global1); +SKIP_TEST(use_nonstatic_global2); +SKIP_TEST(use_nonstatic_global_other_sec); + +#endif /* __BPF_FEATURE_GOTOX */ + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_setsockopt.c b/tools/testing/selftests/bpf/progs/bpf_iter_setsockopt.c index 774d4dbe818935..a8aa5a71d846f6 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_setsockopt.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_setsockopt.c @@ -18,23 +18,10 @@ unsigned short reuse_listen_hport = 0; unsigned short listen_hport = 0; -char cubic_cc[TCP_CA_NAME_MAX] = "bpf_cubic"; +const char cubic_cc[] = "bpf_cubic"; char dctcp_cc[TCP_CA_NAME_MAX] = "bpf_dctcp"; bool random_retry = false; -static bool tcp_cc_eq(const char *a, const char *b) -{ - int i; - - for (i = 0; i < TCP_CA_NAME_MAX; i++) { - if (a[i] != b[i]) - return false; - if (!a[i]) - break; - } - - return true; -} SEC("iter/tcp") int change_tcp_cc(struct bpf_iter__tcp *ctx) @@ -58,7 +45,7 @@ int change_tcp_cc(struct bpf_iter__tcp *ctx) cur_cc, sizeof(cur_cc))) return 0; - if (!tcp_cc_eq(cur_cc, cubic_cc)) + if (bpf_strncmp(cur_cc, TCP_CA_NAME_MAX, cubic_cc)) return 0; if (random_retry && bpf_get_prandom_u32() % 4 == 1) diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h index a7a1a684eed116..c9bfbe1bafc127 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -126,6 +126,9 @@ * Several __arch_* annotations could be specified at once. * When test case is not run on current arch it is marked as skipped. * __caps_unpriv Specify the capabilities that should be set when running the test. + * + * __linear_size Specify the size of the linear area of non-linear skbs, or + * 0 for linear skbs. 
*/ #define __msg(msg) __attribute__((btf_decl_tag("comment:test_expect_msg=" XSTR(__COUNTER__) "=" msg))) #define __not_msg(msg) __attribute__((btf_decl_tag("comment:test_expect_not_msg=" XSTR(__COUNTER__) "=" msg))) @@ -159,6 +162,7 @@ #define __stderr_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_stderr_unpriv=" XSTR(__COUNTER__) "=" msg))) #define __stdout(msg) __attribute__((btf_decl_tag("comment:test_expect_stdout=" XSTR(__COUNTER__) "=" msg))) #define __stdout_unpriv(msg) __attribute__((btf_decl_tag("comment:test_expect_stdout_unpriv=" XSTR(__COUNTER__) "=" msg))) +#define __linear_size(sz) __attribute__((btf_decl_tag("comment:test_linear_size=" XSTR(sz)))) /* Define common capabilities tested using __caps_unpriv */ #define CAP_NET_ADMIN 12 diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h index 17db400f0e0d97..d8dacef37c1639 100644 --- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h +++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h @@ -146,6 +146,20 @@ #define tcp_jiffies32 ((__u32)bpf_jiffies64()) +#ifndef min +#define min(a, b) ((a) < (b) ? (a) : (b)) +#endif +#ifndef max +#define max(a, b) ((a) > (b) ? (a) : (b)) +#endif + +static inline bool before(__u32 seq1, __u32 seq2) +{ + return (__s32)(seq1 - seq2) < 0; +} + +#define after(seq2, seq1) before(seq1, seq2) + static inline struct inet_connection_sock *inet_csk(const struct sock *sk) { return (struct inet_connection_sock *)sk; diff --git a/tools/testing/selftests/bpf/progs/connect4_prog.c b/tools/testing/selftests/bpf/progs/connect4_prog.c index 9e9ebf27b8784e..9d158cfad98174 100644 --- a/tools/testing/selftests/bpf/progs/connect4_prog.c +++ b/tools/testing/selftests/bpf/progs/connect4_prog.c @@ -34,6 +34,9 @@ #define SOL_TCP 6 #endif +const char reno[] = "reno"; +const char cubic[] = "cubic"; + __attribute__ ((noinline)) __weak int do_bind(struct bpf_sock_addr *ctx) { @@ -50,35 +53,27 @@ int do_bind(struct bpf_sock_addr *ctx) } static __inline int verify_cc(struct bpf_sock_addr *ctx, - char expected[TCP_CA_NAME_MAX]) + const char expected[]) { char buf[TCP_CA_NAME_MAX]; - int i; if (bpf_getsockopt(ctx, SOL_TCP, TCP_CONGESTION, &buf, sizeof(buf))) return 1; - for (i = 0; i < TCP_CA_NAME_MAX; i++) { - if (buf[i] != expected[i]) - return 1; - if (buf[i] == 0) - break; - } + if (bpf_strncmp(buf, TCP_CA_NAME_MAX, expected)) + return 1; return 0; } static __inline int set_cc(struct bpf_sock_addr *ctx) { - char reno[TCP_CA_NAME_MAX] = "reno"; - char cubic[TCP_CA_NAME_MAX] = "cubic"; - - if (bpf_setsockopt(ctx, SOL_TCP, TCP_CONGESTION, &reno, sizeof(reno))) + if (bpf_setsockopt(ctx, SOL_TCP, TCP_CONGESTION, (void *)reno, sizeof(reno))) return 1; if (verify_cc(ctx, reno)) return 1; - if (bpf_setsockopt(ctx, SOL_TCP, TCP_CONGESTION, &cubic, sizeof(cubic))) + if (bpf_setsockopt(ctx, SOL_TCP, TCP_CONGESTION, (void *)cubic, sizeof(cubic))) return 1; if (verify_cc(ctx, cubic)) return 1; diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c index 127dea342e5a67..e0d672d93adf85 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_success.c +++ b/tools/testing/selftests/bpf/progs/dynptr_success.c @@ -914,8 +914,8 @@ void *user_ptr; char expected_str[384]; __u32 test_len[7] = {0/* placeholder */, 0, 1, 2, 255, 256, 257}; -typedef int (*bpf_read_dynptr_fn_t)(struct bpf_dynptr *dptr, u32 off, - u32 size, const void *unsafe_ptr); +typedef int (*bpf_read_dynptr_fn_t)(struct bpf_dynptr 
*dptr, u64 off, + u64 size, const void *unsafe_ptr); /* Returns the offset just before the end of the maximum sized xdp fragment. * Any write larger than 32 bytes will be split between 2 fragments. @@ -1106,16 +1106,16 @@ int test_copy_from_user_str_dynptr(void *ctx) return 0; } -static int bpf_copy_data_from_user_task(struct bpf_dynptr *dptr, u32 off, - u32 size, const void *unsafe_ptr) +static int bpf_copy_data_from_user_task(struct bpf_dynptr *dptr, u64 off, + u64 size, const void *unsafe_ptr) { struct task_struct *task = bpf_get_current_task_btf(); return bpf_copy_from_user_task_dynptr(dptr, off, size, unsafe_ptr, task); } -static int bpf_copy_data_from_user_task_str(struct bpf_dynptr *dptr, u32 off, - u32 size, const void *unsafe_ptr) +static int bpf_copy_data_from_user_task_str(struct bpf_dynptr *dptr, u64 off, + u64 size, const void *unsafe_ptr) { struct task_struct *task = bpf_get_current_task_btf(); diff --git a/tools/testing/selftests/bpf/progs/file_reader.c b/tools/testing/selftests/bpf/progs/file_reader.c new file mode 100644 index 00000000000000..4d756b6235579e --- /dev/null +++ b/tools/testing/selftests/bpf/progs/file_reader.c @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#include <vmlinux.h> +#include <string.h> +#include <stdbool.h> +#include <bpf/bpf_tracing.h> +#include "bpf_misc.h" +#include "errno.h" + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct elem); +} arrmap SEC(".maps"); + +struct elem { + struct file *file; + struct bpf_task_work tw; +}; + +char user_buf[256000]; +char tmp_buf[256000]; + +int pid = 0; +int err, run_success = 0; + +static int validate_file_read(struct file *file); +static int task_work_callback(struct bpf_map *map, void *key, void *value); + +SEC("lsm/file_open") +int on_open_expect_fault(void *c) +{ + struct bpf_dynptr dynptr; + struct file *file; + int local_err = 1; + __u32 user_buf_sz = sizeof(user_buf); + + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + file = bpf_get_task_exe_file(bpf_get_current_task_btf()); + if (!file) + return 0; + + if (bpf_dynptr_from_file(file, 0, &dynptr)) + goto out; + + local_err = bpf_dynptr_read(tmp_buf, user_buf_sz, &dynptr, user_buf_sz, 0); + if (local_err == -EFAULT) { /* Expect page fault */ + local_err = 0; + run_success = 1; + } +out: + bpf_dynptr_file_discard(&dynptr); + if (local_err) + err = local_err; + bpf_put_file(file); + return 0; +} + +SEC("lsm/file_open") +int on_open_validate_file_read(void *c) +{ + struct task_struct *task = bpf_get_current_task_btf(); + struct elem *work; + int key = 0; + + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + work = bpf_map_lookup_elem(&arrmap, &key); + if (!work) { + err = 1; + return 0; + } + bpf_task_work_schedule_signal_impl(task, &work->tw, &arrmap, task_work_callback, NULL); + return 0; +} + +/* Called in a sleepable context, read 256K bytes, cross check with user space read data */ +static int task_work_callback(struct bpf_map *map, void *key, void *value) +{ + struct task_struct *task = bpf_get_current_task_btf(); + struct file *file = bpf_get_task_exe_file(task); + + if (!file) + return 0; + + err = validate_file_read(file); + if (!err) + run_success = 1; + bpf_put_file(file); + return 0; +} + +static int verify_dynptr_read(struct bpf_dynptr *ptr, u32 off, char *user_buf, u32 len) +{ + int i; + + if (bpf_dynptr_read(tmp_buf, len, ptr, off, 0)) + 
return 1; + + /* Verify file contents read from BPF is the same as the one read from userspace */ + bpf_for(i, 0, len) + { + if (tmp_buf[i] != user_buf[i]) + return 1; + } + return 0; +} + +static int validate_file_read(struct file *file) +{ + struct bpf_dynptr dynptr; + int loc_err = 1, off; + __u32 user_buf_sz = sizeof(user_buf); + + if (bpf_dynptr_from_file(file, 0, &dynptr)) + goto cleanup; + + loc_err = verify_dynptr_read(&dynptr, 0, user_buf, user_buf_sz); + off = 1; + loc_err = loc_err ?: verify_dynptr_read(&dynptr, off, user_buf + off, user_buf_sz - off); + off = user_buf_sz - 1; + loc_err = loc_err ?: verify_dynptr_read(&dynptr, off, user_buf + off, user_buf_sz - off); + /* Read file with random offset and length */ + off = 4097; + loc_err = loc_err ?: verify_dynptr_read(&dynptr, off, user_buf + off, 100); + + /* Adjust dynptr, verify read */ + loc_err = loc_err ?: bpf_dynptr_adjust(&dynptr, off, off + 1); + loc_err = loc_err ?: verify_dynptr_read(&dynptr, 0, user_buf + off, 1); + /* Can't read more than 1 byte */ + loc_err = loc_err ?: verify_dynptr_read(&dynptr, 0, user_buf + off, 2) == 0; + /* Can't read with far offset */ + loc_err = loc_err ?: verify_dynptr_read(&dynptr, 1, user_buf + off, 1) == 0; + +cleanup: + bpf_dynptr_file_discard(&dynptr); + return loc_err; +} diff --git a/tools/testing/selftests/bpf/progs/file_reader_fail.c b/tools/testing/selftests/bpf/progs/file_reader_fail.c new file mode 100644 index 00000000000000..32fe28ed243921 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/file_reader_fail.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#include <vmlinux.h> +#include <string.h> +#include <stdbool.h> +#include <bpf/bpf_tracing.h> +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +int err; +void *user_ptr; + +SEC("lsm/file_open") +__failure +__msg("Unreleased reference id=") +int on_nanosleep_unreleased_ref(void *ctx) +{ + struct task_struct *task = bpf_get_current_task_btf(); + struct file *file = bpf_get_task_exe_file(task); + struct bpf_dynptr dynptr; + + if (!file) + return 0; + + err = bpf_dynptr_from_file(file, 0, &dynptr); + return err ? 
1 : 0; +} + +SEC("xdp") +__failure +__msg("Expected a dynptr of type file as arg #0") +int xdp_wrong_dynptr_type(struct xdp_md *xdp) +{ + struct bpf_dynptr dynptr; + + bpf_dynptr_from_xdp(xdp, 0, &dynptr); + bpf_dynptr_file_discard(&dynptr); + return 0; +} + +SEC("xdp") +__failure +__msg("Expected an initialized dynptr as arg #0") +int xdp_no_dynptr_type(struct xdp_md *xdp) +{ + struct bpf_dynptr dynptr; + + bpf_dynptr_file_discard(&dynptr); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/htab_update.c b/tools/testing/selftests/bpf/progs/htab_update.c index 7481bb30b29b3f..195d3b2fba00c5 100644 --- a/tools/testing/selftests/bpf/progs/htab_update.c +++ b/tools/testing/selftests/bpf/progs/htab_update.c @@ -6,24 +6,31 @@ char _license[] SEC("license") = "GPL"; +/* Map value type: has BTF-managed field (bpf_timer) */ +struct val { + struct bpf_timer t; + __u64 payload; +}; + struct { __uint(type, BPF_MAP_TYPE_HASH); __uint(max_entries, 1); - __uint(key_size, sizeof(__u32)); - __uint(value_size, sizeof(__u32)); + __type(key, __u32); + __type(value, struct val); } htab SEC(".maps"); int pid = 0; int update_err = 0; -SEC("?fentry/lookup_elem_raw") -int lookup_elem_raw(void *ctx) +SEC("?fentry/bpf_obj_free_fields") +int bpf_obj_free_fields(void *ctx) { - __u32 key = 0, value = 1; + __u32 key = 0; + struct val value = { .payload = 1 }; if ((bpf_get_current_pid_tgid() >> 32) != pid) return 0; - update_err = bpf_map_update_elem(&htab, &key, &value, 0); + update_err = bpf_map_update_elem(&htab, &key, &value, BPF_ANY); return 0; } diff --git a/tools/testing/selftests/bpf/progs/ip_check_defrag.c b/tools/testing/selftests/bpf/progs/ip_check_defrag.c index 645b2c9f7867c3..0e87ad1ebcfa5b 100644 --- a/tools/testing/selftests/bpf/progs/ip_check_defrag.c +++ b/tools/testing/selftests/bpf/progs/ip_check_defrag.c @@ -12,11 +12,6 @@ #define IP_OFFSET 0x1FFF #define NEXTHDR_FRAGMENT 44 -extern int bpf_dynptr_from_skb(struct __sk_buff *skb, __u64 flags, - struct bpf_dynptr *ptr__uninit) __ksym; -extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, uint32_t offset, - void *buffer, uint32_t buffer__sz) __ksym; - volatile int shootdowns = 0; static bool is_frag_v4(struct iphdr *iph) diff --git a/tools/testing/selftests/bpf/progs/lsm.c b/tools/testing/selftests/bpf/progs/lsm.c index 0c13b7409947ef..7de173daf27b6d 100644 --- a/tools/testing/selftests/bpf/progs/lsm.c +++ b/tools/testing/selftests/bpf/progs/lsm.c @@ -89,14 +89,16 @@ SEC("lsm/file_mprotect") int BPF_PROG(test_int_hook, struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot, int ret) { - if (ret != 0) + struct mm_struct *mm = vma->vm_mm; + + if (ret != 0 || !mm) return ret; __s32 pid = bpf_get_current_pid_tgid() >> 32; int is_stack = 0; - is_stack = (vma->vm_start <= vma->vm_mm->start_stack && - vma->vm_end >= vma->vm_mm->start_stack); + is_stack = (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack); if (is_stack && monitored_pid == pid) { mprotect_count++; diff --git a/tools/testing/selftests/bpf/progs/lsm_tailcall.c b/tools/testing/selftests/bpf/progs/lsm_tailcall.c index 49c075ce2d4c36..6e7e58051e640a 100644 --- a/tools/testing/selftests/bpf/progs/lsm_tailcall.c +++ b/tools/testing/selftests/bpf/progs/lsm_tailcall.c @@ -20,14 +20,14 @@ int lsm_file_permission_prog(void *ctx) return 0; } -SEC("lsm/file_alloc_security") -int lsm_file_alloc_security_prog(void *ctx) +SEC("lsm/kernfs_init_security") +int lsm_kernfs_init_security_prog(void *ctx) { return 0; } -SEC("lsm/file_alloc_security") -int 
lsm_file_alloc_security_entry(void *ctx) +SEC("lsm/kernfs_init_security") +int lsm_kernfs_init_security_entry(void *ctx) { bpf_tail_call_static(ctx, &jmp_table, 0); return 0; diff --git a/tools/testing/selftests/bpf/progs/rcu_read_lock.c b/tools/testing/selftests/bpf/progs/rcu_read_lock.c index 3a868a19934926..d70c28824bbe33 100644 --- a/tools/testing/selftests/bpf/progs/rcu_read_lock.c +++ b/tools/testing/selftests/bpf/progs/rcu_read_lock.c @@ -278,6 +278,46 @@ out: return 0; } +SEC("?fentry.s/" SYS_PREFIX "sys_nanosleep") +int nested_rcu_region_unbalanced_1(void *ctx) +{ + struct task_struct *task, *real_parent; + + /* nested rcu read lock regions */ + task = bpf_get_current_task_btf(); + bpf_rcu_read_lock(); + bpf_rcu_read_lock(); + real_parent = task->real_parent; + if (!real_parent) + goto out; + (void)bpf_task_storage_get(&map_a, real_parent, 0, 0); +out: + bpf_rcu_read_unlock(); + bpf_rcu_read_unlock(); + bpf_rcu_read_unlock(); + return 0; +} + +SEC("?fentry.s/" SYS_PREFIX "sys_nanosleep") +int nested_rcu_region_unbalanced_2(void *ctx) +{ + struct task_struct *task, *real_parent; + + /* nested rcu read lock regions */ + task = bpf_get_current_task_btf(); + bpf_rcu_read_lock(); + bpf_rcu_read_lock(); + bpf_rcu_read_lock(); + real_parent = task->real_parent; + if (!real_parent) + goto out; + (void)bpf_task_storage_get(&map_a, real_parent, 0, 0); +out: + bpf_rcu_read_unlock(); + bpf_rcu_read_unlock(); + return 0; +} + SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") int task_trusted_non_rcuptr(void *ctx) { diff --git a/tools/testing/selftests/bpf/progs/refcounted_kptr.c b/tools/testing/selftests/bpf/progs/refcounted_kptr.c index 893a4fdb4b6e98..1aca85d86aebca 100644 --- a/tools/testing/selftests/bpf/progs/refcounted_kptr.c +++ b/tools/testing/selftests/bpf/progs/refcounted_kptr.c @@ -568,4 +568,64 @@ err_out: return 0; } +private(kptr_ref) u64 ref; + +static int probe_read_refcount(void) +{ + u32 refcount; + + bpf_probe_read_kernel(&refcount, sizeof(refcount), (void *) ref); + return refcount; +} + +static int __insert_in_list(struct bpf_list_head *head, struct bpf_spin_lock *lock, + struct node_data __kptr **node) +{ + struct node_data *node_new, *node_ref, *node_old; + + node_new = bpf_obj_new(typeof(*node_new)); + if (!node_new) + return -1; + + node_ref = bpf_refcount_acquire(node_new); + node_old = bpf_kptr_xchg(node, node_new); + if (node_old) { + bpf_obj_drop(node_old); + bpf_obj_drop(node_ref); + return -2; + } + + bpf_spin_lock(lock); + bpf_list_push_front(head, &node_ref->l); + ref = (u64)(void *) &node_ref->ref; + bpf_spin_unlock(lock); + return probe_read_refcount(); +} + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __type(key, int); + __type(value, struct map_value); + __uint(max_entries, 1); +} percpu_hash SEC(".maps"); + +SEC("tc") +int percpu_hash_refcount_leak(void *ctx) +{ + struct map_value *v; + int key = 0; + + v = bpf_map_lookup_elem(&percpu_hash, &key); + if (!v) + return 0; + + return __insert_in_list(&head, &lock, &v->node); +} + +SEC("tc") +int check_percpu_hash_refcount(void *ctx) +{ + return probe_read_refcount(); +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/ringbuf_bench.c b/tools/testing/selftests/bpf/progs/ringbuf_bench.c index 6a468496f53911..d96c7d1e8fc28d 100644 --- a/tools/testing/selftests/bpf/progs/ringbuf_bench.c +++ b/tools/testing/selftests/bpf/progs/ringbuf_bench.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2020 Facebook +#include <stdbool.h> #include <linux/bpf.h> 
#include <stdint.h> #include <bpf/bpf_helpers.h> @@ -14,9 +15,11 @@ struct { const volatile int batch_cnt = 0; const volatile long use_output = 0; +const volatile bool bench_producer = false; long sample_val = 42; long dropped __attribute__((aligned(128))) = 0; +long hits __attribute__((aligned(128))) = 0; const volatile long wakeup_data_size = 0; @@ -24,6 +27,9 @@ static __always_inline long get_flags() { long sz; + if (bench_producer) + return BPF_RB_NO_WAKEUP; + if (!wakeup_data_size) return 0; @@ -47,6 +53,8 @@ int bench_ringbuf(void *ctx) *sample = sample_val; flags = get_flags(); bpf_ringbuf_submit(sample, flags); + if (bench_producer) + __sync_add_and_fetch(&hits, 1); } } } else { @@ -55,6 +63,9 @@ int bench_ringbuf(void *ctx) if (bpf_ringbuf_output(&ringbuf, &sample_val, sizeof(sample_val), flags)) __sync_add_and_fetch(&dropped, 1); + else if (bench_producer) + __sync_add_and_fetch(&hits, 1); + } } return 0; diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c b/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c index 99d72c68f76af8..826e6b6aff7ecd 100644 --- a/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c +++ b/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c @@ -45,8 +45,12 @@ SEC("syscall") __retval(USER_PTR_ERR)int test_strcspn_null1(void *ctx) { return SEC("syscall") __retval(USER_PTR_ERR)int test_strcspn_null2(void *ctx) { return bpf_strcspn("hello", NULL); } SEC("syscall") __retval(USER_PTR_ERR)int test_strstr_null1(void *ctx) { return bpf_strstr(NULL, "hello"); } SEC("syscall") __retval(USER_PTR_ERR)int test_strstr_null2(void *ctx) { return bpf_strstr("hello", NULL); } +SEC("syscall") __retval(USER_PTR_ERR)int test_strcasestr_null1(void *ctx) { return bpf_strcasestr(NULL, "hello"); } +SEC("syscall") __retval(USER_PTR_ERR)int test_strcasestr_null2(void *ctx) { return bpf_strcasestr("hello", NULL); } SEC("syscall") __retval(USER_PTR_ERR)int test_strnstr_null1(void *ctx) { return bpf_strnstr(NULL, "hello", 1); } SEC("syscall") __retval(USER_PTR_ERR)int test_strnstr_null2(void *ctx) { return bpf_strnstr("hello", NULL, 1); } +SEC("syscall") __retval(USER_PTR_ERR)int test_strncasestr_null1(void *ctx) { return bpf_strncasestr(NULL, "hello", 1); } +SEC("syscall") __retval(USER_PTR_ERR)int test_strncasestr_null2(void *ctx) { return bpf_strncasestr("hello", NULL, 1); } /* Passing userspace ptr to string kfuncs */ SEC("syscall") __retval(USER_PTR_ERR) int test_strcmp_user_ptr1(void *ctx) { return bpf_strcmp(user_ptr, "hello"); } @@ -65,8 +69,12 @@ SEC("syscall") __retval(USER_PTR_ERR) int test_strcspn_user_ptr1(void *ctx) { re SEC("syscall") __retval(USER_PTR_ERR) int test_strcspn_user_ptr2(void *ctx) { return bpf_strcspn("hello", user_ptr); } SEC("syscall") __retval(USER_PTR_ERR) int test_strstr_user_ptr1(void *ctx) { return bpf_strstr(user_ptr, "hello"); } SEC("syscall") __retval(USER_PTR_ERR) int test_strstr_user_ptr2(void *ctx) { return bpf_strstr("hello", user_ptr); } +SEC("syscall") __retval(USER_PTR_ERR) int test_strcasestr_user_ptr1(void *ctx) { return bpf_strcasestr(user_ptr, "hello"); } +SEC("syscall") __retval(USER_PTR_ERR) int test_strcasestr_user_ptr2(void *ctx) { return bpf_strcasestr("hello", user_ptr); } SEC("syscall") __retval(USER_PTR_ERR) int test_strnstr_user_ptr1(void *ctx) { return bpf_strnstr(user_ptr, "hello", 1); } SEC("syscall") __retval(USER_PTR_ERR) int test_strnstr_user_ptr2(void *ctx) { return bpf_strnstr("hello", user_ptr, 1); } +SEC("syscall") __retval(USER_PTR_ERR) int test_strncasestr_user_ptr1(void 
*ctx) { return bpf_strncasestr(user_ptr, "hello", 1); } +SEC("syscall") __retval(USER_PTR_ERR) int test_strncasestr_user_ptr2(void *ctx) { return bpf_strncasestr("hello", user_ptr, 1); } #endif /* __TARGET_ARCH_s390 */ @@ -87,7 +95,11 @@ SEC("syscall") __retval(-EFAULT) int test_strcspn_pagefault1(void *ctx) { return SEC("syscall") __retval(-EFAULT) int test_strcspn_pagefault2(void *ctx) { return bpf_strcspn("hello", invalid_kern_ptr); } SEC("syscall") __retval(-EFAULT) int test_strstr_pagefault1(void *ctx) { return bpf_strstr(invalid_kern_ptr, "hello"); } SEC("syscall") __retval(-EFAULT) int test_strstr_pagefault2(void *ctx) { return bpf_strstr("hello", invalid_kern_ptr); } +SEC("syscall") __retval(-EFAULT) int test_strcasestr_pagefault1(void *ctx) { return bpf_strcasestr(invalid_kern_ptr, "hello"); } +SEC("syscall") __retval(-EFAULT) int test_strcasestr_pagefault2(void *ctx) { return bpf_strcasestr("hello", invalid_kern_ptr); } SEC("syscall") __retval(-EFAULT) int test_strnstr_pagefault1(void *ctx) { return bpf_strnstr(invalid_kern_ptr, "hello", 1); } SEC("syscall") __retval(-EFAULT) int test_strnstr_pagefault2(void *ctx) { return bpf_strnstr("hello", invalid_kern_ptr, 1); } +SEC("syscall") __retval(-EFAULT) int test_strncasestr_pagefault1(void *ctx) { return bpf_strncasestr(invalid_kern_ptr, "hello", 1); } +SEC("syscall") __retval(-EFAULT) int test_strncasestr_pagefault2(void *ctx) { return bpf_strncasestr("hello", invalid_kern_ptr, 1); } char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c b/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c index e41cc560199430..05e1da1f250f39 100644 --- a/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c +++ b/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c @@ -19,6 +19,8 @@ SEC("syscall") int test_strspn_accept_too_long(void *ctx) { return bpf_strspn("b SEC("syscall") int test_strcspn_str_too_long(void *ctx) { return bpf_strcspn(long_str, "b"); } SEC("syscall") int test_strcspn_reject_too_long(void *ctx) { return bpf_strcspn("b", long_str); } SEC("syscall") int test_strstr_too_long(void *ctx) { return bpf_strstr(long_str, "hello"); } +SEC("syscall") int test_strcasestr_too_long(void *ctx) { return bpf_strcasestr(long_str, "hello"); } SEC("syscall") int test_strnstr_too_long(void *ctx) { return bpf_strnstr(long_str, "hello", sizeof(long_str)); } +SEC("syscall") int test_strncasestr_too_long(void *ctx) { return bpf_strncasestr(long_str, "hello", sizeof(long_str)); } char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_success.c b/tools/testing/selftests/bpf/progs/string_kfuncs_success.c index 2e3498e37b9ce1..a8513964516be4 100644 --- a/tools/testing/selftests/bpf/progs/string_kfuncs_success.c +++ b/tools/testing/selftests/bpf/progs/string_kfuncs_success.c @@ -33,8 +33,11 @@ __test(11) int test_strnlen(void *ctx) { return bpf_strnlen(str, 12); } __test(5) int test_strspn(void *ctx) { return bpf_strspn(str, "ehlo"); } __test(2) int test_strcspn(void *ctx) { return bpf_strcspn(str, "lo"); } __test(6) int test_strstr_found(void *ctx) { return bpf_strstr(str, "world"); } +__test(6) int test_strcasestr_found(void *ctx) { return bpf_strcasestr(str, "woRLD"); } __test(-ENOENT) int test_strstr_notfound(void *ctx) { return bpf_strstr(str, "hi"); } +__test(-ENOENT) int test_strcasestr_notfound(void *ctx) { return bpf_strcasestr(str, "hi"); } __test(0) int test_strstr_empty(void *ctx) { return bpf_strstr(str, ""); } +__test(0) int 
test_strcasestr_empty(void *ctx) { return bpf_strcasestr(str, ""); } __test(0) int test_strnstr_found1(void *ctx) { return bpf_strnstr("", "", 0); } __test(0) int test_strnstr_found2(void *ctx) { return bpf_strnstr(str, "hello", 5); } __test(0) int test_strnstr_found3(void *ctx) { return bpf_strnstr(str, "hello", 6); } @@ -42,5 +45,12 @@ __test(-ENOENT) int test_strnstr_notfound1(void *ctx) { return bpf_strnstr(str, __test(-ENOENT) int test_strnstr_notfound2(void *ctx) { return bpf_strnstr(str, "hello", 4); } __test(-ENOENT) int test_strnstr_notfound3(void *ctx) { return bpf_strnstr("", "a", 0); } __test(0) int test_strnstr_empty(void *ctx) { return bpf_strnstr(str, "", 1); } +__test(0) int test_strncasestr_found1(void *ctx) { return bpf_strncasestr("", "", 0); } +__test(0) int test_strncasestr_found2(void *ctx) { return bpf_strncasestr(str, "heLLO", 5); } +__test(0) int test_strncasestr_found3(void *ctx) { return bpf_strncasestr(str, "heLLO", 6); } +__test(-ENOENT) int test_strncasestr_notfound1(void *ctx) { return bpf_strncasestr(str, "hi", 10); } +__test(-ENOENT) int test_strncasestr_notfound2(void *ctx) { return bpf_strncasestr(str, "hello", 4); } +__test(-ENOENT) int test_strncasestr_notfound3(void *ctx) { return bpf_strncasestr("", "a", 0); } +__test(0) int test_strncasestr_empty(void *ctx) { return bpf_strncasestr(str, "", 1); } char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/strobemeta.h b/tools/testing/selftests/bpf/progs/strobemeta.h index a5c74d31a2444c..6e1918deaf2627 100644 --- a/tools/testing/selftests/bpf/progs/strobemeta.h +++ b/tools/testing/selftests/bpf/progs/strobemeta.h @@ -330,9 +330,9 @@ static void *calc_location(struct strobe_value_loc *loc, void *tls_base) } bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv); /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */ - return tls_ptr && tls_ptr != (void *)-1 - ? tls_ptr + tls_index.offset - : NULL; + if (!tls_ptr || tls_ptr == (void *)-1) + return NULL; + return tls_ptr + tls_index.offset; } #ifdef SUBPROGS diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c b/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c index a58b5194fc897b..022291f21dfb2e 100644 --- a/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c +++ b/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c @@ -8,8 +8,6 @@ char _license[] SEC("license") = "GPL"; #define USEC_PER_SEC 1000000UL -#define min(a, b) ((a) < (b) ? (a) : (b)) - static unsigned int tcp_left_out(const struct tcp_sock *tp) { return tp->sacked_out + tp->lost_out; diff --git a/tools/testing/selftests/bpf/progs/test_check_mtu.c b/tools/testing/selftests/bpf/progs/test_check_mtu.c index 2ec1de11a3ae4a..7b6b2b342c1de1 100644 --- a/tools/testing/selftests/bpf/progs/test_check_mtu.c +++ b/tools/testing/selftests/bpf/progs/test_check_mtu.c @@ -7,6 +7,7 @@ #include <stddef.h> #include <stdint.h> +#include <errno.h> char _license[] SEC("license") = "GPL"; @@ -288,3 +289,14 @@ int tc_input_len_exceed(struct __sk_buff *ctx) global_bpf_mtu_xdp = mtu_len; return retval; } + +SEC("tc") +int tc_chk_segs_flag(struct __sk_buff *ctx) +{ + __u32 mtu_len = 0; + int err; + + err = bpf_check_mtu(ctx, GLOBAL_USER_IFINDEX, &mtu_len, 0, BPF_MTU_CHK_SEGS); + + return err == -EINVAL ? 
BPF_OK : BPF_DROP; +} diff --git a/tools/testing/selftests/bpf/progs/test_perf_branches.c b/tools/testing/selftests/bpf/progs/test_perf_branches.c index a1ccc831c882f6..05ac9410cd68c9 100644 --- a/tools/testing/selftests/bpf/progs/test_perf_branches.c +++ b/tools/testing/selftests/bpf/progs/test_perf_branches.c @@ -8,6 +8,7 @@ #include <bpf/bpf_tracing.h> int valid = 0; +int run_cnt = 0; int required_size_out = 0; int written_stack_out = 0; int written_global_out = 0; @@ -24,6 +25,8 @@ int perf_branches(void *ctx) __u64 entries[4 * 3] = {0}; int required_size, written_stack, written_global; + ++run_cnt; + /* write to stack */ written_stack = bpf_read_branch_records(ctx, entries, sizeof(entries), 0); /* ignore spurious events */ diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf_overwrite.c b/tools/testing/selftests/bpf/progs/test_ringbuf_overwrite.c new file mode 100644 index 00000000000000..ff4aa67ddacc0a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_ringbuf_overwrite.c @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2025. Huawei Technologies Co., Ltd */ + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(map_flags, BPF_F_RB_OVERWRITE); +} ringbuf SEC(".maps"); + +int pid; + +const volatile unsigned long LEN1; +const volatile unsigned long LEN2; +const volatile unsigned long LEN3; +const volatile unsigned long LEN4; +const volatile unsigned long LEN5; + +long reserve1_fail = 0; +long reserve2_fail = 0; +long reserve3_fail = 0; +long reserve4_fail = 0; +long reserve5_fail = 0; + +unsigned long avail_data = 0; +unsigned long ring_size = 0; +unsigned long cons_pos = 0; +unsigned long prod_pos = 0; +unsigned long over_pos = 0; + +SEC("fentry/" SYS_PREFIX "sys_getpgid") +int test_overwrite_ringbuf(void *ctx) +{ + char *rec1, *rec2, *rec3, *rec4, *rec5; + int cur_pid = bpf_get_current_pid_tgid() >> 32; + + if (cur_pid != pid) + return 0; + + rec1 = bpf_ringbuf_reserve(&ringbuf, LEN1, 0); + if (!rec1) { + reserve1_fail = 1; + return 0; + } + + rec2 = bpf_ringbuf_reserve(&ringbuf, LEN2, 0); + if (!rec2) { + bpf_ringbuf_discard(rec1, 0); + reserve2_fail = 1; + return 0; + } + + rec3 = bpf_ringbuf_reserve(&ringbuf, LEN3, 0); + /* expect failure */ + if (!rec3) { + reserve3_fail = 1; + } else { + bpf_ringbuf_discard(rec1, 0); + bpf_ringbuf_discard(rec2, 0); + bpf_ringbuf_discard(rec3, 0); + return 0; + } + + rec4 = bpf_ringbuf_reserve(&ringbuf, LEN4, 0); + if (!rec4) { + reserve4_fail = 1; + bpf_ringbuf_discard(rec1, 0); + bpf_ringbuf_discard(rec2, 0); + return 0; + } + + bpf_ringbuf_submit(rec1, 0); + bpf_ringbuf_submit(rec2, 0); + bpf_ringbuf_submit(rec4, 0); + + rec5 = bpf_ringbuf_reserve(&ringbuf, LEN5, 0); + if (!rec5) { + reserve5_fail = 1; + return 0; + } + + for (int i = 0; i < LEN3; i++) + rec5[i] = 0xdd; + + bpf_ringbuf_submit(rec5, 0); + + ring_size = bpf_ringbuf_query(&ringbuf, BPF_RB_RING_SIZE); + avail_data = bpf_ringbuf_query(&ringbuf, BPF_RB_AVAIL_DATA); + cons_pos = bpf_ringbuf_query(&ringbuf, BPF_RB_CONS_POS); + prod_pos = bpf_ringbuf_query(&ringbuf, BPF_RB_PROD_POS); + over_pos = bpf_ringbuf_query(&ringbuf, BPF_RB_OVERWRITE_POS); + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_tc_edt.c b/tools/testing/selftests/bpf/progs/test_tc_edt.c index 950a70b61e7468..4f6f03122d61d2 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_edt.c +++ b/tools/testing/selftests/bpf/progs/test_tc_edt.c @@ 
-14,7 +14,6 @@ #define TIME_HORIZON_NS (2000 * 1000 * 1000) #define NS_PER_SEC 1000000000 #define ECN_HORIZON_NS 5000000 -#define THROTTLE_RATE_BPS (5 * 1000 * 1000) /* flow_key => last_tstamp timestamp used */ struct { @@ -24,12 +23,13 @@ struct { __uint(max_entries, 1); } flow_map SEC(".maps"); +__uint64_t target_rate; + static inline int throttle_flow(struct __sk_buff *skb) { int key = 0; uint64_t *last_tstamp = bpf_map_lookup_elem(&flow_map, &key); - uint64_t delay_ns = ((uint64_t)skb->len) * NS_PER_SEC / - THROTTLE_RATE_BPS; + uint64_t delay_ns = ((uint64_t)skb->len) * NS_PER_SEC / target_rate; uint64_t now = bpf_ktime_get_ns(); uint64_t tstamp, next_tstamp = 0; @@ -70,7 +70,7 @@ static inline int handle_tcp(struct __sk_buff *skb, struct tcphdr *tcp) if ((void *)(tcp + 1) > data_end) return TC_ACT_SHOT; - if (tcp->dest == bpf_htons(9000)) + if (tcp->source == bpf_htons(9000)) return throttle_flow(skb); return TC_ACT_OK; @@ -99,7 +99,8 @@ static inline int handle_ipv4(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("cls_test") int tc_prog(struct __sk_buff *skb) +SEC("tc") +int tc_prog(struct __sk_buff *skb) { if (skb->protocol == bpf_htons(ETH_P_IP)) return handle_ipv4(skb); diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c index 404124a9389278..7330c61b5730c8 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c @@ -2,23 +2,11 @@ /* In-place tunneling */ -#include <stdbool.h> -#include <string.h> - -#include <linux/stddef.h> -#include <linux/bpf.h> -#include <linux/if_ether.h> -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/ipv6.h> -#include <linux/mpls.h> -#include <linux/tcp.h> -#include <linux/udp.h> -#include <linux/pkt_cls.h> -#include <linux/types.h> +#include <vmlinux.h> -#include <bpf/bpf_endian.h> #include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> +#include "bpf_tracing_net.h" #include "bpf_compiler.h" #pragma GCC diagnostic ignored "-Waddress-of-packed-member" @@ -27,6 +15,14 @@ static const int cfg_port = 8000; static const int cfg_udp_src = 20000; +#define ETH_P_MPLS_UC 0x8847 +#define ETH_P_TEB 0x6558 + +#define MPLS_LS_S_MASK 0x00000100 +#define BPF_F_ADJ_ROOM_ENCAP_L2(len) \ + (((__u64)len & BPF_ADJ_ROOM_ENCAP_L2_MASK) \ + << BPF_ADJ_ROOM_ENCAP_L2_SHIFT) + #define L2_PAD_SZ (sizeof(struct vxlanhdr) + ETH_HLEN) #define UDP_PORT 5555 @@ -36,10 +32,9 @@ static const int cfg_udp_src = 20000; #define EXTPROTO_VXLAN 0x1 -#define VXLAN_N_VID (1u << 24) -#define VXLAN_VNI_MASK bpf_htonl((VXLAN_N_VID - 1) << 8) -#define VXLAN_FLAGS 0x8 -#define VXLAN_VNI 1 +#define VXLAN_FLAGS bpf_htonl(1<<27) +#define VNI_ID 1 +#define VXLAN_VNI bpf_htonl(VNI_ID << 8) #ifndef NEXTHDR_DEST #define NEXTHDR_DEST 60 @@ -48,12 +43,6 @@ static const int cfg_udp_src = 20000; /* MPLS label 1000 with S bit (last label) set and ttl of 255. 
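 * For reference, an MPLS label stack entry is label:20 | TC:3 | S:1 | TTL:8,
 * so 1000 << 12 places the label in bits 31..12, MPLS_LS_S_MASK (0x00000100)
 * sets the bottom-of-stack bit, and 0xff fills the TTL.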
*/ static const __u32 mpls_label = __bpf_constant_htonl(1000 << 12 | MPLS_LS_S_MASK | 0xff); - -struct vxlanhdr { - __be32 vx_flags; - __be32 vx_vni; -} __attribute__((packed)); - struct gre_hdr { __be16 flags; __be16 protocol; @@ -94,8 +83,8 @@ static __always_inline void set_ipv4_csum(struct iphdr *iph) static __always_inline int __encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, __u16 l2_proto, __u16 ext_proto) { + struct iphdr iph_inner = {0}; __u16 udp_dst = UDP_PORT; - struct iphdr iph_inner; struct v4hdr h_outer; struct tcphdr tcph; int olen, l2_len; @@ -122,7 +111,6 @@ static __always_inline int __encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, return TC_ACT_OK; /* Derive the IPv4 header fields from the IPv6 header */ - memset(&iph_inner, 0, sizeof(iph_inner)); iph_inner.version = 4; iph_inner.ihl = 5; iph_inner.tot_len = bpf_htons(sizeof(iph6_inner) + @@ -210,7 +198,7 @@ static __always_inline int __encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr; vxlan_hdr->vx_flags = VXLAN_FLAGS; - vxlan_hdr->vx_vni = bpf_htonl((VXLAN_VNI & VXLAN_VNI_MASK) << 8); + vxlan_hdr->vx_vni = VXLAN_VNI; l2_hdr += sizeof(struct vxlanhdr); } @@ -340,7 +328,7 @@ static __always_inline int __encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr; vxlan_hdr->vx_flags = VXLAN_FLAGS; - vxlan_hdr->vx_vni = bpf_htonl((VXLAN_VNI & VXLAN_VNI_MASK) << 8); + vxlan_hdr->vx_vni = VXLAN_VNI; l2_hdr += sizeof(struct vxlanhdr); } @@ -372,8 +360,8 @@ static __always_inline int __encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, static int encap_ipv6_ipip6(struct __sk_buff *skb) { + struct v6hdr h_outer = {0}; struct iphdr iph_inner; - struct v6hdr h_outer; struct tcphdr tcph; struct ethhdr eth; __u64 flags; @@ -400,13 +388,12 @@ static int encap_ipv6_ipip6(struct __sk_buff *skb) return TC_ACT_SHOT; /* prepare new outer network header */ - memset(&h_outer.ip, 0, sizeof(h_outer.ip)); h_outer.ip.version = 6; h_outer.ip.hop_limit = iph_inner.ttl; - h_outer.ip.saddr.s6_addr[1] = 0xfd; - h_outer.ip.saddr.s6_addr[15] = 1; - h_outer.ip.daddr.s6_addr[1] = 0xfd; - h_outer.ip.daddr.s6_addr[15] = 2; + h_outer.ip.saddr.in6_u.u6_addr8[1] = 0xfd; + h_outer.ip.saddr.in6_u.u6_addr8[15] = 1; + h_outer.ip.daddr.in6_u.u6_addr8[1] = 0xfd; + h_outer.ip.daddr.in6_u.u6_addr8[15] = 2; h_outer.ip.payload_len = iph_inner.tot_len; h_outer.ip.nexthdr = IPPROTO_IPIP; @@ -431,7 +418,7 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, return __encap_ipv6(skb, encap_proto, l2_proto, 0); } -SEC("encap_ipip_none") +SEC("tc") int __encap_ipip_none(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) @@ -440,7 +427,7 @@ int __encap_ipip_none(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("encap_gre_none") +SEC("tc") int __encap_gre_none(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) @@ -449,7 +436,7 @@ int __encap_gre_none(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("encap_gre_mpls") +SEC("tc") int __encap_gre_mpls(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) @@ -458,7 +445,7 @@ int __encap_gre_mpls(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("encap_gre_eth") +SEC("tc") int __encap_gre_eth(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) @@ -467,7 +454,7 @@ int __encap_gre_eth(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("encap_udp_none") +SEC("tc") int __encap_udp_none(struct 
__sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) @@ -476,7 +463,7 @@ int __encap_udp_none(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("encap_udp_mpls") +SEC("tc") int __encap_udp_mpls(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) @@ -485,7 +472,7 @@ int __encap_udp_mpls(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("encap_udp_eth") +SEC("tc") int __encap_udp_eth(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) @@ -494,7 +481,7 @@ int __encap_udp_eth(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("encap_vxlan_eth") +SEC("tc") int __encap_vxlan_eth(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) @@ -505,7 +492,7 @@ int __encap_vxlan_eth(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("encap_sit_none") +SEC("tc") int __encap_sit_none(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) @@ -514,7 +501,7 @@ int __encap_sit_none(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("encap_ip6tnl_none") +SEC("tc") int __encap_ip6tnl_none(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) @@ -523,7 +510,7 @@ int __encap_ip6tnl_none(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("encap_ipip6_none") +SEC("tc") int __encap_ipip6_none(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) @@ -532,7 +519,7 @@ int __encap_ipip6_none(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("encap_ip6gre_none") +SEC("tc") int __encap_ip6gre_none(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) @@ -541,7 +528,7 @@ int __encap_ip6gre_none(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("encap_ip6gre_mpls") +SEC("tc") int __encap_ip6gre_mpls(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) @@ -550,7 +537,7 @@ int __encap_ip6gre_mpls(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("encap_ip6gre_eth") +SEC("tc") int __encap_ip6gre_eth(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) @@ -559,7 +546,7 @@ int __encap_ip6gre_eth(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("encap_ip6udp_none") +SEC("tc") int __encap_ip6udp_none(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) @@ -568,7 +555,7 @@ int __encap_ip6udp_none(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("encap_ip6udp_mpls") +SEC("tc") int __encap_ip6udp_mpls(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) @@ -577,7 +564,7 @@ int __encap_ip6udp_mpls(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("encap_ip6udp_eth") +SEC("tc") int __encap_ip6udp_eth(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) @@ -586,7 +573,7 @@ int __encap_ip6udp_eth(struct __sk_buff *skb) return TC_ACT_OK; } -SEC("encap_ip6vxlan_eth") +SEC("tc") int __encap_ip6vxlan_eth(struct __sk_buff *skb) { if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) @@ -693,7 +680,7 @@ static int decap_ipv6(struct __sk_buff *skb) iph_outer.nexthdr); } -SEC("decap") +SEC("tc") int decap_f(struct __sk_buff *skb) { switch (skb->protocol) { diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c index 3d5f30c29ae339..2898b3749d076d 100644 --- a/tools/testing/selftests/bpf/progs/trigger_bench.c +++ b/tools/testing/selftests/bpf/progs/trigger_bench.c @@ -42,12 +42,14 @@ int bench_trigger_uprobe_multi(void *ctx) const volatile int batch_iters = 0; SEC("?raw_tp") -int 
trigger_count(void *ctx) +int trigger_kernel_count(void *ctx) { int i; - for (i = 0; i < batch_iters; i++) + for (i = 0; i < batch_iters; i++) { inc_counter(); + bpf_get_numa_node_id(); + } return 0; } diff --git a/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c b/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c new file mode 100644 index 00000000000000..7efa9521105e1b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c @@ -0,0 +1,181 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include "bpf_misc.h" +#include "bpf_experimental.h" + +char _license[] SEC("license") = "GPL"; + +/* Timer tests */ + +struct timer_elem { + struct bpf_timer t; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct timer_elem); +} timer_map SEC(".maps"); + +static int timer_cb(void *map, int *key, struct bpf_timer *timer) +{ + u32 data; + /* Timer callbacks are never sleepable, even from non-sleepable programs */ + bpf_copy_from_user(&data, sizeof(data), NULL); + return 0; +} + +SEC("fentry/bpf_fentry_test1") +__failure __msg("helper call might sleep in a non-sleepable prog") +int timer_non_sleepable_prog(void *ctx) +{ + struct timer_elem *val; + int key = 0; + + val = bpf_map_lookup_elem(&timer_map, &key); + if (!val) + return 0; + + bpf_timer_init(&val->t, &timer_map, 0); + bpf_timer_set_callback(&val->t, timer_cb); + return 0; +} + +SEC("lsm.s/file_open") +__failure __msg("helper call might sleep in a non-sleepable prog") +int timer_sleepable_prog(void *ctx) +{ + struct timer_elem *val; + int key = 0; + + val = bpf_map_lookup_elem(&timer_map, &key); + if (!val) + return 0; + + bpf_timer_init(&val->t, &timer_map, 0); + bpf_timer_set_callback(&val->t, timer_cb); + return 0; +} + +/* Workqueue tests */ + +struct wq_elem { + struct bpf_wq w; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct wq_elem); +} wq_map SEC(".maps"); + +static int wq_cb(void *map, int *key, void *value) +{ + u32 data; + /* Workqueue callbacks are always sleepable, even from non-sleepable programs */ + bpf_copy_from_user(&data, sizeof(data), NULL); + return 0; +} + +SEC("fentry/bpf_fentry_test1") +__success +int wq_non_sleepable_prog(void *ctx) +{ + struct wq_elem *val; + int key = 0; + + val = bpf_map_lookup_elem(&wq_map, &key); + if (!val) + return 0; + + if (bpf_wq_init(&val->w, &wq_map, 0) != 0) + return 0; + if (bpf_wq_set_callback_impl(&val->w, wq_cb, 0, NULL) != 0) + return 0; + return 0; +} + +SEC("lsm.s/file_open") +__success +int wq_sleepable_prog(void *ctx) +{ + struct wq_elem *val; + int key = 0; + + val = bpf_map_lookup_elem(&wq_map, &key); + if (!val) + return 0; + + if (bpf_wq_init(&val->w, &wq_map, 0) != 0) + return 0; + if (bpf_wq_set_callback_impl(&val->w, wq_cb, 0, NULL) != 0) + return 0; + return 0; +} + +/* Task work tests */ + +struct task_work_elem { + struct bpf_task_work tw; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct task_work_elem); +} task_work_map SEC(".maps"); + +static int task_work_cb(struct bpf_map *map, void *key, void *value) +{ + u32 data; + /* Task work callbacks are always sleepable, even from non-sleepable programs */ + bpf_copy_from_user(&data, sizeof(data), NULL); + return 0; +} + 
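/*
 * Annotation (summary of the tests in this file, not part of the patch
 * itself): bpf_timer callbacks always run in non-sleepable context, so the
 * sleepable bpf_copy_from_user() used in timer_cb() is rejected there even
 * when the attaching program is itself sleepable (lsm.s/file_open). bpf_wq
 * and bpf_task_work callbacks, by contrast, always run in sleepable
 * context, so the same helper is accepted in wq_cb() and task_work_cb()
 * even when they are scheduled from a non-sleepable fentry program. That
 * is what the __failure/__success expectations around these programs
 * encode.
 */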
+SEC("fentry/bpf_fentry_test1") +__success +int task_work_non_sleepable_prog(void *ctx) +{ + struct task_work_elem *val; + struct task_struct *task; + int key = 0; + + val = bpf_map_lookup_elem(&task_work_map, &key); + if (!val) + return 0; + + task = bpf_get_current_task_btf(); + if (!task) + return 0; + + bpf_task_work_schedule_resume_impl(task, &val->tw, &task_work_map, task_work_cb, NULL); + return 0; +} + +SEC("lsm.s/file_open") +__success +int task_work_sleepable_prog(void *ctx) +{ + struct task_work_elem *val; + struct task_struct *task; + int key = 0; + + val = bpf_map_lookup_elem(&task_work_map, &key); + if (!val) + return 0; + + task = bpf_get_current_task_btf(); + if (!task) + return 0; + + bpf_task_work_schedule_resume_impl(task, &val->tw, &task_work_map, task_work_cb, NULL); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c index 0a72e0228ea9a2..411a18437d7eab 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bounds.c +++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c @@ -1709,4 +1709,158 @@ __naked void jeq_disagreeing_tnums(void *ctx) : __clobber_all); } +SEC("socket") +__description("conditional jump on same register, branch taken") +__not_msg("20: (b7) r0 = 1 {{.*}} R0=1") +__success __log_level(2) +__retval(0) __flag(BPF_F_TEST_REG_INVARIANTS) +__naked void condition_jump_on_same_register(void *ctx) +{ + asm volatile(" \ + call %[bpf_get_prandom_u32]; \ + w8 = 0x80000000; \ + r0 &= r8; \ + if r0 == r0 goto +1; \ + goto l1_%=; \ + if r0 >= r0 goto +1; \ + goto l1_%=; \ + if r0 s>= r0 goto +1; \ + goto l1_%=; \ + if r0 <= r0 goto +1; \ + goto l1_%=; \ + if r0 s<= r0 goto +1; \ + goto l1_%=; \ + if r0 != r0 goto l1_%=; \ + if r0 > r0 goto l1_%=; \ + if r0 s> r0 goto l1_%=; \ + if r0 < r0 goto l1_%=; \ + if r0 s< r0 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = 1; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("jset on same register, constant value branch taken") +__not_msg("7: (b7) r0 = 1 {{.*}} R0=1") +__success __log_level(2) +__retval(0) __flag(BPF_F_TEST_REG_INVARIANTS) +__naked void jset_on_same_register_1(void *ctx) +{ + asm volatile(" \ + r0 = 0; \ + if r0 & r0 goto l1_%=; \ + r0 = 1; \ + if r0 & r0 goto +1; \ + goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = 1; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("jset on same register, scalar value branch taken") +__not_msg("12: (b7) r0 = 1 {{.*}} R0=1") +__success __log_level(2) +__retval(0) __flag(BPF_F_TEST_REG_INVARIANTS) +__naked void jset_on_same_register_2(void *ctx) +{ + asm volatile(" \ + /* range [1;2] */ \ + call %[bpf_get_prandom_u32]; \ + r0 &= 0x1; \ + r0 += 1; \ + if r0 & r0 goto +1; \ + goto l1_%=; \ + /* range [-2;-1] */ \ + call %[bpf_get_prandom_u32]; \ + r0 &= 0x1; \ + r0 -= 2; \ + if r0 & r0 goto +1; \ + goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = 1; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("jset on same register, scalar value unknown branch 1") +__msg("3: (b7) r0 = 0 {{.*}} R0=0") +__msg("5: (b7) r0 = 1 {{.*}} R0=1") +__success __log_level(2) +__flag(BPF_F_TEST_REG_INVARIANTS) +__naked void jset_on_same_register_3(void *ctx) +{ + asm volatile(" \ + /* range [0;1] */ \ + call %[bpf_get_prandom_u32]; \ + r0 &= 0x1; \ + if r0 & r0 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = 1; \ + exit; \ +" : + : 
__imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("jset on same register, scalar value unknown branch 2") +__msg("4: (b7) r0 = 0 {{.*}} R0=0") +__msg("6: (b7) r0 = 1 {{.*}} R0=1") +__success __log_level(2) +__flag(BPF_F_TEST_REG_INVARIANTS) +__naked void jset_on_same_register_4(void *ctx) +{ + asm volatile(" \ + /* range [-1;0] */ \ + call %[bpf_get_prandom_u32]; \ + r0 &= 0x1; \ + r0 -= 1; \ + if r0 & r0 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = 1; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("jset on same register, scalar value unknown branch 3") +__msg("4: (b7) r0 = 0 {{.*}} R0=0") +__msg("6: (b7) r0 = 1 {{.*}} R0=1") +__success __log_level(2) +__flag(BPF_F_TEST_REG_INVARIANTS) +__naked void jset_on_same_register_5(void *ctx) +{ + asm volatile(" \ + /* range [-1;1] */ \ + call %[bpf_get_prandom_u32]; \ + r0 &= 0x2; \ + r0 -= 1; \ + if r0 & r0 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = 1; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_direct_packet_access.c b/tools/testing/selftests/bpf/progs/verifier_direct_packet_access.c index 28b602ac9cbe2c..911caa8fd1b759 100644 --- a/tools/testing/selftests/bpf/progs/verifier_direct_packet_access.c +++ b/tools/testing/selftests/bpf/progs/verifier_direct_packet_access.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Converted from tools/testing/selftests/bpf/verifier/direct_packet_access.c */ +#include <linux/if_ether.h> #include <linux/bpf.h> #include <bpf/bpf_helpers.h> #include "bpf_misc.h" @@ -800,4 +801,62 @@ l0_%=: /* exit(0) */ \ : __clobber_all); } +#define access_test_non_linear(name, type, desc, retval, linear_sz, off) \ + SEC(type) \ + __description("direct packet access: " #name " (non-linear, " type ", " desc ")") \ + __success __retval(retval) \ + __linear_size(linear_sz) \ + __naked void access_non_linear_##name(void) \ + { \ + asm volatile (" \ + r2 = *(u32*)(r1 + %[skb_data]); \ + r3 = *(u32*)(r1 + %[skb_data_end]); \ + r0 = r2; \ + r0 += %[offset]; \ + if r0 > r3 goto l0_%=; \ + r0 = *(u8*)(r0 - 1); \ + r0 = 0; \ + exit; \ + l0_%=: r0 = 1; \ + exit; \ + " : \ + : __imm_const(skb_data, offsetof(struct __sk_buff, data)), \ + __imm_const(skb_data_end, offsetof(struct __sk_buff, data_end)), \ + __imm_const(offset, off) \ + : __clobber_all); \ + } + +access_test_non_linear(test31, "tc", "too short eth", 1, ETH_HLEN, 22); +access_test_non_linear(test32, "tc", "too short 1", 1, 1, 22); +access_test_non_linear(test33, "tc", "long enough", 0, 22, 22); +access_test_non_linear(test34, "cgroup_skb/ingress", "too short eth", 1, ETH_HLEN, 8); +access_test_non_linear(test35, "cgroup_skb/ingress", "too short 1", 1, 1, 8); +access_test_non_linear(test36, "cgroup_skb/ingress", "long enough", 0, 22, 8); + +SEC("tc") +__description("direct packet access: test37 (non-linear, linearized)") +__success __retval(0) +__linear_size(ETH_HLEN) +__naked void access_non_linear_linearized(void) +{ + asm volatile (" \ + r6 = r1; \ + r2 = 22; \ + call %[bpf_skb_pull_data]; \ + r2 = *(u32*)(r6 + %[skb_data]); \ + r3 = *(u32*)(r6 + %[skb_data_end]); \ + r0 = r2; \ + r0 += 22; \ + if r0 > r3 goto l0_%=; \ + r0 = *(u8*)(r0 - 1); \ + exit; \ +l0_%=: r0 = 1; \ + exit; \ +" : + : __imm(bpf_skb_pull_data), + __imm_const(skb_data, offsetof(struct __sk_buff, data)), + __imm_const(skb_data_end, offsetof(struct __sk_buff, data_end)) + : 
__clobber_all); +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_gotox.c b/tools/testing/selftests/bpf/progs/verifier_gotox.c new file mode 100644 index 00000000000000..607dad058ca16b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_gotox.c @@ -0,0 +1,389 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Isovalent */ + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include "bpf_misc.h" +#include "../../../include/linux/filter.h" + +#if defined(__TARGET_ARCH_x86) || defined(__TARGET_ARCH_arm64) + +#define DEFINE_SIMPLE_JUMP_TABLE_PROG(NAME, SRC_REG, OFF, IMM, OUTCOME) \ + \ + SEC("socket") \ + OUTCOME \ + __naked void jump_table_ ## NAME(void) \ + { \ + asm volatile (" \ + .pushsection .jumptables,\"\",@progbits; \ + jt0_%=: \ + .quad ret0_%= - socket; \ + .quad ret1_%= - socket; \ + .size jt0_%=, 16; \ + .global jt0_%=; \ + .popsection; \ + \ + r0 = jt0_%= ll; \ + r0 += 8; \ + r0 = *(u64 *)(r0 + 0); \ + .8byte %[gotox_r0]; \ + ret0_%=: \ + r0 = 0; \ + exit; \ + ret1_%=: \ + r0 = 1; \ + exit; \ + " : \ + : __imm_insn(gotox_r0, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_0, (SRC_REG), (OFF) , (IMM))) \ + : __clobber_all); \ + } + +/* + * The first program which doesn't use reserved fields + * loads and works properly. The rest fail to load. + */ +DEFINE_SIMPLE_JUMP_TABLE_PROG(ok, BPF_REG_0, 0, 0, __success __retval(1)) +DEFINE_SIMPLE_JUMP_TABLE_PROG(reserved_field_src_reg, BPF_REG_1, 0, 0, __failure __msg("BPF_JA|BPF_X uses reserved fields")) +DEFINE_SIMPLE_JUMP_TABLE_PROG(reserved_field_non_zero_off, BPF_REG_0, 1, 0, __failure __msg("BPF_JA|BPF_X uses reserved fields")) +DEFINE_SIMPLE_JUMP_TABLE_PROG(reserved_field_non_zero_imm, BPF_REG_0, 0, 1, __failure __msg("BPF_JA|BPF_X uses reserved fields")) + +/* + * Gotox is forbidden when there is no jump table loaded + * which points to the sub-function where the gotox is used + */ +SEC("socket") +__failure __msg("no jump tables found for subprog starting at 0") +__naked void jump_table_no_jump_table(void) +{ + asm volatile (" \ + .8byte %[gotox_r0]; \ + r0 = 1; \ + exit; \ +" : \ + : __imm_insn(gotox_r0, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_0, 0, 0 , 0)) + : __clobber_all); +} + +/* + * Incorrect type of the target register, only PTR_TO_INSN allowed + */ +SEC("socket") +__failure __msg("R1 has type scalar, expected PTR_TO_INSN") +__naked void jump_table_incorrect_dst_reg_type(void) +{ + asm volatile (" \ + .pushsection .jumptables,\"\",@progbits; \ +jt0_%=: \ + .quad ret0_%= - socket; \ + .quad ret1_%= - socket; \ + .size jt0_%=, 16; \ + .global jt0_%=; \ + .popsection; \ + \ + r0 = jt0_%= ll; \ + r0 += 8; \ + r0 = *(u64 *)(r0 + 0); \ + r1 = 42; \ + .8byte %[gotox_r1]; \ + ret0_%=: \ + r0 = 0; \ + exit; \ + ret1_%=: \ + r0 = 1; \ + exit; \ +" : \ + : __imm_insn(gotox_r1, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_1, 0, 0 , 0)) + : __clobber_all); +} + +#define DEFINE_INVALID_SIZE_PROG(READ_SIZE, OUTCOME) \ + \ + SEC("socket") \ + OUTCOME \ + __naked void jump_table_invalid_read_size_ ## READ_SIZE(void) \ + { \ + asm volatile (" \ + .pushsection .jumptables,\"\",@progbits; \ + jt0_%=: \ + .quad ret0_%= - socket; \ + .quad ret1_%= - socket; \ + .size jt0_%=, 16; \ + .global jt0_%=; \ + .popsection; \ + \ + r0 = jt0_%= ll; \ + r0 += 8; \ + r0 = *(" #READ_SIZE " *)(r0 + 0); \ + .8byte %[gotox_r0]; \ + ret0_%=: \ + r0 = 0; \ + exit; \ + ret1_%=: \ + r0 = 1; \ + exit; \ + " : \ + : __imm_insn(gotox_r0, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, 
BPF_REG_0, 0, 0 , 0)) \ + : __clobber_all); \ + } + +DEFINE_INVALID_SIZE_PROG(u32, __failure __msg("Invalid read of 4 bytes from insn_array")) +DEFINE_INVALID_SIZE_PROG(u16, __failure __msg("Invalid read of 2 bytes from insn_array")) +DEFINE_INVALID_SIZE_PROG(u8, __failure __msg("Invalid read of 1 bytes from insn_array")) + +SEC("socket") +__failure __msg("misaligned value access off 0+1+0 size 8") +__naked void jump_table_misaligned_access(void) +{ + asm volatile (" \ + .pushsection .jumptables,\"\",@progbits; \ +jt0_%=: \ + .quad ret0_%= - socket; \ + .quad ret1_%= - socket; \ + .size jt0_%=, 16; \ + .global jt0_%=; \ + .popsection; \ + \ + r0 = jt0_%= ll; \ + r0 += 1; \ + r0 = *(u64 *)(r0 + 0); \ + .8byte %[gotox_r0]; \ + ret0_%=: \ + r0 = 0; \ + exit; \ + ret1_%=: \ + r0 = 1; \ + exit; \ +" : \ + : __imm_insn(gotox_r0, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_0, 0, 0 , 0)) + : __clobber_all); +} + +SEC("socket") +__failure __msg("invalid access to map value, value_size=16 off=24 size=8") +__naked void jump_table_invalid_mem_acceess_pos(void) +{ + asm volatile (" \ + .pushsection .jumptables,\"\",@progbits; \ +jt0_%=: \ + .quad ret0_%= - socket; \ + .quad ret1_%= - socket; \ + .size jt0_%=, 16; \ + .global jt0_%=; \ + .popsection; \ + \ + r0 = jt0_%= ll; \ + r0 += 24; \ + r0 = *(u64 *)(r0 + 0); \ + .8byte %[gotox_r0]; \ + ret0_%=: \ + r0 = 0; \ + exit; \ + ret1_%=: \ + r0 = 1; \ + exit; \ +" : \ + : __imm_insn(gotox_r0, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_0, 0, 0 , 0)) + : __clobber_all); +} + +SEC("socket") +__failure __msg("invalid access to map value, value_size=16 off=-24 size=8") +__naked void jump_table_invalid_mem_acceess_neg(void) +{ + asm volatile (" \ + .pushsection .jumptables,\"\",@progbits; \ +jt0_%=: \ + .quad ret0_%= - socket; \ + .quad ret1_%= - socket; \ + .size jt0_%=, 16; \ + .global jt0_%=; \ + .popsection; \ + \ + r0 = jt0_%= ll; \ + r0 -= 24; \ + r0 = *(u64 *)(r0 + 0); \ + .8byte %[gotox_r0]; \ + ret0_%=: \ + r0 = 0; \ + exit; \ + ret1_%=: \ + r0 = 1; \ + exit; \ +" : \ + : __imm_insn(gotox_r0, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_0, 0, 0 , 0)) + : __clobber_all); +} + +SEC("socket") +__success __retval(1) +__naked void jump_table_add_sub_ok(void) +{ + asm volatile (" \ + .pushsection .jumptables,\"\",@progbits; \ +jt0_%=: \ + .quad ret0_%= - socket; \ + .quad ret1_%= - socket; \ + .size jt0_%=, 16; \ + .global jt0_%=; \ + .popsection; \ + \ + r0 = jt0_%= ll; \ + r0 -= 24; \ + r0 += 32; \ + r0 = *(u64 *)(r0 + 0); \ + .8byte %[gotox_r0]; \ + ret0_%=: \ + r0 = 0; \ + exit; \ + ret1_%=: \ + r0 = 1; \ + exit; \ +" : \ + : __imm_insn(gotox_r0, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_0, 0, 0 , 0)) + : __clobber_all); +} + +SEC("socket") +__failure __msg("write into map forbidden, value_size=16 off=8 size=8") +__naked void jump_table_no_writes(void) +{ + asm volatile (" \ + .pushsection .jumptables,\"\",@progbits; \ +jt0_%=: \ + .quad ret0_%= - socket; \ + .quad ret1_%= - socket; \ + .size jt0_%=, 16; \ + .global jt0_%=; \ + .popsection; \ + \ + r0 = jt0_%= ll; \ + r0 += 8; \ + r1 = 0xbeef; \ + *(u64 *)(r0 + 0) = r1; \ + .8byte %[gotox_r0]; \ + ret0_%=: \ + r0 = 0; \ + exit; \ + ret1_%=: \ + r0 = 1; \ + exit; \ +" : \ + : __imm_insn(gotox_r0, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_0, 0, 0 , 0)) + : __clobber_all); +} + +#define DEFINE_JUMP_TABLE_USE_REG(REG) \ + SEC("socket") \ + __success __retval(1) \ + __naked void jump_table_use_reg_r ## REG(void) \ + { \ + asm volatile (" \ + .pushsection .jumptables,\"\",@progbits; \ + 
jt0_%=: \ + .quad ret0_%= - socket; \ + .quad ret1_%= - socket; \ + .size jt0_%=, 16; \ + .global jt0_%=; \ + .popsection; \ + \ + r0 = jt0_%= ll; \ + r0 += 8; \ + r" #REG " = *(u64 *)(r0 + 0); \ + .8byte %[gotox_rX]; \ + ret0_%=: \ + r0 = 0; \ + exit; \ + ret1_%=: \ + r0 = 1; \ + exit; \ + " : \ + : __imm_insn(gotox_rX, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_ ## REG, 0, 0 , 0)) \ + : __clobber_all); \ + } + +DEFINE_JUMP_TABLE_USE_REG(0) +DEFINE_JUMP_TABLE_USE_REG(1) +DEFINE_JUMP_TABLE_USE_REG(2) +DEFINE_JUMP_TABLE_USE_REG(3) +DEFINE_JUMP_TABLE_USE_REG(4) +DEFINE_JUMP_TABLE_USE_REG(5) +DEFINE_JUMP_TABLE_USE_REG(6) +DEFINE_JUMP_TABLE_USE_REG(7) +DEFINE_JUMP_TABLE_USE_REG(8) +DEFINE_JUMP_TABLE_USE_REG(9) + +__used static int test_subprog(void) +{ + return 0; +} + +SEC("socket") +__failure __msg("jump table for insn 4 points outside of the subprog [0,10]") +__naked void jump_table_outside_subprog(void) +{ + asm volatile (" \ + .pushsection .jumptables,\"\",@progbits; \ +jt0_%=: \ + .quad ret0_%= - socket; \ + .quad ret1_%= - socket; \ + .quad ret_out_%= - socket; \ + .size jt0_%=, 24; \ + .global jt0_%=; \ + .popsection; \ + \ + r0 = jt0_%= ll; \ + r0 += 8; \ + r0 = *(u64 *)(r0 + 0); \ + .8byte %[gotox_r0]; \ + ret0_%=: \ + r0 = 0; \ + exit; \ + ret1_%=: \ + r0 = 1; \ + call test_subprog; \ + exit; \ + ret_out_%=: \ +" : \ + : __imm_insn(gotox_r0, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_0, 0, 0 , 0)) + : __clobber_all); +} + +SEC("socket") +__success __retval(1) +__naked void jump_table_contains_non_unique_values(void) +{ + asm volatile (" \ + .pushsection .jumptables,\"\",@progbits; \ +jt0_%=: \ + .quad ret0_%= - socket; \ + .quad ret1_%= - socket; \ + .quad ret0_%= - socket; \ + .quad ret1_%= - socket; \ + .quad ret0_%= - socket; \ + .quad ret1_%= - socket; \ + .quad ret0_%= - socket; \ + .quad ret1_%= - socket; \ + .quad ret0_%= - socket; \ + .quad ret1_%= - socket; \ + .size jt0_%=, 80; \ + .global jt0_%=; \ + .popsection; \ + \ + r0 = jt0_%= ll; \ + r0 += 8; \ + r0 = *(u64 *)(r0 + 0); \ + .8byte %[gotox_r0]; \ + ret0_%=: \ + r0 = 0; \ + exit; \ + ret1_%=: \ + r0 = 1; \ + exit; \ +" : \ + : __imm_insn(gotox_r0, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_0, 0, 0 , 0)) + : __clobber_all); +} + +#endif /* __TARGET_ARCH_x86 || __TARGET_ARCH_arm64 */ + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_live_stack.c b/tools/testing/selftests/bpf/progs/verifier_live_stack.c index c0e80850926827..2de105057bbc42 100644 --- a/tools/testing/selftests/bpf/progs/verifier_live_stack.c +++ b/tools/testing/selftests/bpf/progs/verifier_live_stack.c @@ -292,3 +292,53 @@ __naked void syzbot_postorder_bug1(void) "exit;" ::: __clobber_all); } + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u32); +} map_array SEC(".maps"); + +SEC("socket") +__failure __msg("invalid read from stack R2 off=-1024 size=8") +__flag(BPF_F_TEST_STATE_FREQ) +__naked unsigned long caller_stack_write_tail_call(void) +{ + asm volatile ( + "r6 = r1;" + "*(u64 *)(r10 - 8) = -8;" + "call %[bpf_get_prandom_u32];" + "if r0 != 42 goto 1f;" + "goto 2f;" + "1:" + "*(u64 *)(r10 - 8) = -1024;" + "2:" + "r1 = r6;" + "r2 = r10;" + "r2 += -8;" + "call write_tail_call;" + "r1 = *(u64 *)(r10 - 8);" + "r2 = r10;" + "r2 += r1;" + "r0 = *(u64 *)(r2 + 0);" + "exit;" + :: __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +static __used __naked unsigned long write_tail_call(void) +{ + asm volatile ( + "r6 = r2;" + "r2 = 
%[map_array] ll;" + "r3 = 0;" + "call %[bpf_tail_call];" + "*(u64 *)(r6 + 0) = -16;" + "r0 = 0;" + "exit;" + : + : __imm(bpf_tail_call), + __imm_addr(map_array) + : __clobber_all); +} diff --git a/tools/testing/selftests/bpf/progs/verifier_lsm.c b/tools/testing/selftests/bpf/progs/verifier_lsm.c index 32e5e779cb962f..6af9100a37ffd6 100644 --- a/tools/testing/selftests/bpf/progs/verifier_lsm.c +++ b/tools/testing/selftests/bpf/progs/verifier_lsm.c @@ -4,7 +4,7 @@ #include <bpf/bpf_helpers.h> #include "bpf_misc.h" -SEC("lsm/file_alloc_security") +SEC("lsm/file_permission") __description("lsm bpf prog with -4095~0 retval. test 1") __success __naked int errno_zero_retval_test1(void *ctx) @@ -15,7 +15,7 @@ __naked int errno_zero_retval_test1(void *ctx) ::: __clobber_all); } -SEC("lsm/file_alloc_security") +SEC("lsm/file_permission") __description("lsm bpf prog with -4095~0 retval. test 2") __success __naked int errno_zero_retval_test2(void *ctx) diff --git a/tools/testing/selftests/bpf/progs/verifier_netfilter_ctx.c b/tools/testing/selftests/bpf/progs/verifier_netfilter_ctx.c index ab9f9f2620edca..e2cbc5bda65ea5 100644 --- a/tools/testing/selftests/bpf/progs/verifier_netfilter_ctx.c +++ b/tools/testing/selftests/bpf/progs/verifier_netfilter_ctx.c @@ -79,11 +79,6 @@ int with_invalid_ctx_access_test5(struct bpf_nf_ctx *ctx) return NF_ACCEPT; } -extern int bpf_dynptr_from_skb(struct __sk_buff *skb, __u64 flags, - struct bpf_dynptr *ptr__uninit) __ksym; -extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, uint32_t offset, - void *buffer, uint32_t buffer__sz) __ksym; - SEC("netfilter") __description("netfilter test prog with skb and state read access") __success __failure_unpriv diff --git a/tools/testing/selftests/bpf/progs/verifier_sock.c b/tools/testing/selftests/bpf/progs/verifier_sock.c index 2b4610b53382d0..a2132c72d3b806 100644 --- a/tools/testing/selftests/bpf/progs/verifier_sock.c +++ b/tools/testing/selftests/bpf/progs/verifier_sock.c @@ -1117,10 +1117,17 @@ int tail_call(struct __sk_buff *sk) return 0; } -/* Tail calls invalidate packet pointers. */ +static __noinline +int static_tail_call(struct __sk_buff *sk) +{ + bpf_tail_call_static(sk, &jmp_table, 0); + return 0; +} + +/* Tail calls in sub-programs invalidate packet pointers. */ SEC("tc") __failure __msg("invalid mem access") -int invalidate_pkt_pointers_by_tail_call(struct __sk_buff *sk) +int invalidate_pkt_pointers_by_global_tail_call(struct __sk_buff *sk) { int *p = (void *)(long)sk->data; @@ -1131,4 +1138,32 @@ int invalidate_pkt_pointers_by_tail_call(struct __sk_buff *sk) return TCX_PASS; } +/* Tail calls in static sub-programs invalidate packet pointers. */ +SEC("tc") +__failure __msg("invalid mem access") +int invalidate_pkt_pointers_by_static_tail_call(struct __sk_buff *sk) +{ + int *p = (void *)(long)sk->data; + + if ((void *)(p + 1) > (void *)(long)sk->data_end) + return TCX_DROP; + static_tail_call(sk); + *p = 42; /* this is unsafe */ + return TCX_PASS; +} + +/* Direct tail calls do not invalidate packet pointers. 
*/ +SEC("tc") +__success +int invalidate_pkt_pointers_by_tail_call(struct __sk_buff *sk) +{ + int *p = (void *)(long)sk->data; + + if ((void *)(p + 1) > (void *)(long)sk->data_end) + return TCX_DROP; + bpf_tail_call_static(sk, &jmp_table, 0); + *p = 42; /* this is NOT unsafe: tail calls don't return */ + return TCX_PASS; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c index ac3e418c2a9616..61886ed554de80 100644 --- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c @@ -793,4 +793,57 @@ __naked int stack_slot_aliases_precision(void) ); } +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u32); +} map_array SEC(".maps"); + +__naked __noinline __used +static unsigned long identity_tail_call(void) +{ + /* the simplest identity function involving a tail call */ + asm volatile ( + "r6 = r2;" + "r2 = %[map_array] ll;" + "r3 = 0;" + "call %[bpf_tail_call];" + "r0 = r6;" + "exit;" + : + : __imm(bpf_tail_call), + __imm_addr(map_array) + : __clobber_all); +} + +SEC("?raw_tp") +__failure __log_level(2) +__msg("13: (85) call bpf_tail_call#12") +__msg("mark_precise: frame1: last_idx 13 first_idx 0 subseq_idx -1 ") +__msg("returning from callee:") +__msg("frame1: R0=scalar() R6=3 R10=fp0") +__msg("to caller at 4:") +__msg("R0=scalar() R6=map_value(map=.data.vals,ks=4,vs=16) R10=fp0") +__msg("6: (0f) r1 += r0") +__msg("mark_precise: frame0: regs=r0 stack= before 5: (bf) r1 = r6") +__msg("mark_precise: frame0: regs=r0 stack= before 4: (27) r0 *= 4") +__msg("mark_precise: frame0: parent state regs=r0 stack=: R0=Pscalar() R6=map_value(map=.data.vals,ks=4,vs=16) R10=fp0") +__msg("math between map_value pointer and register with unbounded min value is not allowed") +__naked int subprog_result_tail_call(void) +{ + asm volatile ( + "r2 = 3;" + "call identity_tail_call;" + "r0 *= 4;" + "r1 = %[vals];" + "r1 += r0;" + "r0 = *(u32 *)(r1 + 0);" + "exit;" + : + : __imm_ptr(vals) + : __clobber_common + ); +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/wq.c b/tools/testing/selftests/bpf/progs/wq.c index 2f1ba08c293e26..25be2cd9d42c72 100644 --- a/tools/testing/selftests/bpf/progs/wq.c +++ b/tools/testing/selftests/bpf/progs/wq.c @@ -187,3 +187,20 @@ long test_call_lru_sleepable(void *ctx) return test_elem_callback(&lru, &key, wq_callback); } + +SEC("tc") +long test_map_no_btf(void *ctx) +{ + struct elem *val; + struct bpf_wq *wq; + int key = 42; + + val = bpf_map_lookup_elem(&array, &key); + if (!val) + return -2; + + wq = &val->w; + if (bpf_wq_init(wq, &array, 0) != 0) + return -3; + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/wq_failures.c b/tools/testing/selftests/bpf/progs/wq_failures.c index 4240211a19001f..d06f6d40594a65 100644 --- a/tools/testing/selftests/bpf/progs/wq_failures.c +++ b/tools/testing/selftests/bpf/progs/wq_failures.c @@ -142,3 +142,26 @@ long test_wrong_wq_pointer_offset(void *ctx) return -22; } + +SEC("tc") +__log_level(2) +__failure +__msg(": (85) call bpf_wq_init#") +__msg("R1 doesn't have constant offset. 
bpf_wq has to be at the constant offset") +long test_bad_wq_off(void *ctx) +{ + struct elem *val; + struct bpf_wq *wq; + int key = 42; + u64 unknown; + + val = bpf_map_lookup_elem(&array, &key); + if (!val) + return -2; + + unknown = bpf_get_prandom_u32(); + wq = &val->w + unknown; + if (bpf_wq_init(wq, &array, 0) != 0) + return -3; + return 0; +} diff --git a/tools/testing/selftests/bpf/test_bpftool_build.sh b/tools/testing/selftests/bpf/test_bpftool_build.sh index 1453a53ed547d9..b03a875715920f 100755 --- a/tools/testing/selftests/bpf/test_bpftool_build.sh +++ b/tools/testing/selftests/bpf/test_bpftool_build.sh @@ -90,10 +90,6 @@ echo -e "... through kbuild\n" if [ -f ".config" ] ; then make_and_clean tools/bpf - ## "make tools/bpf" sets $(OUTPUT) to ...tools/bpf/runqslower for - ## runqslower, but the default (used for the "clean" target) is .output. - ## Let's make sure we clean runqslower's directory properly. - make -C tools/bpf/runqslower OUTPUT=${KDIR_ROOT_DIR}/tools/bpf/runqslower/ clean ## $OUTPUT is overwritten in kbuild Makefile, and thus cannot be passed ## down from toplevel Makefile to bpftool's Makefile. diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c b/tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c index 769206fc70e485..7b4ae5e81d3252 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c @@ -5,6 +5,7 @@ #include <linux/delay.h> #include <linux/module.h> #include <linux/prandom.h> +#include <linux/ktime.h> #include <asm/rqspinlock.h> #include <linux/perf_event.h> #include <linux/kthread.h> @@ -22,48 +23,146 @@ static struct perf_event_attr hw_attr = { static rqspinlock_t lock_a; static rqspinlock_t lock_b; +static rqspinlock_t lock_c; + +#define RQSL_SLOW_THRESHOLD_MS 10 +static const unsigned int rqsl_hist_ms[] = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 12, 14, 16, 18, 20, 25, 30, 40, 50, 75, + 100, 150, 200, 250, 1000, +}; +#define RQSL_NR_HIST_BUCKETS ARRAY_SIZE(rqsl_hist_ms) + +enum rqsl_context { + RQSL_CTX_NORMAL = 0, + RQSL_CTX_NMI, + RQSL_CTX_MAX, +}; + +struct rqsl_cpu_hist { + atomic64_t hist[RQSL_CTX_MAX][RQSL_NR_HIST_BUCKETS]; + atomic64_t success[RQSL_CTX_MAX]; + atomic64_t failure[RQSL_CTX_MAX]; +}; + +static DEFINE_PER_CPU(struct rqsl_cpu_hist, rqsl_cpu_hists); + +enum rqsl_mode { + RQSL_MODE_AA = 0, + RQSL_MODE_ABBA, + RQSL_MODE_ABBCCA, +}; + +static int test_mode = RQSL_MODE_AA; +module_param(test_mode, int, 0644); +MODULE_PARM_DESC(test_mode, + "rqspinlock test mode: 0 = AA, 1 = ABBA, 2 = ABBCCA"); + +static int normal_delay = 20; +module_param(normal_delay, int, 0644); +MODULE_PARM_DESC(normal_delay, + "rqspinlock critical section length for normal context (20ms default)"); + +static int nmi_delay = 10; +module_param(nmi_delay, int, 0644); +MODULE_PARM_DESC(nmi_delay, + "rqspinlock critical section length for NMI context (10ms default)"); static struct perf_event **rqsl_evts; static int rqsl_nevts; -static bool test_ab = false; -module_param(test_ab, bool, 0644); -MODULE_PARM_DESC(test_ab, "Test ABBA situations instead of AA situations"); - static struct task_struct **rqsl_threads; static int rqsl_nthreads; static atomic_t rqsl_ready_cpus = ATOMIC_INIT(0); static int pause = 0; -static bool nmi_locks_a(int cpu) +static const char *rqsl_mode_names[] = { + [RQSL_MODE_AA] = "AA", + [RQSL_MODE_ABBA] = "ABBA", + [RQSL_MODE_ABBCCA] = "ABBCCA", +}; + +struct rqsl_lock_pair { + rqspinlock_t *worker_lock; + rqspinlock_t *nmi_lock; +}; + 
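/*
 * Annotation (summary of rqsl_get_lock_pair() below, not part of the
 * patch itself): each test mode picks a worker-context lock and an
 * NMI-context lock per CPU so that the two contexts form a cycle:
 *
 *   AA      worker: lock_a                nmi: lock_a        (self deadlock)
 *   ABBA    even CPUs: a / b, odd CPUs: b / a                (2-lock cycle)
 *   ABBCCA  cpu%3==0: a / b, ==1: b / c, ==2: c / a          (3-lock cycle)
 *
 * raw_res_spin_lock_irqsave() is expected to detect the cycle and fail the
 * acquisition (nonzero return) rather than hang; the per-CPU histogram
 * added below records how long each attempt took and whether it succeeded,
 * split by normal vs. NMI context.
 */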
+static struct rqsl_lock_pair rqsl_get_lock_pair(int cpu) { - return (cpu & 1) && test_ab; + int mode = READ_ONCE(test_mode); + + switch (mode) { + default: + case RQSL_MODE_AA: + return (struct rqsl_lock_pair){ &lock_a, &lock_a }; + case RQSL_MODE_ABBA: + if (cpu & 1) + return (struct rqsl_lock_pair){ &lock_b, &lock_a }; + return (struct rqsl_lock_pair){ &lock_a, &lock_b }; + case RQSL_MODE_ABBCCA: + switch (cpu % 3) { + case 0: + return (struct rqsl_lock_pair){ &lock_a, &lock_b }; + case 1: + return (struct rqsl_lock_pair){ &lock_b, &lock_c }; + default: + return (struct rqsl_lock_pair){ &lock_c, &lock_a }; + } + } +} + +static u32 rqsl_hist_bucket_idx(u32 delta_ms) +{ + int i; + + for (i = 0; i < RQSL_NR_HIST_BUCKETS; i++) { + if (delta_ms <= rqsl_hist_ms[i]) + return i; + } + + return RQSL_NR_HIST_BUCKETS - 1; +} + +static void rqsl_record_lock_result(u64 delta_ns, enum rqsl_context ctx, int ret) +{ + struct rqsl_cpu_hist *hist = this_cpu_ptr(&rqsl_cpu_hists); + u32 delta_ms = DIV_ROUND_UP_ULL(delta_ns, NSEC_PER_MSEC); + u32 bucket = rqsl_hist_bucket_idx(delta_ms); + atomic64_t *buckets = hist->hist[ctx]; + + atomic64_inc(&buckets[bucket]); + if (!ret) + atomic64_inc(&hist->success[ctx]); + else + atomic64_inc(&hist->failure[ctx]); } static int rqspinlock_worker_fn(void *arg) { int cpu = smp_processor_id(); unsigned long flags; + u64 start_ns; int ret; if (cpu) { atomic_inc(&rqsl_ready_cpus); while (!kthread_should_stop()) { + struct rqsl_lock_pair locks = rqsl_get_lock_pair(cpu); + rqspinlock_t *worker_lock = locks.worker_lock; + if (READ_ONCE(pause)) { msleep(1000); continue; } - if (nmi_locks_a(cpu)) - ret = raw_res_spin_lock_irqsave(&lock_b, flags); - else - ret = raw_res_spin_lock_irqsave(&lock_a, flags); - mdelay(20); - if (nmi_locks_a(cpu) && !ret) - raw_res_spin_unlock_irqrestore(&lock_b, flags); - else if (!ret) - raw_res_spin_unlock_irqrestore(&lock_a, flags); + start_ns = ktime_get_mono_fast_ns(); + ret = raw_res_spin_lock_irqsave(worker_lock, flags); + rqsl_record_lock_result(ktime_get_mono_fast_ns() - start_ns, + RQSL_CTX_NORMAL, ret); + mdelay(normal_delay); + if (!ret) + raw_res_spin_unlock_irqrestore(worker_lock, flags); cpu_relax(); } return 0; @@ -91,24 +190,25 @@ static int rqspinlock_worker_fn(void *arg) static void nmi_cb(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { + struct rqsl_lock_pair locks; int cpu = smp_processor_id(); unsigned long flags; + u64 start_ns; int ret; if (!cpu || READ_ONCE(pause)) return; - if (nmi_locks_a(cpu)) - ret = raw_res_spin_lock_irqsave(&lock_a, flags); - else - ret = raw_res_spin_lock_irqsave(test_ab ? &lock_b : &lock_a, flags); + locks = rqsl_get_lock_pair(cpu); + start_ns = ktime_get_mono_fast_ns(); + ret = raw_res_spin_lock_irqsave(locks.nmi_lock, flags); + rqsl_record_lock_result(ktime_get_mono_fast_ns() - start_ns, + RQSL_CTX_NMI, ret); - mdelay(10); + mdelay(nmi_delay); - if (nmi_locks_a(cpu) && !ret) - raw_res_spin_unlock_irqrestore(&lock_a, flags); - else if (!ret) - raw_res_spin_unlock_irqrestore(test_ab ? &lock_b : &lock_a, flags); + if (!ret) + raw_res_spin_unlock_irqrestore(locks.nmi_lock, flags); } static void free_rqsl_threads(void) @@ -142,13 +242,19 @@ static int bpf_test_rqspinlock_init(void) int i, ret; int ncpus = num_online_cpus(); - pr_err("Mode = %s\n", test_ab ? 
"ABBA" : "AA"); + if (test_mode < RQSL_MODE_AA || test_mode > RQSL_MODE_ABBCCA) { + pr_err("Invalid mode %d\n", test_mode); + return -EINVAL; + } + + pr_err("Mode = %s\n", rqsl_mode_names[test_mode]); - if (ncpus < 3) + if (ncpus < test_mode + 2) return -ENOTSUPP; raw_res_spin_lock_init(&lock_a); raw_res_spin_lock_init(&lock_b); + raw_res_spin_lock_init(&lock_c); rqsl_evts = kcalloc(ncpus - 1, sizeof(*rqsl_evts), GFP_KERNEL); if (!rqsl_evts) @@ -196,10 +302,88 @@ err_perf_events: module_init(bpf_test_rqspinlock_init); +static void rqsl_print_histograms(void) +{ + int cpu, i; + + pr_err("rqspinlock acquisition latency histogram (ms):\n"); + + for_each_online_cpu(cpu) { + struct rqsl_cpu_hist *hist = per_cpu_ptr(&rqsl_cpu_hists, cpu); + u64 norm_counts[RQSL_NR_HIST_BUCKETS]; + u64 nmi_counts[RQSL_NR_HIST_BUCKETS]; + u64 total_counts[RQSL_NR_HIST_BUCKETS]; + u64 norm_success, nmi_success, success_total; + u64 norm_failure, nmi_failure, failure_total; + u64 norm_total = 0, nmi_total = 0, total = 0; + bool has_slow = false; + + for (i = 0; i < RQSL_NR_HIST_BUCKETS; i++) { + norm_counts[i] = atomic64_read(&hist->hist[RQSL_CTX_NORMAL][i]); + nmi_counts[i] = atomic64_read(&hist->hist[RQSL_CTX_NMI][i]); + total_counts[i] = norm_counts[i] + nmi_counts[i]; + norm_total += norm_counts[i]; + nmi_total += nmi_counts[i]; + total += total_counts[i]; + if (rqsl_hist_ms[i] > RQSL_SLOW_THRESHOLD_MS && + total_counts[i]) + has_slow = true; + } + + norm_success = atomic64_read(&hist->success[RQSL_CTX_NORMAL]); + nmi_success = atomic64_read(&hist->success[RQSL_CTX_NMI]); + norm_failure = atomic64_read(&hist->failure[RQSL_CTX_NORMAL]); + nmi_failure = atomic64_read(&hist->failure[RQSL_CTX_NMI]); + success_total = norm_success + nmi_success; + failure_total = norm_failure + nmi_failure; + + if (!total) + continue; + + if (!has_slow) { + pr_err(" cpu%d: total %llu (normal %llu, nmi %llu) | " + "success %llu (normal %llu, nmi %llu) | " + "failure %llu (normal %llu, nmi %llu), all within 0-%ums\n", + cpu, total, norm_total, nmi_total, + success_total, norm_success, nmi_success, + failure_total, norm_failure, nmi_failure, + RQSL_SLOW_THRESHOLD_MS); + continue; + } + + pr_err(" cpu%d: total %llu (normal %llu, nmi %llu) | " + "success %llu (normal %llu, nmi %llu) | " + "failure %llu (normal %llu, nmi %llu)\n", + cpu, total, norm_total, nmi_total, + success_total, norm_success, nmi_success, + failure_total, norm_failure, nmi_failure); + for (i = 0; i < RQSL_NR_HIST_BUCKETS; i++) { + unsigned int start_ms; + + if (!total_counts[i]) + continue; + + start_ms = i == 0 ? 
0 : rqsl_hist_ms[i - 1] + 1; + if (i == RQSL_NR_HIST_BUCKETS - 1) { + pr_err(" >= %ums: total %llu (normal %llu, nmi %llu)\n", + start_ms, total_counts[i], + norm_counts[i], nmi_counts[i]); + } else { + pr_err(" %u-%ums: total %llu (normal %llu, nmi %llu)\n", + start_ms, rqsl_hist_ms[i], + total_counts[i], + norm_counts[i], nmi_counts[i]); + } + } + } +} + static void bpf_test_rqspinlock_exit(void) { + WRITE_ONCE(pause, 1); free_rqsl_threads(); free_rqsl_evts(); + rqsl_print_histograms(); } module_exit(bpf_test_rqspinlock_exit); diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index 74ecc281bb8c12..338c035c36884c 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -43,6 +43,7 @@ #define TEST_TAG_EXPECT_STDERR_PFX_UNPRIV "comment:test_expect_stderr_unpriv=" #define TEST_TAG_EXPECT_STDOUT_PFX "comment:test_expect_stdout=" #define TEST_TAG_EXPECT_STDOUT_PFX_UNPRIV "comment:test_expect_stdout_unpriv=" +#define TEST_TAG_LINEAR_SIZE "comment:test_linear_size=" /* Warning: duplicated in bpf_misc.h */ #define POINTER_VALUE 0xbadcafe @@ -89,6 +90,7 @@ struct test_spec { int mode_mask; int arch_mask; int load_mask; + int linear_sz; bool auxiliary; bool valid; }; @@ -633,6 +635,21 @@ static int parse_test_spec(struct test_loader *tester, &spec->unpriv.stdout); if (err) goto cleanup; + } else if (str_has_pfx(s, TEST_TAG_LINEAR_SIZE)) { + switch (bpf_program__type(prog)) { + case BPF_PROG_TYPE_SCHED_ACT: + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_CGROUP_SKB: + val = s + sizeof(TEST_TAG_LINEAR_SIZE) - 1; + err = parse_int(val, &spec->linear_sz, "test linear size"); + if (err) + goto cleanup; + break; + default: + PRINT_FAIL("__linear_size for unsupported program type"); + err = -EINVAL; + goto cleanup; + } } } @@ -1007,10 +1024,11 @@ static bool is_unpriv_capable_map(struct bpf_map *map) } } -static int do_prog_test_run(int fd_prog, int *retval, bool empty_opts) +static int do_prog_test_run(int fd_prog, int *retval, bool empty_opts, int linear_sz) { __u8 tmp_out[TEST_DATA_LEN << 2] = {}; __u8 tmp_in[TEST_DATA_LEN] = {}; + struct __sk_buff ctx = {}; int err, saved_errno; LIBBPF_OPTS(bpf_test_run_opts, topts, .data_in = tmp_in, @@ -1020,6 +1038,12 @@ static int do_prog_test_run(int fd_prog, int *retval, bool empty_opts) .repeat = 1, ); + if (linear_sz) { + ctx.data_end = linear_sz; + topts.ctx_in = &ctx; + topts.ctx_size_in = sizeof(ctx); + } + if (empty_opts) { memset(&topts, 0, sizeof(struct bpf_test_run_opts)); topts.sz = sizeof(struct bpf_test_run_opts); @@ -1269,7 +1293,8 @@ void run_subtest(struct test_loader *tester, } err = do_prog_test_run(bpf_program__fd(tprog), &retval, - bpf_program__type(tprog) == BPF_PROG_TYPE_SYSCALL ? true : false); + bpf_program__type(tprog) == BPF_PROG_TYPE_SYSCALL ? 
true : false, + spec->linear_sz); if (!err && retval != subspec->retval && subspec->retval != POINTER_VALUE) { PRINT_FAIL("Unexpected retval: %d != %d\n", retval, subspec->retval); goto tobj_cleanup; diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 3fae9ce46ca9bf..ccc5acd55ff9d5 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -1399,7 +1399,8 @@ static void test_map_stress(void) static bool can_retry(int err) { return (err == EAGAIN || err == EBUSY || - (err == ENOMEM && map_opts.map_flags == BPF_F_NO_PREALLOC)); + ((err == ENOMEM || err == E2BIG) && + map_opts.map_flags == BPF_F_NO_PREALLOC)); } int map_update_retriable(int map_fd, const void *key, const void *value, int flags, int attempts, diff --git a/tools/testing/selftests/bpf/test_tag.c b/tools/testing/selftests/bpf/test_tag.c index 5546b05a048661..f1300047c1e0a9 100644 --- a/tools/testing/selftests/bpf/test_tag.c +++ b/tools/testing/selftests/bpf/test_tag.c @@ -116,7 +116,7 @@ static void tag_from_alg(int insns, uint8_t *tag, uint32_t len) static const struct sockaddr_alg alg = { .salg_family = AF_ALG, .salg_type = "hash", - .salg_name = "sha1", + .salg_name = "sha256", }; int fd_base, fd_alg, ret; ssize_t size; diff --git a/tools/testing/selftests/bpf/test_tc_edt.sh b/tools/testing/selftests/bpf/test_tc_edt.sh deleted file mode 100755 index 76f0bd17061f9e..00000000000000 --- a/tools/testing/selftests/bpf/test_tc_edt.sh +++ /dev/null @@ -1,100 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# This test installs a TC bpf program that throttles a TCP flow -# with dst port = 9000 down to 5MBps. Then it measures actual -# throughput of the flow. - -BPF_FILE="test_tc_edt.bpf.o" -if [[ $EUID -ne 0 ]]; then - echo "This script must be run as root" - echo "FAIL" - exit 1 -fi - -# check that nc, dd, and timeout are present -command -v nc >/dev/null 2>&1 || \ - { echo >&2 "nc is not available"; exit 1; } -command -v dd >/dev/null 2>&1 || \ - { echo >&2 "nc is not available"; exit 1; } -command -v timeout >/dev/null 2>&1 || \ - { echo >&2 "timeout is not available"; exit 1; } - -readonly NS_SRC="ns-src-$(mktemp -u XXXXXX)" -readonly NS_DST="ns-dst-$(mktemp -u XXXXXX)" - -readonly IP_SRC="172.16.1.100" -readonly IP_DST="172.16.2.100" - -cleanup() -{ - ip netns del ${NS_SRC} - ip netns del ${NS_DST} -} - -trap cleanup EXIT - -set -e # exit on error - -ip netns add "${NS_SRC}" -ip netns add "${NS_DST}" -ip link add veth_src type veth peer name veth_dst -ip link set veth_src netns ${NS_SRC} -ip link set veth_dst netns ${NS_DST} - -ip -netns ${NS_SRC} addr add ${IP_SRC}/24 dev veth_src -ip -netns ${NS_DST} addr add ${IP_DST}/24 dev veth_dst - -ip -netns ${NS_SRC} link set dev veth_src up -ip -netns ${NS_DST} link set dev veth_dst up - -ip -netns ${NS_SRC} route add ${IP_DST}/32 dev veth_src -ip -netns ${NS_DST} route add ${IP_SRC}/32 dev veth_dst - -# set up TC on TX -ip netns exec ${NS_SRC} tc qdisc add dev veth_src root fq -ip netns exec ${NS_SRC} tc qdisc add dev veth_src clsact -ip netns exec ${NS_SRC} tc filter add dev veth_src egress \ - bpf da obj ${BPF_FILE} sec cls_test - - -# start the listener -ip netns exec ${NS_DST} bash -c \ - "nc -4 -l -p 9000 >/dev/null &" -declare -i NC_PID=$! 
-sleep 1 - -declare -ir TIMEOUT=20 -declare -ir EXPECTED_BPS=5000000 - -# run the load, capture RX bytes on DST -declare -ir RX_BYTES_START=$( ip netns exec ${NS_DST} \ - cat /sys/class/net/veth_dst/statistics/rx_bytes ) - -set +e -ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero \ - bs=1000 count=1000000 > /dev/tcp/${IP_DST}/9000 2>/dev/null" -set -e - -declare -ir RX_BYTES_END=$( ip netns exec ${NS_DST} \ - cat /sys/class/net/veth_dst/statistics/rx_bytes ) - -declare -ir ACTUAL_BPS=$(( ($RX_BYTES_END - $RX_BYTES_START) / $TIMEOUT )) - -echo $TIMEOUT $ACTUAL_BPS $EXPECTED_BPS | \ - awk '{printf "elapsed: %d sec; bps difference: %.2f%%\n", - $1, ($2-$3)*100.0/$3}' - -# Pass the test if the actual bps is within 1% of the expected bps. -# The difference is usually about 0.1% on a 20-sec test, and ==> zero -# the longer the test runs. -declare -ir RES=$( echo $ACTUAL_BPS $EXPECTED_BPS | \ - awk 'function abs(x){return ((x < 0.0) ? -x : x)} - {if (abs(($1-$2)*100.0/$2) > 1.0) { print "1" } - else { print "0"} }' ) -if [ "${RES}" == "0" ] ; then - echo "PASS" -else - echo "FAIL" - exit 1 -fi diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh deleted file mode 100755 index cb55a908bb0d70..00000000000000 --- a/tools/testing/selftests/bpf/test_tc_tunnel.sh +++ /dev/null @@ -1,320 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# In-place tunneling - -BPF_FILE="test_tc_tunnel.bpf.o" -# must match the port that the bpf program filters on -readonly port=8000 - -readonly ns_prefix="ns-$$-" -readonly ns1="${ns_prefix}1" -readonly ns2="${ns_prefix}2" - -readonly ns1_v4=192.168.1.1 -readonly ns2_v4=192.168.1.2 -readonly ns1_v6=fd::1 -readonly ns2_v6=fd::2 - -# Must match port used by bpf program -readonly udpport=5555 -# MPLSoverUDP -readonly mplsudpport=6635 -readonly mplsproto=137 - -readonly infile="$(mktemp)" -readonly outfile="$(mktemp)" - -setup() { - ip netns add "${ns1}" - ip netns add "${ns2}" - - ip link add dev veth1 mtu 1500 netns "${ns1}" type veth \ - peer name veth2 mtu 1500 netns "${ns2}" - - ip netns exec "${ns1}" ethtool -K veth1 tso off - - ip -netns "${ns1}" link set veth1 up - ip -netns "${ns2}" link set veth2 up - - ip -netns "${ns1}" -4 addr add "${ns1_v4}/24" dev veth1 - ip -netns "${ns2}" -4 addr add "${ns2_v4}/24" dev veth2 - ip -netns "${ns1}" -6 addr add "${ns1_v6}/64" dev veth1 nodad - ip -netns "${ns2}" -6 addr add "${ns2_v6}/64" dev veth2 nodad - - # clamp route to reserve room for tunnel headers - ip -netns "${ns1}" -4 route flush table main - ip -netns "${ns1}" -6 route flush table main - ip -netns "${ns1}" -4 route add "${ns2_v4}" mtu 1450 dev veth1 - ip -netns "${ns1}" -6 route add "${ns2_v6}" mtu 1430 dev veth1 - - sleep 1 - - dd if=/dev/urandom of="${infile}" bs="${datalen}" count=1 status=none -} - -cleanup() { - ip netns del "${ns2}" - ip netns del "${ns1}" - - if [[ -f "${outfile}" ]]; then - rm "${outfile}" - fi - if [[ -f "${infile}" ]]; then - rm "${infile}" - fi - - if [[ -n $server_pid ]]; then - kill $server_pid 2> /dev/null - fi -} - -server_listen() { - ip netns exec "${ns2}" nc "${netcat_opt}" -l "${port}" > "${outfile}" & - server_pid=$! -} - -client_connect() { - ip netns exec "${ns1}" timeout 2 nc "${netcat_opt}" -w 1 "${addr2}" "${port}" < "${infile}" - echo $? 
-} - -verify_data() { - wait "${server_pid}" - server_pid= - # sha1sum returns two fields [sha1] [filepath] - # convert to bash array and access first elem - insum=($(sha1sum ${infile})) - outsum=($(sha1sum ${outfile})) - if [[ "${insum[0]}" != "${outsum[0]}" ]]; then - echo "data mismatch" - exit 1 - fi -} - -wait_for_port() { - for i in $(seq 20); do - if ip netns exec "${ns2}" ss ${2:--4}OHntl | grep -q "$1"; then - return 0 - fi - sleep 0.1 - done - return 1 -} - -set -e - -# no arguments: automated test, run all -if [[ "$#" -eq "0" ]]; then - echo "ipip" - $0 ipv4 ipip none 100 - - echo "ipip6" - $0 ipv4 ipip6 none 100 - - echo "ip6ip6" - $0 ipv6 ip6tnl none 100 - - echo "sit" - $0 ipv6 sit none 100 - - echo "ip4 vxlan" - $0 ipv4 vxlan eth 2000 - - echo "ip6 vxlan" - $0 ipv6 ip6vxlan eth 2000 - - for mac in none mpls eth ; do - echo "ip gre $mac" - $0 ipv4 gre $mac 100 - - echo "ip6 gre $mac" - $0 ipv6 ip6gre $mac 100 - - echo "ip gre $mac gso" - $0 ipv4 gre $mac 2000 - - echo "ip6 gre $mac gso" - $0 ipv6 ip6gre $mac 2000 - - echo "ip udp $mac" - $0 ipv4 udp $mac 100 - - echo "ip6 udp $mac" - $0 ipv6 ip6udp $mac 100 - - echo "ip udp $mac gso" - $0 ipv4 udp $mac 2000 - - echo "ip6 udp $mac gso" - $0 ipv6 ip6udp $mac 2000 - done - - echo "OK. All tests passed" - exit 0 -fi - -if [[ "$#" -ne "4" ]]; then - echo "Usage: $0" - echo " or: $0 <ipv4|ipv6> <tuntype> <none|mpls|eth> <data_len>" - exit 1 -fi - -case "$1" in -"ipv4") - readonly addr1="${ns1_v4}" - readonly addr2="${ns2_v4}" - readonly ipproto=4 - readonly netcat_opt=-${ipproto} - readonly foumod=fou - readonly foutype=ipip - readonly fouproto=4 - readonly fouproto_mpls=${mplsproto} - readonly gretaptype=gretap - ;; -"ipv6") - readonly addr1="${ns1_v6}" - readonly addr2="${ns2_v6}" - readonly ipproto=6 - readonly netcat_opt=-${ipproto} - readonly foumod=fou6 - readonly foutype=ip6tnl - readonly fouproto="41 -6" - readonly fouproto_mpls="${mplsproto} -6" - readonly gretaptype=ip6gretap - ;; -*) - echo "unknown arg: $1" - exit 1 - ;; -esac - -readonly tuntype=$2 -readonly mac=$3 -readonly datalen=$4 - -echo "encap ${addr1} to ${addr2}, type ${tuntype}, mac ${mac} len ${datalen}" - -trap cleanup EXIT - -setup - -# basic communication works -echo "test basic connectivity" -server_listen -wait_for_port ${port} ${netcat_opt} -client_connect -verify_data - -# clientside, insert bpf program to encap all TCP to port ${port} -# client can no longer connect -ip netns exec "${ns1}" tc qdisc add dev veth1 clsact -ip netns exec "${ns1}" tc filter add dev veth1 egress \ - bpf direct-action object-file ${BPF_FILE} \ - section "encap_${tuntype}_${mac}" -echo "test bpf encap without decap (expect failure)" -server_listen -wait_for_port ${port} ${netcat_opt} -! client_connect - -if [[ "$tuntype" =~ "udp" ]]; then - # Set up fou tunnel. - ttype="${foutype}" - targs="encap fou encap-sport auto encap-dport $udpport" - # fou may be a module; allow this to fail. 
- modprobe "${foumod}" ||true - if [[ "$mac" == "mpls" ]]; then - dport=${mplsudpport} - dproto=${fouproto_mpls} - tmode="mode any ttl 255" - else - dport=${udpport} - dproto=${fouproto} - fi - ip netns exec "${ns2}" ip fou add port $dport ipproto ${dproto} - targs="encap fou encap-sport auto encap-dport $dport" -elif [[ "$tuntype" =~ "gre" && "$mac" == "eth" ]]; then - ttype=$gretaptype -elif [[ "$tuntype" =~ "vxlan" && "$mac" == "eth" ]]; then - ttype="vxlan" - targs="id 1 dstport 8472 udp6zerocsumrx" -elif [[ "$tuntype" == "ipip6" ]]; then - ttype="ip6tnl" - targs="" -else - ttype=$tuntype - targs="" -fi - -# tunnel address family differs from inner for SIT -if [[ "${tuntype}" == "sit" ]]; then - link_addr1="${ns1_v4}" - link_addr2="${ns2_v4}" -elif [[ "${tuntype}" == "ipip6" ]]; then - link_addr1="${ns1_v6}" - link_addr2="${ns2_v6}" -else - link_addr1="${addr1}" - link_addr2="${addr2}" -fi - -# serverside, insert decap module -# server is still running -# client can connect again -ip netns exec "${ns2}" ip link add name testtun0 type "${ttype}" \ - ${tmode} remote "${link_addr1}" local "${link_addr2}" $targs - -expect_tun_fail=0 - -if [[ "$tuntype" == "ip6udp" && "$mac" == "mpls" ]]; then - # No support for MPLS IPv6 fou tunnel; expect failure. - expect_tun_fail=1 -elif [[ "$tuntype" =~ "udp" && "$mac" == "eth" ]]; then - # No support for TEB fou tunnel; expect failure. - expect_tun_fail=1 -elif [[ "$tuntype" =~ (gre|vxlan) && "$mac" == "eth" ]]; then - # Share ethernet address between tunnel/veth2 so L2 decap works. - ethaddr=$(ip netns exec "${ns2}" ip link show veth2 | \ - awk '/ether/ { print $2 }') - ip netns exec "${ns2}" ip link set testtun0 address $ethaddr -elif [[ "$mac" == "mpls" ]]; then - modprobe mpls_iptunnel ||true - modprobe mpls_gso ||true - ip netns exec "${ns2}" sysctl -qw net.mpls.platform_labels=65536 - ip netns exec "${ns2}" ip -f mpls route add 1000 dev lo - ip netns exec "${ns2}" ip link set lo up - ip netns exec "${ns2}" sysctl -qw net.mpls.conf.testtun0.input=1 - ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.lo.rp_filter=0 -fi - -# Because packets are decapped by the tunnel they arrive on testtun0 from -# the IP stack perspective. Ensure reverse path filtering is disabled -# otherwise we drop the TCP SYN as arriving on testtun0 instead of the -# expected veth2 (veth2 is where 192.168.1.2 is configured). -ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.rp_filter=0 -# rp needs to be disabled for both all and testtun0 as the rp value is -# selected as the max of the "all" and device-specific values. -ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.testtun0.rp_filter=0 -ip netns exec "${ns2}" ip link set dev testtun0 up -if [[ "$expect_tun_fail" == 1 ]]; then - # This tunnel mode is not supported, so we expect failure. - echo "test bpf encap with tunnel device decap (expect failure)" - ! 
client_connect -else - echo "test bpf encap with tunnel device decap" - client_connect - verify_data - server_listen - wait_for_port ${port} ${netcat_opt} -fi - -# serverside, use BPF for decap -ip netns exec "${ns2}" ip link del dev testtun0 -ip netns exec "${ns2}" tc qdisc add dev veth2 clsact -ip netns exec "${ns2}" tc filter add dev veth2 ingress \ - bpf direct-action object-file ${BPF_FILE} section decap -echo "test bpf encap with bpf decap" -client_connect -verify_data - -echo OK diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c index 352adc8df2d1cd..9234a58b0a972d 100644 --- a/tools/testing/selftests/bpf/xskxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -74,31 +74,23 @@ #define _GNU_SOURCE #include <assert.h> #include <fcntl.h> -#include <errno.h> #include <getopt.h> #include <linux/if_link.h> #include <linux/if_ether.h> #include <linux/mman.h> #include <linux/netdev.h> -#include <linux/bitmap.h> #include <linux/ethtool.h> #include <arpa/inet.h> #include <net/if.h> #include <locale.h> -#include <poll.h> -#include <pthread.h> -#include <signal.h> #include <stdio.h> #include <stdlib.h> #include <libgen.h> -#include <string.h> #include <stddef.h> #include <sys/mman.h> -#include <sys/socket.h> -#include <sys/time.h> #include <sys/types.h> -#include <unistd.h> +#include "prog_tests/test_xsk.h" #include "xsk_xdp_progs.skel.h" #include "xsk.h" #include "xskxceiver.h" @@ -109,9 +101,6 @@ #include <network_helpers.h> -#define MAX_TX_BUDGET_DEFAULT 32 - -static bool opt_verbose; static bool opt_print_tests; static enum test_mode opt_mode = TEST_MODE_ALL; static u32 opt_run_test = RUN_ALL_TESTS; @@ -120,169 +109,12 @@ void test__fail(void) { /* for network_helpers.c */ } static void __exit_with_error(int error, const char *file, const char *func, int line) { - ksft_test_result_fail("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error, - strerror(error)); + ksft_test_result_fail("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, + error, strerror(error)); ksft_exit_xfail(); } #define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, __LINE__) -#define busy_poll_string(test) (test)->ifobj_tx->busy_poll ? "BUSY-POLL " : "" -static char *mode_string(struct test_spec *test) -{ - switch (test->mode) { - case TEST_MODE_SKB: - return "SKB"; - case TEST_MODE_DRV: - return "DRV"; - case TEST_MODE_ZC: - return "ZC"; - default: - return "BOGUS"; - } -} - -static void report_failure(struct test_spec *test) -{ - if (test->fail) - return; - - ksft_test_result_fail("FAIL: %s %s%s\n", mode_string(test), busy_poll_string(test), - test->name); - test->fail = true; -} - -/* The payload is a word consisting of a packet sequence number in the upper - * 16-bits and a intra packet data sequence number in the lower 16 bits. So the 3rd packet's - * 5th word of data will contain the number (2<<16) | 4 as they are numbered from 0. 
- */ -static void write_payload(void *dest, u32 pkt_nb, u32 start, u32 size) -{ - u32 *ptr = (u32 *)dest, i; - - start /= sizeof(*ptr); - size /= sizeof(*ptr); - for (i = 0; i < size; i++) - ptr[i] = htonl(pkt_nb << 16 | (i + start)); -} - -static void gen_eth_hdr(struct xsk_socket_info *xsk, struct ethhdr *eth_hdr) -{ - memcpy(eth_hdr->h_dest, xsk->dst_mac, ETH_ALEN); - memcpy(eth_hdr->h_source, xsk->src_mac, ETH_ALEN); - eth_hdr->h_proto = htons(ETH_P_LOOPBACK); -} - -static bool is_umem_valid(struct ifobject *ifobj) -{ - return !!ifobj->umem->umem; -} - -static u32 mode_to_xdp_flags(enum test_mode mode) -{ - return (mode == TEST_MODE_SKB) ? XDP_FLAGS_SKB_MODE : XDP_FLAGS_DRV_MODE; -} - -static u64 umem_size(struct xsk_umem_info *umem) -{ - return umem->num_frames * umem->frame_size; -} - -static int xsk_configure_umem(struct ifobject *ifobj, struct xsk_umem_info *umem, void *buffer, - u64 size) -{ - struct xsk_umem_config cfg = { - .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, - .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, - .frame_size = umem->frame_size, - .frame_headroom = umem->frame_headroom, - .flags = XSK_UMEM__DEFAULT_FLAGS - }; - int ret; - - if (umem->fill_size) - cfg.fill_size = umem->fill_size; - - if (umem->comp_size) - cfg.comp_size = umem->comp_size; - - if (umem->unaligned_mode) - cfg.flags |= XDP_UMEM_UNALIGNED_CHUNK_FLAG; - - ret = xsk_umem__create(&umem->umem, buffer, size, - &umem->fq, &umem->cq, &cfg); - if (ret) - return ret; - - umem->buffer = buffer; - if (ifobj->shared_umem && ifobj->rx_on) { - umem->base_addr = umem_size(umem); - umem->next_buffer = umem_size(umem); - } - - return 0; -} - -static u64 umem_alloc_buffer(struct xsk_umem_info *umem) -{ - u64 addr; - - addr = umem->next_buffer; - umem->next_buffer += umem->frame_size; - if (umem->next_buffer >= umem->base_addr + umem_size(umem)) - umem->next_buffer = umem->base_addr; - - return addr; -} - -static void umem_reset_alloc(struct xsk_umem_info *umem) -{ - umem->next_buffer = 0; -} - -static void enable_busy_poll(struct xsk_socket_info *xsk) -{ - int sock_opt; - - sock_opt = 1; - if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_PREFER_BUSY_POLL, - (void *)&sock_opt, sizeof(sock_opt)) < 0) - exit_with_error(errno); - - sock_opt = 20; - if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_BUSY_POLL, - (void *)&sock_opt, sizeof(sock_opt)) < 0) - exit_with_error(errno); - - sock_opt = xsk->batch_size; - if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_BUSY_POLL_BUDGET, - (void *)&sock_opt, sizeof(sock_opt)) < 0) - exit_with_error(errno); -} - -static int __xsk_configure_socket(struct xsk_socket_info *xsk, struct xsk_umem_info *umem, - struct ifobject *ifobject, bool shared) -{ - struct xsk_socket_config cfg = {}; - struct xsk_ring_cons *rxr; - struct xsk_ring_prod *txr; - - xsk->umem = umem; - cfg.rx_size = xsk->rxqsize; - cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; - cfg.bind_flags = ifobject->bind_flags; - if (shared) - cfg.bind_flags |= XDP_SHARED_UMEM; - if (ifobject->mtu > MAX_ETH_PKT_SIZE) - cfg.bind_flags |= XDP_USE_SG; - if (umem->comp_size) - cfg.tx_size = umem->comp_size; - if (umem->fill_size) - cfg.rx_size = umem->fill_size; - - txr = ifobject->tx_on ? &xsk->tx : NULL; - rxr = ifobject->rx_on ? 
&xsk->rx : NULL; - return xsk_socket__create(&xsk->xsk, ifobject->ifindex, 0, umem->umem, rxr, txr, &cfg); -} static bool ifobj_zc_avail(struct ifobject *ifobject) { @@ -314,7 +146,7 @@ static bool ifobj_zc_avail(struct ifobject *ifobject) ifobject->bind_flags = XDP_USE_NEED_WAKEUP | XDP_ZEROCOPY; ifobject->rx_on = true; xsk->rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS; - ret = __xsk_configure_socket(xsk, umem, ifobject, false); + ret = xsk_configure_socket(xsk, umem, ifobject, false); if (!ret) zc_avail = true; @@ -327,25 +159,6 @@ out: return zc_avail; } -#define MAX_SKB_FRAGS_PATH "/proc/sys/net/core/max_skb_frags" -static unsigned int get_max_skb_frags(void) -{ - unsigned int max_skb_frags = 0; - FILE *file; - - file = fopen(MAX_SKB_FRAGS_PATH, "r"); - if (!file) { - ksft_print_msg("Error opening %s\n", MAX_SKB_FRAGS_PATH); - return 0; - } - - if (fscanf(file, "%u", &max_skb_frags) != 1) - ksft_print_msg("Error reading %s\n", MAX_SKB_FRAGS_PATH); - - fclose(file); - return max_skb_frags; -} - static struct option long_options[] = { {"interface", required_argument, 0, 'i'}, {"busy-poll", no_argument, 0, 'b'}, @@ -446,2256 +259,36 @@ static void parse_command_line(struct ifobject *ifobj_tx, struct ifobject *ifobj } } -static int set_ring_size(struct ifobject *ifobj) -{ - int ret; - u32 ctr = 0; - - while (ctr++ < SOCK_RECONF_CTR) { - ret = set_hw_ring_size(ifobj->ifname, &ifobj->ring); - if (!ret) - break; - - /* Retry if it fails */ - if (ctr >= SOCK_RECONF_CTR || errno != EBUSY) - return -errno; - - usleep(USLEEP_MAX); - } - - return ret; -} - -static int hw_ring_size_reset(struct ifobject *ifobj) -{ - ifobj->ring.tx_pending = ifobj->set_ring.default_tx; - ifobj->ring.rx_pending = ifobj->set_ring.default_rx; - return set_ring_size(ifobj); -} - -static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx, - struct ifobject *ifobj_rx) -{ - u32 i, j; - - for (i = 0; i < MAX_INTERFACES; i++) { - struct ifobject *ifobj = i ? 
ifobj_rx : ifobj_tx; - - ifobj->xsk = &ifobj->xsk_arr[0]; - ifobj->use_poll = false; - ifobj->use_fill_ring = true; - ifobj->release_rx = true; - ifobj->validation_func = NULL; - ifobj->use_metadata = false; - - if (i == 0) { - ifobj->rx_on = false; - ifobj->tx_on = true; - } else { - ifobj->rx_on = true; - ifobj->tx_on = false; - } - - memset(ifobj->umem, 0, sizeof(*ifobj->umem)); - ifobj->umem->num_frames = DEFAULT_UMEM_BUFFERS; - ifobj->umem->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE; - - for (j = 0; j < MAX_SOCKETS; j++) { - memset(&ifobj->xsk_arr[j], 0, sizeof(ifobj->xsk_arr[j])); - ifobj->xsk_arr[j].rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS; - ifobj->xsk_arr[j].batch_size = DEFAULT_BATCH_SIZE; - if (i == 0) - ifobj->xsk_arr[j].pkt_stream = test->tx_pkt_stream_default; - else - ifobj->xsk_arr[j].pkt_stream = test->rx_pkt_stream_default; - - memcpy(ifobj->xsk_arr[j].src_mac, g_mac, ETH_ALEN); - memcpy(ifobj->xsk_arr[j].dst_mac, g_mac, ETH_ALEN); - ifobj->xsk_arr[j].src_mac[5] += ((j * 2) + 0); - ifobj->xsk_arr[j].dst_mac[5] += ((j * 2) + 1); - } - } - - if (ifobj_tx->hw_ring_size_supp) - hw_ring_size_reset(ifobj_tx); - - test->ifobj_tx = ifobj_tx; - test->ifobj_rx = ifobj_rx; - test->current_step = 0; - test->total_steps = 1; - test->nb_sockets = 1; - test->fail = false; - test->set_ring = false; - test->adjust_tail = false; - test->adjust_tail_support = false; - test->mtu = MAX_ETH_PKT_SIZE; - test->xdp_prog_rx = ifobj_rx->xdp_progs->progs.xsk_def_prog; - test->xskmap_rx = ifobj_rx->xdp_progs->maps.xsk; - test->xdp_prog_tx = ifobj_tx->xdp_progs->progs.xsk_def_prog; - test->xskmap_tx = ifobj_tx->xdp_progs->maps.xsk; -} - -static void test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx, - struct ifobject *ifobj_rx, enum test_mode mode, - const struct test_spec *test_to_run) -{ - struct pkt_stream *tx_pkt_stream; - struct pkt_stream *rx_pkt_stream; - u32 i; - - tx_pkt_stream = test->tx_pkt_stream_default; - rx_pkt_stream = test->rx_pkt_stream_default; - memset(test, 0, sizeof(*test)); - test->tx_pkt_stream_default = tx_pkt_stream; - test->rx_pkt_stream_default = rx_pkt_stream; - - for (i = 0; i < MAX_INTERFACES; i++) { - struct ifobject *ifobj = i ? 
ifobj_rx : ifobj_tx; - - ifobj->bind_flags = XDP_USE_NEED_WAKEUP; - if (mode == TEST_MODE_ZC) - ifobj->bind_flags |= XDP_ZEROCOPY; - else - ifobj->bind_flags |= XDP_COPY; - } - - strncpy(test->name, test_to_run->name, MAX_TEST_NAME_SIZE); - test->test_func = test_to_run->test_func; - test->mode = mode; - __test_spec_init(test, ifobj_tx, ifobj_rx); -} - -static void test_spec_reset(struct test_spec *test) -{ - __test_spec_init(test, test->ifobj_tx, test->ifobj_rx); -} - -static void test_spec_set_xdp_prog(struct test_spec *test, struct bpf_program *xdp_prog_rx, - struct bpf_program *xdp_prog_tx, struct bpf_map *xskmap_rx, - struct bpf_map *xskmap_tx) -{ - test->xdp_prog_rx = xdp_prog_rx; - test->xdp_prog_tx = xdp_prog_tx; - test->xskmap_rx = xskmap_rx; - test->xskmap_tx = xskmap_tx; -} - -static int test_spec_set_mtu(struct test_spec *test, int mtu) -{ - int err; - - if (test->ifobj_rx->mtu != mtu) { - err = xsk_set_mtu(test->ifobj_rx->ifindex, mtu); - if (err) - return err; - test->ifobj_rx->mtu = mtu; - } - if (test->ifobj_tx->mtu != mtu) { - err = xsk_set_mtu(test->ifobj_tx->ifindex, mtu); - if (err) - return err; - test->ifobj_tx->mtu = mtu; - } - - return 0; -} - -static void pkt_stream_reset(struct pkt_stream *pkt_stream) -{ - if (pkt_stream) { - pkt_stream->current_pkt_nb = 0; - pkt_stream->nb_rx_pkts = 0; - } -} - -static struct pkt *pkt_stream_get_next_tx_pkt(struct pkt_stream *pkt_stream) -{ - if (pkt_stream->current_pkt_nb >= pkt_stream->nb_pkts) - return NULL; - - return &pkt_stream->pkts[pkt_stream->current_pkt_nb++]; -} - -static struct pkt *pkt_stream_get_next_rx_pkt(struct pkt_stream *pkt_stream, u32 *pkts_sent) -{ - while (pkt_stream->current_pkt_nb < pkt_stream->nb_pkts) { - (*pkts_sent)++; - if (pkt_stream->pkts[pkt_stream->current_pkt_nb].valid) - return &pkt_stream->pkts[pkt_stream->current_pkt_nb++]; - pkt_stream->current_pkt_nb++; - } - return NULL; -} - -static void pkt_stream_delete(struct pkt_stream *pkt_stream) -{ - free(pkt_stream->pkts); - free(pkt_stream); -} - -static void pkt_stream_restore_default(struct test_spec *test) -{ - struct pkt_stream *tx_pkt_stream = test->ifobj_tx->xsk->pkt_stream; - struct pkt_stream *rx_pkt_stream = test->ifobj_rx->xsk->pkt_stream; - - if (tx_pkt_stream != test->tx_pkt_stream_default) { - pkt_stream_delete(test->ifobj_tx->xsk->pkt_stream); - test->ifobj_tx->xsk->pkt_stream = test->tx_pkt_stream_default; - } - - if (rx_pkt_stream != test->rx_pkt_stream_default) { - pkt_stream_delete(test->ifobj_rx->xsk->pkt_stream); - test->ifobj_rx->xsk->pkt_stream = test->rx_pkt_stream_default; - } -} - -static struct pkt_stream *__pkt_stream_alloc(u32 nb_pkts) -{ - struct pkt_stream *pkt_stream; - - pkt_stream = calloc(1, sizeof(*pkt_stream)); - if (!pkt_stream) - return NULL; - - pkt_stream->pkts = calloc(nb_pkts, sizeof(*pkt_stream->pkts)); - if (!pkt_stream->pkts) { - free(pkt_stream); - return NULL; - } - - pkt_stream->nb_pkts = nb_pkts; - return pkt_stream; -} - -static bool pkt_continues(u32 options) -{ - return options & XDP_PKT_CONTD; -} - -static u32 ceil_u32(u32 a, u32 b) -{ - return (a + b - 1) / b; -} - -static u32 pkt_nb_frags(u32 frame_size, struct pkt_stream *pkt_stream, struct pkt *pkt) -{ - u32 nb_frags = 1, next_frag; - - if (!pkt) - return 1; - - if (!pkt_stream->verbatim) { - if (!pkt->valid || !pkt->len) - return 1; - return ceil_u32(pkt->len, frame_size); - } - - /* Search for the end of the packet in verbatim mode */ - if (!pkt_continues(pkt->options)) - return nb_frags; - - next_frag = pkt_stream->current_pkt_nb; - 
pkt++; - while (next_frag++ < pkt_stream->nb_pkts) { - nb_frags++; - if (!pkt_continues(pkt->options) || !pkt->valid) - break; - pkt++; - } - return nb_frags; -} - -static bool set_pkt_valid(int offset, u32 len) -{ - return len <= MAX_ETH_JUMBO_SIZE; -} - -static void pkt_set(struct pkt_stream *pkt_stream, struct pkt *pkt, int offset, u32 len) -{ - pkt->offset = offset; - pkt->len = len; - pkt->valid = set_pkt_valid(offset, len); -} - -static void pkt_stream_pkt_set(struct pkt_stream *pkt_stream, struct pkt *pkt, int offset, u32 len) -{ - bool prev_pkt_valid = pkt->valid; - - pkt_set(pkt_stream, pkt, offset, len); - pkt_stream->nb_valid_entries += pkt->valid - prev_pkt_valid; -} - -static u32 pkt_get_buffer_len(struct xsk_umem_info *umem, u32 len) -{ - return ceil_u32(len, umem->frame_size) * umem->frame_size; -} - -static struct pkt_stream *__pkt_stream_generate(u32 nb_pkts, u32 pkt_len, u32 nb_start, u32 nb_off) -{ - struct pkt_stream *pkt_stream; - u32 i; - - pkt_stream = __pkt_stream_alloc(nb_pkts); - if (!pkt_stream) - exit_with_error(ENOMEM); - - pkt_stream->nb_pkts = nb_pkts; - pkt_stream->max_pkt_len = pkt_len; - for (i = 0; i < nb_pkts; i++) { - struct pkt *pkt = &pkt_stream->pkts[i]; - - pkt_stream_pkt_set(pkt_stream, pkt, 0, pkt_len); - pkt->pkt_nb = nb_start + i * nb_off; - } - - return pkt_stream; -} - -static struct pkt_stream *pkt_stream_generate(u32 nb_pkts, u32 pkt_len) -{ - return __pkt_stream_generate(nb_pkts, pkt_len, 0, 1); -} - -static struct pkt_stream *pkt_stream_clone(struct pkt_stream *pkt_stream) -{ - return pkt_stream_generate(pkt_stream->nb_pkts, pkt_stream->pkts[0].len); -} - -static void pkt_stream_replace_ifobject(struct ifobject *ifobj, u32 nb_pkts, u32 pkt_len) -{ - ifobj->xsk->pkt_stream = pkt_stream_generate(nb_pkts, pkt_len); -} - -static void pkt_stream_replace(struct test_spec *test, u32 nb_pkts, u32 pkt_len) -{ - pkt_stream_replace_ifobject(test->ifobj_tx, nb_pkts, pkt_len); - pkt_stream_replace_ifobject(test->ifobj_rx, nb_pkts, pkt_len); -} - -static void __pkt_stream_replace_half(struct ifobject *ifobj, u32 pkt_len, - int offset) -{ - struct pkt_stream *pkt_stream; - u32 i; - - pkt_stream = pkt_stream_clone(ifobj->xsk->pkt_stream); - for (i = 1; i < ifobj->xsk->pkt_stream->nb_pkts; i += 2) - pkt_stream_pkt_set(pkt_stream, &pkt_stream->pkts[i], offset, pkt_len); - - ifobj->xsk->pkt_stream = pkt_stream; -} - -static void pkt_stream_replace_half(struct test_spec *test, u32 pkt_len, int offset) -{ - __pkt_stream_replace_half(test->ifobj_tx, pkt_len, offset); - __pkt_stream_replace_half(test->ifobj_rx, pkt_len, offset); -} - -static void pkt_stream_receive_half(struct test_spec *test) -{ - struct pkt_stream *pkt_stream = test->ifobj_tx->xsk->pkt_stream; - u32 i; - - test->ifobj_rx->xsk->pkt_stream = pkt_stream_generate(pkt_stream->nb_pkts, - pkt_stream->pkts[0].len); - pkt_stream = test->ifobj_rx->xsk->pkt_stream; - for (i = 1; i < pkt_stream->nb_pkts; i += 2) - pkt_stream->pkts[i].valid = false; - - pkt_stream->nb_valid_entries /= 2; -} - -static void pkt_stream_even_odd_sequence(struct test_spec *test) -{ - struct pkt_stream *pkt_stream; - u32 i; - - for (i = 0; i < test->nb_sockets; i++) { - pkt_stream = test->ifobj_tx->xsk_arr[i].pkt_stream; - pkt_stream = __pkt_stream_generate(pkt_stream->nb_pkts / 2, - pkt_stream->pkts[0].len, i, 2); - test->ifobj_tx->xsk_arr[i].pkt_stream = pkt_stream; - - pkt_stream = test->ifobj_rx->xsk_arr[i].pkt_stream; - pkt_stream = __pkt_stream_generate(pkt_stream->nb_pkts / 2, - pkt_stream->pkts[0].len, i, 2); - 
test->ifobj_rx->xsk_arr[i].pkt_stream = pkt_stream; - } -} - -static u64 pkt_get_addr(struct pkt *pkt, struct xsk_umem_info *umem) -{ - if (!pkt->valid) - return pkt->offset; - return pkt->offset + umem_alloc_buffer(umem); -} - -static void pkt_stream_cancel(struct pkt_stream *pkt_stream) -{ - pkt_stream->current_pkt_nb--; -} - -static void pkt_generate(struct xsk_socket_info *xsk, struct xsk_umem_info *umem, u64 addr, u32 len, - u32 pkt_nb, u32 bytes_written) -{ - void *data = xsk_umem__get_data(umem->buffer, addr); - - if (len < MIN_PKT_SIZE) - return; - - if (!bytes_written) { - gen_eth_hdr(xsk, data); - - len -= PKT_HDR_SIZE; - data += PKT_HDR_SIZE; - } else { - bytes_written -= PKT_HDR_SIZE; - } - - write_payload(data, pkt_nb, bytes_written, len); -} - -static struct pkt_stream *__pkt_stream_generate_custom(struct ifobject *ifobj, struct pkt *frames, - u32 nb_frames, bool verbatim) -{ - u32 i, len = 0, pkt_nb = 0, payload = 0; - struct pkt_stream *pkt_stream; - - pkt_stream = __pkt_stream_alloc(nb_frames); - if (!pkt_stream) - exit_with_error(ENOMEM); - - for (i = 0; i < nb_frames; i++) { - struct pkt *pkt = &pkt_stream->pkts[pkt_nb]; - struct pkt *frame = &frames[i]; - - pkt->offset = frame->offset; - if (verbatim) { - *pkt = *frame; - pkt->pkt_nb = payload; - if (!frame->valid || !pkt_continues(frame->options)) - payload++; - } else { - if (frame->valid) - len += frame->len; - if (frame->valid && pkt_continues(frame->options)) - continue; - - pkt->pkt_nb = pkt_nb; - pkt->len = len; - pkt->valid = frame->valid; - pkt->options = 0; - - len = 0; - } - - print_verbose("offset: %d len: %u valid: %u options: %u pkt_nb: %u\n", - pkt->offset, pkt->len, pkt->valid, pkt->options, pkt->pkt_nb); - - if (pkt->valid && pkt->len > pkt_stream->max_pkt_len) - pkt_stream->max_pkt_len = pkt->len; - - if (pkt->valid) - pkt_stream->nb_valid_entries++; - - pkt_nb++; - } - - pkt_stream->nb_pkts = pkt_nb; - pkt_stream->verbatim = verbatim; - return pkt_stream; -} - -static void pkt_stream_generate_custom(struct test_spec *test, struct pkt *pkts, u32 nb_pkts) -{ - struct pkt_stream *pkt_stream; - - pkt_stream = __pkt_stream_generate_custom(test->ifobj_tx, pkts, nb_pkts, true); - test->ifobj_tx->xsk->pkt_stream = pkt_stream; - - pkt_stream = __pkt_stream_generate_custom(test->ifobj_rx, pkts, nb_pkts, false); - test->ifobj_rx->xsk->pkt_stream = pkt_stream; -} - -static void pkt_print_data(u32 *data, u32 cnt) -{ - u32 i; - - for (i = 0; i < cnt; i++) { - u32 seqnum, pkt_nb; - - seqnum = ntohl(*data) & 0xffff; - pkt_nb = ntohl(*data) >> 16; - ksft_print_msg("%u:%u ", pkt_nb, seqnum); - data++; - } -} - -static void pkt_dump(void *pkt, u32 len, bool eth_header) -{ - struct ethhdr *ethhdr = pkt; - u32 i, *data; - - if (eth_header) { - /*extract L2 frame */ - ksft_print_msg("DEBUG>> L2: dst mac: "); - for (i = 0; i < ETH_ALEN; i++) - ksft_print_msg("%02X", ethhdr->h_dest[i]); - - ksft_print_msg("\nDEBUG>> L2: src mac: "); - for (i = 0; i < ETH_ALEN; i++) - ksft_print_msg("%02X", ethhdr->h_source[i]); - - data = pkt + PKT_HDR_SIZE; - } else { - data = pkt; - } - - /*extract L5 frame */ - ksft_print_msg("\nDEBUG>> L5: seqnum: "); - pkt_print_data(data, PKT_DUMP_NB_TO_PRINT); - ksft_print_msg("...."); - if (len > PKT_DUMP_NB_TO_PRINT * sizeof(u32)) { - ksft_print_msg("\n.... 
"); - pkt_print_data(data + len / sizeof(u32) - PKT_DUMP_NB_TO_PRINT, - PKT_DUMP_NB_TO_PRINT); - } - ksft_print_msg("\n---------------------------------------\n"); -} - -static bool is_offset_correct(struct xsk_umem_info *umem, struct pkt *pkt, u64 addr) -{ - u32 headroom = umem->unaligned_mode ? 0 : umem->frame_headroom; - u32 offset = addr % umem->frame_size, expected_offset; - int pkt_offset = pkt->valid ? pkt->offset : 0; - - if (!umem->unaligned_mode) - pkt_offset = 0; - - expected_offset = (pkt_offset + headroom + XDP_PACKET_HEADROOM) % umem->frame_size; - - if (offset == expected_offset) - return true; - - ksft_print_msg("[%s] expected [%u], got [%u]\n", __func__, expected_offset, offset); - return false; -} - -static bool is_metadata_correct(struct pkt *pkt, void *buffer, u64 addr) -{ - void *data = xsk_umem__get_data(buffer, addr); - struct xdp_info *meta = data - sizeof(struct xdp_info); - - if (meta->count != pkt->pkt_nb) { - ksft_print_msg("[%s] expected meta_count [%d], got meta_count [%llu]\n", - __func__, pkt->pkt_nb, - (unsigned long long)meta->count); - return false; - } - - return true; -} - -static bool is_adjust_tail_supported(struct xsk_xdp_progs *skel_rx) -{ - struct bpf_map *data_map; - int adjust_value = 0; - int key = 0; - int ret; - - data_map = bpf_object__find_map_by_name(skel_rx->obj, "xsk_xdp_.bss"); - if (!data_map || !bpf_map__is_internal(data_map)) { - ksft_print_msg("Error: could not find bss section of XDP program\n"); - exit_with_error(errno); - } - - ret = bpf_map_lookup_elem(bpf_map__fd(data_map), &key, &adjust_value); - if (ret) { - ksft_print_msg("Error: bpf_map_lookup_elem failed with error %d\n", ret); - exit_with_error(errno); - } - - /* Set the 'adjust_value' variable to -EOPNOTSUPP in the XDP program if the adjust_tail - * helper is not supported. Skip the adjust_tail test case in this scenario. 
- */ - return adjust_value != -EOPNOTSUPP; -} - -static bool is_frag_valid(struct xsk_umem_info *umem, u64 addr, u32 len, u32 expected_pkt_nb, - u32 bytes_processed) -{ - u32 seqnum, pkt_nb, *pkt_data, words_to_end, expected_seqnum; - void *data = xsk_umem__get_data(umem->buffer, addr); - - addr -= umem->base_addr; - - if (addr >= umem->num_frames * umem->frame_size || - addr + len > umem->num_frames * umem->frame_size) { - ksft_print_msg("Frag invalid addr: %llx len: %u\n", - (unsigned long long)addr, len); - return false; - } - if (!umem->unaligned_mode && addr % umem->frame_size + len > umem->frame_size) { - ksft_print_msg("Frag crosses frame boundary addr: %llx len: %u\n", - (unsigned long long)addr, len); - return false; - } - - pkt_data = data; - if (!bytes_processed) { - pkt_data += PKT_HDR_SIZE / sizeof(*pkt_data); - len -= PKT_HDR_SIZE; - } else { - bytes_processed -= PKT_HDR_SIZE; - } - - expected_seqnum = bytes_processed / sizeof(*pkt_data); - seqnum = ntohl(*pkt_data) & 0xffff; - pkt_nb = ntohl(*pkt_data) >> 16; - - if (expected_pkt_nb != pkt_nb) { - ksft_print_msg("[%s] expected pkt_nb [%u], got pkt_nb [%u]\n", - __func__, expected_pkt_nb, pkt_nb); - goto error; - } - if (expected_seqnum != seqnum) { - ksft_print_msg("[%s] expected seqnum at start [%u], got seqnum [%u]\n", - __func__, expected_seqnum, seqnum); - goto error; - } - - words_to_end = len / sizeof(*pkt_data) - 1; - pkt_data += words_to_end; - seqnum = ntohl(*pkt_data) & 0xffff; - expected_seqnum += words_to_end; - if (expected_seqnum != seqnum) { - ksft_print_msg("[%s] expected seqnum at end [%u], got seqnum [%u]\n", - __func__, expected_seqnum, seqnum); - goto error; - } - - return true; - -error: - pkt_dump(data, len, !bytes_processed); - return false; -} - -static bool is_pkt_valid(struct pkt *pkt, void *buffer, u64 addr, u32 len) -{ - if (pkt->len != len) { - ksft_print_msg("[%s] expected packet length [%d], got length [%d]\n", - __func__, pkt->len, len); - pkt_dump(xsk_umem__get_data(buffer, addr), len, true); - return false; - } - - return true; -} - -static u32 load_value(u32 *counter) -{ - return __atomic_load_n(counter, __ATOMIC_ACQUIRE); -} - -static bool kick_tx_with_check(struct xsk_socket_info *xsk, int *ret) -{ - u32 max_budget = MAX_TX_BUDGET_DEFAULT; - u32 cons, ready_to_send; - int delta; - - cons = load_value(xsk->tx.consumer); - ready_to_send = load_value(xsk->tx.producer) - cons; - *ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); - - delta = load_value(xsk->tx.consumer) - cons; - /* By default, xsk should consume exact @max_budget descs at one - * send in this case where hitting the max budget limit in while - * loop is triggered in __xsk_generic_xmit(). Please make sure that - * the number of descs to be sent is larger than @max_budget, or - * else the tx.consumer will be updated in xskq_cons_peek_desc() - * in time which hides the issue we try to verify. 
- */ - if (ready_to_send > max_budget && delta != max_budget) - return false; - - return true; -} - -static int kick_tx(struct xsk_socket_info *xsk) -{ - int ret; - - if (xsk->check_consumer) { - if (!kick_tx_with_check(xsk, &ret)) - return TEST_FAILURE; - } else { - ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); - } - if (ret >= 0) - return TEST_PASS; - if (errno == ENOBUFS || errno == EAGAIN || errno == EBUSY || errno == ENETDOWN) { - usleep(100); - return TEST_PASS; - } - return TEST_FAILURE; -} - -static int kick_rx(struct xsk_socket_info *xsk) -{ - int ret; - - ret = recvfrom(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, NULL); - if (ret < 0) - return TEST_FAILURE; - - return TEST_PASS; -} - -static int complete_pkts(struct xsk_socket_info *xsk, int batch_size) -{ - unsigned int rcvd; - u32 idx; - int ret; - - if (xsk_ring_prod__needs_wakeup(&xsk->tx)) { - ret = kick_tx(xsk); - if (ret) - return TEST_FAILURE; - } - - rcvd = xsk_ring_cons__peek(&xsk->umem->cq, batch_size, &idx); - if (rcvd) { - if (rcvd > xsk->outstanding_tx) { - u64 addr = *xsk_ring_cons__comp_addr(&xsk->umem->cq, idx + rcvd - 1); - - ksft_print_msg("[%s] Too many packets completed\n", __func__); - ksft_print_msg("Last completion address: %llx\n", - (unsigned long long)addr); - return TEST_FAILURE; - } - - xsk_ring_cons__release(&xsk->umem->cq, rcvd); - xsk->outstanding_tx -= rcvd; - } - - return TEST_PASS; -} - -static int __receive_pkts(struct test_spec *test, struct xsk_socket_info *xsk) -{ - u32 frags_processed = 0, nb_frags = 0, pkt_len = 0; - u32 idx_rx = 0, idx_fq = 0, rcvd, pkts_sent = 0; - struct pkt_stream *pkt_stream = xsk->pkt_stream; - struct ifobject *ifobj = test->ifobj_rx; - struct xsk_umem_info *umem = xsk->umem; - struct pollfd fds = { }; - struct pkt *pkt; - u64 first_addr = 0; - int ret; - - fds.fd = xsk_socket__fd(xsk->xsk); - fds.events = POLLIN; - - ret = kick_rx(xsk); - if (ret) - return TEST_FAILURE; - - if (ifobj->use_poll) { - ret = poll(&fds, 1, POLL_TMOUT); - if (ret < 0) - return TEST_FAILURE; - - if (!ret) { - if (!is_umem_valid(test->ifobj_tx)) - return TEST_PASS; - - ksft_print_msg("ERROR: [%s] Poll timed out\n", __func__); - return TEST_CONTINUE; - } - - if (!(fds.revents & POLLIN)) - return TEST_CONTINUE; - } - - rcvd = xsk_ring_cons__peek(&xsk->rx, xsk->batch_size, &idx_rx); - if (!rcvd) - return TEST_CONTINUE; - - if (ifobj->use_fill_ring) { - ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq); - while (ret != rcvd) { - if (xsk_ring_prod__needs_wakeup(&umem->fq)) { - ret = poll(&fds, 1, POLL_TMOUT); - if (ret < 0) - return TEST_FAILURE; - } - ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq); - } - } - - while (frags_processed < rcvd) { - const struct xdp_desc *desc = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++); - u64 addr = desc->addr, orig; - - orig = xsk_umem__extract_addr(addr); - addr = xsk_umem__add_offset_to_addr(addr); - - if (!nb_frags) { - pkt = pkt_stream_get_next_rx_pkt(pkt_stream, &pkts_sent); - if (!pkt) { - ksft_print_msg("[%s] received too many packets addr: %lx len %u\n", - __func__, addr, desc->len); - return TEST_FAILURE; - } - } - - print_verbose("Rx: addr: %lx len: %u options: %u pkt_nb: %u valid: %u\n", - addr, desc->len, desc->options, pkt->pkt_nb, pkt->valid); - - if (!is_frag_valid(umem, addr, desc->len, pkt->pkt_nb, pkt_len) || - !is_offset_correct(umem, pkt, addr) || (ifobj->use_metadata && - !is_metadata_correct(pkt, umem->buffer, addr))) - return TEST_FAILURE; - - if (!nb_frags++) - first_addr = addr; - 
frags_processed++; - pkt_len += desc->len; - if (ifobj->use_fill_ring) - *xsk_ring_prod__fill_addr(&umem->fq, idx_fq++) = orig; - - if (pkt_continues(desc->options)) - continue; - - /* The complete packet has been received */ - if (!is_pkt_valid(pkt, umem->buffer, first_addr, pkt_len) || - !is_offset_correct(umem, pkt, addr)) - return TEST_FAILURE; - - pkt_stream->nb_rx_pkts++; - nb_frags = 0; - pkt_len = 0; - } - - if (nb_frags) { - /* In the middle of a packet. Start over from beginning of packet. */ - idx_rx -= nb_frags; - xsk_ring_cons__cancel(&xsk->rx, nb_frags); - if (ifobj->use_fill_ring) { - idx_fq -= nb_frags; - xsk_ring_prod__cancel(&umem->fq, nb_frags); - } - frags_processed -= nb_frags; - } - - if (ifobj->use_fill_ring) - xsk_ring_prod__submit(&umem->fq, frags_processed); - if (ifobj->release_rx) - xsk_ring_cons__release(&xsk->rx, frags_processed); - - pthread_mutex_lock(&pacing_mutex); - pkts_in_flight -= pkts_sent; - pthread_mutex_unlock(&pacing_mutex); - pkts_sent = 0; - -return TEST_CONTINUE; -} - -bool all_packets_received(struct test_spec *test, struct xsk_socket_info *xsk, u32 sock_num, - unsigned long *bitmap) -{ - struct pkt_stream *pkt_stream = xsk->pkt_stream; - - if (!pkt_stream) { - __set_bit(sock_num, bitmap); - return false; - } - - if (pkt_stream->nb_rx_pkts == pkt_stream->nb_valid_entries) { - __set_bit(sock_num, bitmap); - if (bitmap_full(bitmap, test->nb_sockets)) - return true; - } - - return false; -} - -static int receive_pkts(struct test_spec *test) -{ - struct timeval tv_end, tv_now, tv_timeout = {THREAD_TMOUT, 0}; - DECLARE_BITMAP(bitmap, test->nb_sockets); - struct xsk_socket_info *xsk; - u32 sock_num = 0; - int res, ret; - - ret = gettimeofday(&tv_now, NULL); - if (ret) - exit_with_error(errno); - - timeradd(&tv_now, &tv_timeout, &tv_end); - - while (1) { - xsk = &test->ifobj_rx->xsk_arr[sock_num]; - - if ((all_packets_received(test, xsk, sock_num, bitmap))) - break; - - res = __receive_pkts(test, xsk); - if (!(res == TEST_PASS || res == TEST_CONTINUE)) - return res; - - ret = gettimeofday(&tv_now, NULL); - if (ret) - exit_with_error(errno); - - if (timercmp(&tv_now, &tv_end, >)) { - ksft_print_msg("ERROR: [%s] Receive loop timed out\n", __func__); - return TEST_FAILURE; - } - sock_num = (sock_num + 1) % test->nb_sockets; - } - - return TEST_PASS; -} - -static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, bool timeout) -{ - u32 i, idx = 0, valid_pkts = 0, valid_frags = 0, buffer_len; - struct pkt_stream *pkt_stream = xsk->pkt_stream; - struct xsk_umem_info *umem = ifobject->umem; - bool use_poll = ifobject->use_poll; - struct pollfd fds = { }; - int ret; - - buffer_len = pkt_get_buffer_len(umem, pkt_stream->max_pkt_len); - /* pkts_in_flight might be negative if many invalid packets are sent */ - if (pkts_in_flight >= (int)((umem_size(umem) - xsk->batch_size * buffer_len) / - buffer_len)) { - ret = kick_tx(xsk); - if (ret) - return TEST_FAILURE; - return TEST_CONTINUE; - } - - fds.fd = xsk_socket__fd(xsk->xsk); - fds.events = POLLOUT; - - while (xsk_ring_prod__reserve(&xsk->tx, xsk->batch_size, &idx) < xsk->batch_size) { - if (use_poll) { - ret = poll(&fds, 1, POLL_TMOUT); - if (timeout) { - if (ret < 0) { - ksft_print_msg("ERROR: [%s] Poll error %d\n", - __func__, errno); - return TEST_FAILURE; - } - if (ret == 0) - return TEST_PASS; - break; - } - if (ret <= 0) { - ksft_print_msg("ERROR: [%s] Poll error %d\n", - __func__, errno); - return TEST_FAILURE; - } - } - - complete_pkts(xsk, xsk->batch_size); - } - - for (i = 0; i < 
xsk->batch_size; i++) { - struct pkt *pkt = pkt_stream_get_next_tx_pkt(pkt_stream); - u32 nb_frags_left, nb_frags, bytes_written = 0; - - if (!pkt) - break; - - nb_frags = pkt_nb_frags(umem->frame_size, pkt_stream, pkt); - if (nb_frags > xsk->batch_size - i) { - pkt_stream_cancel(pkt_stream); - xsk_ring_prod__cancel(&xsk->tx, xsk->batch_size - i); - break; - } - nb_frags_left = nb_frags; - - while (nb_frags_left--) { - struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx + i); - - tx_desc->addr = pkt_get_addr(pkt, ifobject->umem); - if (pkt_stream->verbatim) { - tx_desc->len = pkt->len; - tx_desc->options = pkt->options; - } else if (nb_frags_left) { - tx_desc->len = umem->frame_size; - tx_desc->options = XDP_PKT_CONTD; - } else { - tx_desc->len = pkt->len - bytes_written; - tx_desc->options = 0; - } - if (pkt->valid) - pkt_generate(xsk, umem, tx_desc->addr, tx_desc->len, pkt->pkt_nb, - bytes_written); - bytes_written += tx_desc->len; - - print_verbose("Tx addr: %llx len: %u options: %u pkt_nb: %u\n", - tx_desc->addr, tx_desc->len, tx_desc->options, pkt->pkt_nb); - - if (nb_frags_left) { - i++; - if (pkt_stream->verbatim) - pkt = pkt_stream_get_next_tx_pkt(pkt_stream); - } - } - - if (pkt && pkt->valid) { - valid_pkts++; - valid_frags += nb_frags; - } - } - - pthread_mutex_lock(&pacing_mutex); - pkts_in_flight += valid_pkts; - pthread_mutex_unlock(&pacing_mutex); - - xsk_ring_prod__submit(&xsk->tx, i); - xsk->outstanding_tx += valid_frags; - - if (use_poll) { - ret = poll(&fds, 1, POLL_TMOUT); - if (ret <= 0) { - if (ret == 0 && timeout) - return TEST_PASS; - - ksft_print_msg("ERROR: [%s] Poll error %d\n", __func__, ret); - return TEST_FAILURE; - } - } - - if (!timeout) { - if (complete_pkts(xsk, i)) - return TEST_FAILURE; - - usleep(10); - return TEST_PASS; - } - - return TEST_CONTINUE; -} - -static int wait_for_tx_completion(struct xsk_socket_info *xsk) -{ - struct timeval tv_end, tv_now, tv_timeout = {THREAD_TMOUT, 0}; - int ret; - - ret = gettimeofday(&tv_now, NULL); - if (ret) - exit_with_error(errno); - timeradd(&tv_now, &tv_timeout, &tv_end); - - while (xsk->outstanding_tx) { - ret = gettimeofday(&tv_now, NULL); - if (ret) - exit_with_error(errno); - if (timercmp(&tv_now, &tv_end, >)) { - ksft_print_msg("ERROR: [%s] Transmission loop timed out\n", __func__); - return TEST_FAILURE; - } - - complete_pkts(xsk, xsk->batch_size); - } - - return TEST_PASS; -} - -bool all_packets_sent(struct test_spec *test, unsigned long *bitmap) -{ - return bitmap_full(bitmap, test->nb_sockets); -} - -static int send_pkts(struct test_spec *test, struct ifobject *ifobject) -{ - bool timeout = !is_umem_valid(test->ifobj_rx); - DECLARE_BITMAP(bitmap, test->nb_sockets); - u32 i, ret; - - while (!(all_packets_sent(test, bitmap))) { - for (i = 0; i < test->nb_sockets; i++) { - struct pkt_stream *pkt_stream; - - pkt_stream = ifobject->xsk_arr[i].pkt_stream; - if (!pkt_stream || pkt_stream->current_pkt_nb >= pkt_stream->nb_pkts) { - __set_bit(i, bitmap); - continue; - } - ret = __send_pkts(ifobject, &ifobject->xsk_arr[i], timeout); - if (ret == TEST_CONTINUE && !test->fail) - continue; - - if ((ret || test->fail) && !timeout) - return TEST_FAILURE; - - if (ret == TEST_PASS && timeout) - return ret; - - ret = wait_for_tx_completion(&ifobject->xsk_arr[i]); - if (ret) - return TEST_FAILURE; - } - } - - return TEST_PASS; -} - -static int get_xsk_stats(struct xsk_socket *xsk, struct xdp_statistics *stats) -{ - int fd = xsk_socket__fd(xsk), err; - socklen_t optlen, expected_len; - - optlen = sizeof(*stats); 
- err = getsockopt(fd, SOL_XDP, XDP_STATISTICS, stats, &optlen); - if (err) { - ksft_print_msg("[%s] getsockopt(XDP_STATISTICS) error %u %s\n", - __func__, -err, strerror(-err)); - return TEST_FAILURE; - } - - expected_len = sizeof(struct xdp_statistics); - if (optlen != expected_len) { - ksft_print_msg("[%s] getsockopt optlen error. Expected: %u got: %u\n", - __func__, expected_len, optlen); - return TEST_FAILURE; - } - - return TEST_PASS; -} - -static int validate_rx_dropped(struct ifobject *ifobject) -{ - struct xsk_socket *xsk = ifobject->xsk->xsk; - struct xdp_statistics stats; - int err; - - err = kick_rx(ifobject->xsk); - if (err) - return TEST_FAILURE; - - err = get_xsk_stats(xsk, &stats); - if (err) - return TEST_FAILURE; - - /* The receiver calls getsockopt after receiving the last (valid) - * packet which is not the final packet sent in this test (valid and - * invalid packets are sent in alternating fashion with the final - * packet being invalid). Since the last packet may or may not have - * been dropped already, both outcomes must be allowed. - */ - if (stats.rx_dropped == ifobject->xsk->pkt_stream->nb_pkts / 2 || - stats.rx_dropped == ifobject->xsk->pkt_stream->nb_pkts / 2 - 1) - return TEST_PASS; - - return TEST_FAILURE; -} - -static int validate_rx_full(struct ifobject *ifobject) -{ - struct xsk_socket *xsk = ifobject->xsk->xsk; - struct xdp_statistics stats; - int err; - - usleep(1000); - err = kick_rx(ifobject->xsk); - if (err) - return TEST_FAILURE; - - err = get_xsk_stats(xsk, &stats); - if (err) - return TEST_FAILURE; - - if (stats.rx_ring_full) - return TEST_PASS; - - return TEST_FAILURE; -} - -static int validate_fill_empty(struct ifobject *ifobject) -{ - struct xsk_socket *xsk = ifobject->xsk->xsk; - struct xdp_statistics stats; - int err; - - usleep(1000); - err = kick_rx(ifobject->xsk); - if (err) - return TEST_FAILURE; - - err = get_xsk_stats(xsk, &stats); - if (err) - return TEST_FAILURE; - - if (stats.rx_fill_ring_empty_descs) - return TEST_PASS; - - return TEST_FAILURE; -} - -static int validate_tx_invalid_descs(struct ifobject *ifobject) -{ - struct xsk_socket *xsk = ifobject->xsk->xsk; - int fd = xsk_socket__fd(xsk); - struct xdp_statistics stats; - socklen_t optlen; - int err; - - optlen = sizeof(stats); - err = getsockopt(fd, SOL_XDP, XDP_STATISTICS, &stats, &optlen); - if (err) { - ksft_print_msg("[%s] getsockopt(XDP_STATISTICS) error %u %s\n", - __func__, -err, strerror(-err)); - return TEST_FAILURE; - } - - if (stats.tx_invalid_descs != ifobject->xsk->pkt_stream->nb_pkts / 2) { - ksft_print_msg("[%s] tx_invalid_descs incorrect. Got [%llu] expected [%u]\n", - __func__, - (unsigned long long)stats.tx_invalid_descs, - ifobject->xsk->pkt_stream->nb_pkts); - return TEST_FAILURE; - } - - return TEST_PASS; -} - -static void xsk_configure_socket(struct test_spec *test, struct ifobject *ifobject, - struct xsk_umem_info *umem, bool tx) -{ - int i, ret; - - for (i = 0; i < test->nb_sockets; i++) { - bool shared = (ifobject->shared_umem && tx) ? 
true : !!i; - u32 ctr = 0; - - while (ctr++ < SOCK_RECONF_CTR) { - ret = __xsk_configure_socket(&ifobject->xsk_arr[i], umem, - ifobject, shared); - if (!ret) - break; - - /* Retry if it fails as xsk_socket__create() is asynchronous */ - if (ctr >= SOCK_RECONF_CTR) - exit_with_error(-ret); - usleep(USLEEP_MAX); - } - if (ifobject->busy_poll) - enable_busy_poll(&ifobject->xsk_arr[i]); - } -} - -static void thread_common_ops_tx(struct test_spec *test, struct ifobject *ifobject) -{ - xsk_configure_socket(test, ifobject, test->ifobj_rx->umem, true); - ifobject->xsk = &ifobject->xsk_arr[0]; - ifobject->xskmap = test->ifobj_rx->xskmap; - memcpy(ifobject->umem, test->ifobj_rx->umem, sizeof(struct xsk_umem_info)); - ifobject->umem->base_addr = 0; -} - -static void xsk_populate_fill_ring(struct xsk_umem_info *umem, struct pkt_stream *pkt_stream, - bool fill_up) -{ - u32 rx_frame_size = umem->frame_size - XDP_PACKET_HEADROOM; - u32 idx = 0, filled = 0, buffers_to_fill, nb_pkts; - int ret; - - if (umem->num_frames < XSK_RING_PROD__DEFAULT_NUM_DESCS) - buffers_to_fill = umem->num_frames; - else - buffers_to_fill = umem->fill_size; - - ret = xsk_ring_prod__reserve(&umem->fq, buffers_to_fill, &idx); - if (ret != buffers_to_fill) - exit_with_error(ENOSPC); - - while (filled < buffers_to_fill) { - struct pkt *pkt = pkt_stream_get_next_rx_pkt(pkt_stream, &nb_pkts); - u64 addr; - u32 i; - - for (i = 0; i < pkt_nb_frags(rx_frame_size, pkt_stream, pkt); i++) { - if (!pkt) { - if (!fill_up) - break; - addr = filled * umem->frame_size + umem->base_addr; - } else if (pkt->offset >= 0) { - addr = pkt->offset % umem->frame_size + umem_alloc_buffer(umem); - } else { - addr = pkt->offset + umem_alloc_buffer(umem); - } - - *xsk_ring_prod__fill_addr(&umem->fq, idx++) = addr; - if (++filled >= buffers_to_fill) - break; - } - } - xsk_ring_prod__submit(&umem->fq, filled); - xsk_ring_prod__cancel(&umem->fq, buffers_to_fill - filled); - - pkt_stream_reset(pkt_stream); - umem_reset_alloc(umem); -} - -static void thread_common_ops(struct test_spec *test, struct ifobject *ifobject) -{ - u64 umem_sz = ifobject->umem->num_frames * ifobject->umem->frame_size; - int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE; - LIBBPF_OPTS(bpf_xdp_query_opts, opts); - void *bufs; - int ret; - u32 i; - - if (ifobject->umem->unaligned_mode) - mmap_flags |= MAP_HUGETLB | MAP_HUGE_2MB; - - if (ifobject->shared_umem) - umem_sz *= 2; - - bufs = mmap(NULL, umem_sz, PROT_READ | PROT_WRITE, mmap_flags, -1, 0); - if (bufs == MAP_FAILED) - exit_with_error(errno); - - ret = xsk_configure_umem(ifobject, ifobject->umem, bufs, umem_sz); - if (ret) - exit_with_error(-ret); - - xsk_configure_socket(test, ifobject, ifobject->umem, false); - - ifobject->xsk = &ifobject->xsk_arr[0]; - - if (!ifobject->rx_on) - return; - - xsk_populate_fill_ring(ifobject->umem, ifobject->xsk->pkt_stream, ifobject->use_fill_ring); - - for (i = 0; i < test->nb_sockets; i++) { - ifobject->xsk = &ifobject->xsk_arr[i]; - ret = xsk_update_xskmap(ifobject->xskmap, ifobject->xsk->xsk, i); - if (ret) - exit_with_error(errno); - } -} - -static void *worker_testapp_validate_tx(void *arg) -{ - struct test_spec *test = (struct test_spec *)arg; - struct ifobject *ifobject = test->ifobj_tx; - int err; - - if (test->current_step == 1) { - if (!ifobject->shared_umem) - thread_common_ops(test, ifobject); - else - thread_common_ops_tx(test, ifobject); - } - - err = send_pkts(test, ifobject); - - if (!err && ifobject->validation_func) - err = ifobject->validation_func(ifobject); - if (err) 
- report_failure(test); - - pthread_exit(NULL); -} - -static void *worker_testapp_validate_rx(void *arg) -{ - struct test_spec *test = (struct test_spec *)arg; - struct ifobject *ifobject = test->ifobj_rx; - int err; - - if (test->current_step == 1) { - thread_common_ops(test, ifobject); - } else { - xsk_clear_xskmap(ifobject->xskmap); - err = xsk_update_xskmap(ifobject->xskmap, ifobject->xsk->xsk, 0); - if (err) { - ksft_print_msg("Error: Failed to update xskmap, error %s\n", - strerror(-err)); - exit_with_error(-err); - } - } - - pthread_barrier_wait(&barr); - - err = receive_pkts(test); - - if (!err && ifobject->validation_func) - err = ifobject->validation_func(ifobject); - - if (err) { - if (test->adjust_tail && !is_adjust_tail_supported(ifobject->xdp_progs)) - test->adjust_tail_support = false; - else - report_failure(test); - } - - pthread_exit(NULL); -} - -static u64 ceil_u64(u64 a, u64 b) -{ - return (a + b - 1) / b; -} - -static void testapp_clean_xsk_umem(struct ifobject *ifobj) -{ - u64 umem_sz = ifobj->umem->num_frames * ifobj->umem->frame_size; - - if (ifobj->shared_umem) - umem_sz *= 2; - - umem_sz = ceil_u64(umem_sz, HUGEPAGE_SIZE) * HUGEPAGE_SIZE; - xsk_umem__delete(ifobj->umem->umem); - munmap(ifobj->umem->buffer, umem_sz); -} - -static void handler(int signum) -{ - pthread_exit(NULL); -} - -static bool xdp_prog_changed_rx(struct test_spec *test) -{ - struct ifobject *ifobj = test->ifobj_rx; - - return ifobj->xdp_prog != test->xdp_prog_rx || ifobj->mode != test->mode; -} - -static bool xdp_prog_changed_tx(struct test_spec *test) -{ - struct ifobject *ifobj = test->ifobj_tx; - - return ifobj->xdp_prog != test->xdp_prog_tx || ifobj->mode != test->mode; -} - -static void xsk_reattach_xdp(struct ifobject *ifobj, struct bpf_program *xdp_prog, - struct bpf_map *xskmap, enum test_mode mode) -{ - int err; - - xsk_detach_xdp_program(ifobj->ifindex, mode_to_xdp_flags(ifobj->mode)); - err = xsk_attach_xdp_program(xdp_prog, ifobj->ifindex, mode_to_xdp_flags(mode)); - if (err) { - ksft_print_msg("Error attaching XDP program\n"); - exit_with_error(-err); - } - - if (ifobj->mode != mode && (mode == TEST_MODE_DRV || mode == TEST_MODE_ZC)) - if (!xsk_is_in_mode(ifobj->ifindex, XDP_FLAGS_DRV_MODE)) { - ksft_print_msg("ERROR: XDP prog not in DRV mode\n"); - exit_with_error(EINVAL); - } - - ifobj->xdp_prog = xdp_prog; - ifobj->xskmap = xskmap; - ifobj->mode = mode; -} - -static void xsk_attach_xdp_progs(struct test_spec *test, struct ifobject *ifobj_rx, - struct ifobject *ifobj_tx) -{ - if (xdp_prog_changed_rx(test)) - xsk_reattach_xdp(ifobj_rx, test->xdp_prog_rx, test->xskmap_rx, test->mode); - - if (!ifobj_tx || ifobj_tx->shared_umem) - return; - - if (xdp_prog_changed_tx(test)) - xsk_reattach_xdp(ifobj_tx, test->xdp_prog_tx, test->xskmap_tx, test->mode); -} - -static int __testapp_validate_traffic(struct test_spec *test, struct ifobject *ifobj1, - struct ifobject *ifobj2) -{ - pthread_t t0, t1; - int err; - - if (test->mtu > MAX_ETH_PKT_SIZE) { - if (test->mode == TEST_MODE_ZC && (!ifobj1->multi_buff_zc_supp || - (ifobj2 && !ifobj2->multi_buff_zc_supp))) { - ksft_test_result_skip("Multi buffer for zero-copy not supported.\n"); - return TEST_SKIP; - } - if (test->mode != TEST_MODE_ZC && (!ifobj1->multi_buff_supp || - (ifobj2 && !ifobj2->multi_buff_supp))) { - ksft_test_result_skip("Multi buffer not supported.\n"); - return TEST_SKIP; - } - } - err = test_spec_set_mtu(test, test->mtu); - if (err) { - ksft_print_msg("Error, could not set mtu.\n"); - exit_with_error(err); - } - - if (ifobj2) 
{ - if (pthread_barrier_init(&barr, NULL, 2)) - exit_with_error(errno); - pkt_stream_reset(ifobj2->xsk->pkt_stream); - } - - test->current_step++; - pkt_stream_reset(ifobj1->xsk->pkt_stream); - pkts_in_flight = 0; - - signal(SIGUSR1, handler); - /*Spawn RX thread */ - pthread_create(&t0, NULL, ifobj1->func_ptr, test); - - if (ifobj2) { - pthread_barrier_wait(&barr); - if (pthread_barrier_destroy(&barr)) - exit_with_error(errno); - - /*Spawn TX thread */ - pthread_create(&t1, NULL, ifobj2->func_ptr, test); - - pthread_join(t1, NULL); - } - - if (!ifobj2) - pthread_kill(t0, SIGUSR1); - else - pthread_join(t0, NULL); - - if (test->total_steps == test->current_step || test->fail) { - u32 i; - - if (ifobj2) - for (i = 0; i < test->nb_sockets; i++) - xsk_socket__delete(ifobj2->xsk_arr[i].xsk); - - for (i = 0; i < test->nb_sockets; i++) - xsk_socket__delete(ifobj1->xsk_arr[i].xsk); - - testapp_clean_xsk_umem(ifobj1); - if (ifobj2 && !ifobj2->shared_umem) - testapp_clean_xsk_umem(ifobj2); - } - - return !!test->fail; -} - -static int testapp_validate_traffic(struct test_spec *test) -{ - struct ifobject *ifobj_rx = test->ifobj_rx; - struct ifobject *ifobj_tx = test->ifobj_tx; - - if ((ifobj_rx->umem->unaligned_mode && !ifobj_rx->unaligned_supp) || - (ifobj_tx->umem->unaligned_mode && !ifobj_tx->unaligned_supp)) { - ksft_test_result_skip("No huge pages present.\n"); - return TEST_SKIP; - } - - if (test->set_ring) { - if (ifobj_tx->hw_ring_size_supp) { - if (set_ring_size(ifobj_tx)) { - ksft_test_result_skip("Failed to change HW ring size.\n"); - return TEST_FAILURE; - } - } else { - ksft_test_result_skip("Changing HW ring size not supported.\n"); - return TEST_SKIP; - } - } - - xsk_attach_xdp_progs(test, ifobj_rx, ifobj_tx); - return __testapp_validate_traffic(test, ifobj_rx, ifobj_tx); -} - -static int testapp_validate_traffic_single_thread(struct test_spec *test, struct ifobject *ifobj) -{ - return __testapp_validate_traffic(test, ifobj, NULL); -} - -static int testapp_teardown(struct test_spec *test) -{ - int i; - - for (i = 0; i < MAX_TEARDOWN_ITER; i++) { - if (testapp_validate_traffic(test)) - return TEST_FAILURE; - test_spec_reset(test); - } - - return TEST_PASS; -} - -static void swap_directions(struct ifobject **ifobj1, struct ifobject **ifobj2) -{ - thread_func_t tmp_func_ptr = (*ifobj1)->func_ptr; - struct ifobject *tmp_ifobj = (*ifobj1); - - (*ifobj1)->func_ptr = (*ifobj2)->func_ptr; - (*ifobj2)->func_ptr = tmp_func_ptr; - - *ifobj1 = *ifobj2; - *ifobj2 = tmp_ifobj; -} - -static int testapp_bidirectional(struct test_spec *test) -{ - int res; - - test->ifobj_tx->rx_on = true; - test->ifobj_rx->tx_on = true; - test->total_steps = 2; - if (testapp_validate_traffic(test)) - return TEST_FAILURE; - - print_verbose("Switching Tx/Rx direction\n"); - swap_directions(&test->ifobj_rx, &test->ifobj_tx); - res = __testapp_validate_traffic(test, test->ifobj_rx, test->ifobj_tx); - - swap_directions(&test->ifobj_rx, &test->ifobj_tx); - return res; -} - -static int swap_xsk_resources(struct test_spec *test) -{ - int ret; - - test->ifobj_tx->xsk_arr[0].pkt_stream = NULL; - test->ifobj_rx->xsk_arr[0].pkt_stream = NULL; - test->ifobj_tx->xsk_arr[1].pkt_stream = test->tx_pkt_stream_default; - test->ifobj_rx->xsk_arr[1].pkt_stream = test->rx_pkt_stream_default; - test->ifobj_tx->xsk = &test->ifobj_tx->xsk_arr[1]; - test->ifobj_rx->xsk = &test->ifobj_rx->xsk_arr[1]; - - ret = xsk_update_xskmap(test->ifobj_rx->xskmap, test->ifobj_rx->xsk->xsk, 0); - if (ret) - return TEST_FAILURE; - - return TEST_PASS; -} - 
-static int testapp_xdp_prog_cleanup(struct test_spec *test) -{ - test->total_steps = 2; - test->nb_sockets = 2; - if (testapp_validate_traffic(test)) - return TEST_FAILURE; - - if (swap_xsk_resources(test)) - return TEST_FAILURE; - return testapp_validate_traffic(test); -} - -static int testapp_headroom(struct test_spec *test) -{ - test->ifobj_rx->umem->frame_headroom = UMEM_HEADROOM_TEST_SIZE; - return testapp_validate_traffic(test); -} - -static int testapp_stats_rx_dropped(struct test_spec *test) -{ - if (test->mode == TEST_MODE_ZC) { - ksft_test_result_skip("Can not run RX_DROPPED test for ZC mode\n"); - return TEST_SKIP; - } - - pkt_stream_replace_half(test, MIN_PKT_SIZE * 4, 0); - test->ifobj_rx->umem->frame_headroom = test->ifobj_rx->umem->frame_size - - XDP_PACKET_HEADROOM - MIN_PKT_SIZE * 3; - pkt_stream_receive_half(test); - test->ifobj_rx->validation_func = validate_rx_dropped; - return testapp_validate_traffic(test); -} - -static int testapp_stats_tx_invalid_descs(struct test_spec *test) -{ - pkt_stream_replace_half(test, XSK_UMEM__INVALID_FRAME_SIZE, 0); - test->ifobj_tx->validation_func = validate_tx_invalid_descs; - return testapp_validate_traffic(test); -} - -static int testapp_stats_rx_full(struct test_spec *test) -{ - pkt_stream_replace(test, DEFAULT_UMEM_BUFFERS + DEFAULT_UMEM_BUFFERS / 2, MIN_PKT_SIZE); - test->ifobj_rx->xsk->pkt_stream = pkt_stream_generate(DEFAULT_UMEM_BUFFERS, MIN_PKT_SIZE); - - test->ifobj_rx->xsk->rxqsize = DEFAULT_UMEM_BUFFERS; - test->ifobj_rx->release_rx = false; - test->ifobj_rx->validation_func = validate_rx_full; - return testapp_validate_traffic(test); -} - -static int testapp_stats_fill_empty(struct test_spec *test) -{ - pkt_stream_replace(test, DEFAULT_UMEM_BUFFERS + DEFAULT_UMEM_BUFFERS / 2, MIN_PKT_SIZE); - test->ifobj_rx->xsk->pkt_stream = pkt_stream_generate(DEFAULT_UMEM_BUFFERS, MIN_PKT_SIZE); - - test->ifobj_rx->use_fill_ring = false; - test->ifobj_rx->validation_func = validate_fill_empty; - return testapp_validate_traffic(test); -} - -static int testapp_send_receive_unaligned(struct test_spec *test) -{ - test->ifobj_tx->umem->unaligned_mode = true; - test->ifobj_rx->umem->unaligned_mode = true; - /* Let half of the packets straddle a 4K buffer boundary */ - pkt_stream_replace_half(test, MIN_PKT_SIZE, -MIN_PKT_SIZE / 2); - - return testapp_validate_traffic(test); -} - -static int testapp_send_receive_unaligned_mb(struct test_spec *test) -{ - test->mtu = MAX_ETH_JUMBO_SIZE; - test->ifobj_tx->umem->unaligned_mode = true; - test->ifobj_rx->umem->unaligned_mode = true; - pkt_stream_replace(test, DEFAULT_PKT_CNT, MAX_ETH_JUMBO_SIZE); - return testapp_validate_traffic(test); -} - -static int testapp_single_pkt(struct test_spec *test) -{ - struct pkt pkts[] = {{0, MIN_PKT_SIZE, 0, true}}; - - pkt_stream_generate_custom(test, pkts, ARRAY_SIZE(pkts)); - return testapp_validate_traffic(test); -} - -static int testapp_send_receive_mb(struct test_spec *test) -{ - test->mtu = MAX_ETH_JUMBO_SIZE; - pkt_stream_replace(test, DEFAULT_PKT_CNT, MAX_ETH_JUMBO_SIZE); - - return testapp_validate_traffic(test); -} - -static int testapp_invalid_desc_mb(struct test_spec *test) -{ - struct xsk_umem_info *umem = test->ifobj_tx->umem; - u64 umem_size = umem->num_frames * umem->frame_size; - struct pkt pkts[] = { - /* Valid packet for synch to start with */ - {0, MIN_PKT_SIZE, 0, true, 0}, - /* Zero frame len is not legal */ - {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, - {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, - {0, 0, 0, 
false, 0}, - /* Invalid address in the second frame */ - {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, - {umem_size, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, - /* Invalid len in the middle */ - {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, - {0, XSK_UMEM__INVALID_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, - /* Invalid options in the middle */ - {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, - {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XSK_DESC__INVALID_OPTION}, - /* Transmit 2 frags, receive 3 */ - {0, XSK_UMEM__MAX_FRAME_SIZE, 0, true, XDP_PKT_CONTD}, - {0, XSK_UMEM__MAX_FRAME_SIZE, 0, true, 0}, - /* Middle frame crosses chunk boundary with small length */ - {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, - {-MIN_PKT_SIZE / 2, MIN_PKT_SIZE, 0, false, 0}, - /* Valid packet for synch so that something is received */ - {0, MIN_PKT_SIZE, 0, true, 0}}; - - if (umem->unaligned_mode) { - /* Crossing a chunk boundary allowed */ - pkts[12].valid = true; - pkts[13].valid = true; - } - - test->mtu = MAX_ETH_JUMBO_SIZE; - pkt_stream_generate_custom(test, pkts, ARRAY_SIZE(pkts)); - return testapp_validate_traffic(test); -} - -static int testapp_invalid_desc(struct test_spec *test) -{ - struct xsk_umem_info *umem = test->ifobj_tx->umem; - u64 umem_size = umem->num_frames * umem->frame_size; - struct pkt pkts[] = { - /* Zero packet address allowed */ - {0, MIN_PKT_SIZE, 0, true}, - /* Allowed packet */ - {0, MIN_PKT_SIZE, 0, true}, - /* Straddling the start of umem */ - {-2, MIN_PKT_SIZE, 0, false}, - /* Packet too large */ - {0, XSK_UMEM__INVALID_FRAME_SIZE, 0, false}, - /* Up to end of umem allowed */ - {umem_size - MIN_PKT_SIZE - 2 * umem->frame_size, MIN_PKT_SIZE, 0, true}, - /* After umem ends */ - {umem_size, MIN_PKT_SIZE, 0, false}, - /* Straddle the end of umem */ - {umem_size - MIN_PKT_SIZE / 2, MIN_PKT_SIZE, 0, false}, - /* Straddle a 4K boundary */ - {0x1000 - MIN_PKT_SIZE / 2, MIN_PKT_SIZE, 0, false}, - /* Straddle a 2K boundary */ - {0x800 - MIN_PKT_SIZE / 2, MIN_PKT_SIZE, 0, true}, - /* Valid packet for synch so that something is received */ - {0, MIN_PKT_SIZE, 0, true}}; - - if (umem->unaligned_mode) { - /* Crossing a page boundary allowed */ - pkts[7].valid = true; - } - if (umem->frame_size == XSK_UMEM__DEFAULT_FRAME_SIZE / 2) { - /* Crossing a 2K frame size boundary not allowed */ - pkts[8].valid = false; - } - - if (test->ifobj_tx->shared_umem) { - pkts[4].offset += umem_size; - pkts[5].offset += umem_size; - pkts[6].offset += umem_size; - } - - pkt_stream_generate_custom(test, pkts, ARRAY_SIZE(pkts)); - return testapp_validate_traffic(test); -} - -static int testapp_xdp_drop(struct test_spec *test) -{ - struct xsk_xdp_progs *skel_rx = test->ifobj_rx->xdp_progs; - struct xsk_xdp_progs *skel_tx = test->ifobj_tx->xdp_progs; - - test_spec_set_xdp_prog(test, skel_rx->progs.xsk_xdp_drop, skel_tx->progs.xsk_xdp_drop, - skel_rx->maps.xsk, skel_tx->maps.xsk); - - pkt_stream_receive_half(test); - return testapp_validate_traffic(test); -} - -static int testapp_xdp_metadata_copy(struct test_spec *test) -{ - struct xsk_xdp_progs *skel_rx = test->ifobj_rx->xdp_progs; - struct xsk_xdp_progs *skel_tx = test->ifobj_tx->xdp_progs; - - test_spec_set_xdp_prog(test, skel_rx->progs.xsk_xdp_populate_metadata, - skel_tx->progs.xsk_xdp_populate_metadata, - skel_rx->maps.xsk, skel_tx->maps.xsk); - test->ifobj_rx->use_metadata = true; - - skel_rx->bss->count = 0; - - return testapp_validate_traffic(test); -} - -static int testapp_xdp_shared_umem(struct 
test_spec *test) -{ - struct xsk_xdp_progs *skel_rx = test->ifobj_rx->xdp_progs; - struct xsk_xdp_progs *skel_tx = test->ifobj_tx->xdp_progs; - - test->total_steps = 1; - test->nb_sockets = 2; - - test_spec_set_xdp_prog(test, skel_rx->progs.xsk_xdp_shared_umem, - skel_tx->progs.xsk_xdp_shared_umem, - skel_rx->maps.xsk, skel_tx->maps.xsk); - - pkt_stream_even_odd_sequence(test); - - return testapp_validate_traffic(test); -} - -static int testapp_poll_txq_tmout(struct test_spec *test) -{ - test->ifobj_tx->use_poll = true; - /* create invalid frame by set umem frame_size and pkt length equal to 2048 */ - test->ifobj_tx->umem->frame_size = 2048; - pkt_stream_replace(test, 2 * DEFAULT_PKT_CNT, 2048); - return testapp_validate_traffic_single_thread(test, test->ifobj_tx); -} - -static int testapp_poll_rxq_tmout(struct test_spec *test) -{ - test->ifobj_rx->use_poll = true; - return testapp_validate_traffic_single_thread(test, test->ifobj_rx); -} - -static int testapp_too_many_frags(struct test_spec *test) -{ - struct pkt *pkts; - u32 max_frags, i; - int ret; - - if (test->mode == TEST_MODE_ZC) { - max_frags = test->ifobj_tx->xdp_zc_max_segs; - } else { - max_frags = get_max_skb_frags(); - if (!max_frags) { - ksft_print_msg("Couldn't retrieve MAX_SKB_FRAGS from system, using default (17) value\n"); - max_frags = 17; - } - max_frags += 1; - } - - pkts = calloc(2 * max_frags + 2, sizeof(struct pkt)); - if (!pkts) - return TEST_FAILURE; - - test->mtu = MAX_ETH_JUMBO_SIZE; - - /* Valid packet for synch */ - pkts[0].len = MIN_PKT_SIZE; - pkts[0].valid = true; - - /* One valid packet with the max amount of frags */ - for (i = 1; i < max_frags + 1; i++) { - pkts[i].len = MIN_PKT_SIZE; - pkts[i].options = XDP_PKT_CONTD; - pkts[i].valid = true; - } - pkts[max_frags].options = 0; - - /* An invalid packet with the max amount of frags but signals packet - * continues on the last frag - */ - for (i = max_frags + 1; i < 2 * max_frags + 1; i++) { - pkts[i].len = MIN_PKT_SIZE; - pkts[i].options = XDP_PKT_CONTD; - pkts[i].valid = false; - } - - /* Valid packet for synch */ - pkts[2 * max_frags + 1].len = MIN_PKT_SIZE; - pkts[2 * max_frags + 1].valid = true; - - pkt_stream_generate_custom(test, pkts, 2 * max_frags + 2); - ret = testapp_validate_traffic(test); - - free(pkts); - return ret; -} - -static int xsk_load_xdp_programs(struct ifobject *ifobj) -{ - ifobj->xdp_progs = xsk_xdp_progs__open_and_load(); - if (libbpf_get_error(ifobj->xdp_progs)) - return libbpf_get_error(ifobj->xdp_progs); - - return 0; -} - static void xsk_unload_xdp_programs(struct ifobject *ifobj) { xsk_xdp_progs__destroy(ifobj->xdp_progs); } -/* Simple test */ -static bool hugepages_present(void) -{ - size_t mmap_sz = 2 * DEFAULT_UMEM_BUFFERS * XSK_UMEM__DEFAULT_FRAME_SIZE; - void *bufs; - - bufs = mmap(NULL, mmap_sz, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, MAP_HUGE_2MB); - if (bufs == MAP_FAILED) - return false; - - mmap_sz = ceil_u64(mmap_sz, HUGEPAGE_SIZE) * HUGEPAGE_SIZE; - munmap(bufs, mmap_sz); - return true; -} - -static void init_iface(struct ifobject *ifobj, thread_func_t func_ptr) -{ - LIBBPF_OPTS(bpf_xdp_query_opts, query_opts); - int err; - - ifobj->func_ptr = func_ptr; - - err = xsk_load_xdp_programs(ifobj); - if (err) { - ksft_print_msg("Error loading XDP program\n"); - exit_with_error(err); - } - - if (hugepages_present()) - ifobj->unaligned_supp = true; - - err = bpf_xdp_query(ifobj->ifindex, XDP_FLAGS_DRV_MODE, &query_opts); - if (err) { - ksft_print_msg("Error querying XDP capabilities\n"); - 
exit_with_error(-err); - } - if (query_opts.feature_flags & NETDEV_XDP_ACT_RX_SG) - ifobj->multi_buff_supp = true; - if (query_opts.feature_flags & NETDEV_XDP_ACT_XSK_ZEROCOPY) { - if (query_opts.xdp_zc_max_segs > 1) { - ifobj->multi_buff_zc_supp = true; - ifobj->xdp_zc_max_segs = query_opts.xdp_zc_max_segs; - } else { - ifobj->xdp_zc_max_segs = 0; - } - } -} - -static int testapp_send_receive(struct test_spec *test) -{ - return testapp_validate_traffic(test); -} - -static int testapp_send_receive_2k_frame(struct test_spec *test) -{ - test->ifobj_tx->umem->frame_size = 2048; - test->ifobj_rx->umem->frame_size = 2048; - pkt_stream_replace(test, DEFAULT_PKT_CNT, MIN_PKT_SIZE); - return testapp_validate_traffic(test); -} - -static int testapp_poll_rx(struct test_spec *test) -{ - test->ifobj_rx->use_poll = true; - return testapp_validate_traffic(test); -} - -static int testapp_poll_tx(struct test_spec *test) -{ - test->ifobj_tx->use_poll = true; - return testapp_validate_traffic(test); -} - -static int testapp_aligned_inv_desc(struct test_spec *test) -{ - return testapp_invalid_desc(test); -} - -static int testapp_aligned_inv_desc_2k_frame(struct test_spec *test) -{ - test->ifobj_tx->umem->frame_size = 2048; - test->ifobj_rx->umem->frame_size = 2048; - return testapp_invalid_desc(test); -} - -static int testapp_unaligned_inv_desc(struct test_spec *test) -{ - test->ifobj_tx->umem->unaligned_mode = true; - test->ifobj_rx->umem->unaligned_mode = true; - return testapp_invalid_desc(test); -} - -static int testapp_unaligned_inv_desc_4001_frame(struct test_spec *test) -{ - u64 page_size, umem_size; - - /* Odd frame size so the UMEM doesn't end near a page boundary. */ - test->ifobj_tx->umem->frame_size = 4001; - test->ifobj_rx->umem->frame_size = 4001; - test->ifobj_tx->umem->unaligned_mode = true; - test->ifobj_rx->umem->unaligned_mode = true; - /* This test exists to test descriptors that staddle the end of - * the UMEM but not a page. 
- */ - page_size = sysconf(_SC_PAGESIZE); - umem_size = test->ifobj_tx->umem->num_frames * test->ifobj_tx->umem->frame_size; - assert(umem_size % page_size > MIN_PKT_SIZE); - assert(umem_size % page_size < page_size - MIN_PKT_SIZE); - - return testapp_invalid_desc(test); -} - -static int testapp_aligned_inv_desc_mb(struct test_spec *test) -{ - return testapp_invalid_desc_mb(test); -} - -static int testapp_unaligned_inv_desc_mb(struct test_spec *test) -{ - test->ifobj_tx->umem->unaligned_mode = true; - test->ifobj_rx->umem->unaligned_mode = true; - return testapp_invalid_desc_mb(test); -} - -static int testapp_xdp_metadata(struct test_spec *test) -{ - return testapp_xdp_metadata_copy(test); -} - -static int testapp_xdp_metadata_mb(struct test_spec *test) -{ - test->mtu = MAX_ETH_JUMBO_SIZE; - return testapp_xdp_metadata_copy(test); -} - -static int testapp_hw_sw_min_ring_size(struct test_spec *test) -{ - int ret; - - test->set_ring = true; - test->total_steps = 2; - test->ifobj_tx->ring.tx_pending = DEFAULT_BATCH_SIZE; - test->ifobj_tx->ring.rx_pending = DEFAULT_BATCH_SIZE * 2; - test->ifobj_tx->xsk->batch_size = 1; - test->ifobj_rx->xsk->batch_size = 1; - ret = testapp_validate_traffic(test); - if (ret) - return ret; - - /* Set batch size to hw_ring_size - 1 */ - test->ifobj_tx->xsk->batch_size = DEFAULT_BATCH_SIZE - 1; - test->ifobj_rx->xsk->batch_size = DEFAULT_BATCH_SIZE - 1; - return testapp_validate_traffic(test); -} - -static int testapp_hw_sw_max_ring_size(struct test_spec *test) -{ - u32 max_descs = XSK_RING_PROD__DEFAULT_NUM_DESCS * 4; - int ret; - - test->set_ring = true; - test->total_steps = 2; - test->ifobj_tx->ring.tx_pending = test->ifobj_tx->ring.tx_max_pending; - test->ifobj_tx->ring.rx_pending = test->ifobj_tx->ring.rx_max_pending; - test->ifobj_rx->umem->num_frames = max_descs; - test->ifobj_rx->umem->fill_size = max_descs; - test->ifobj_rx->umem->comp_size = max_descs; - test->ifobj_tx->xsk->batch_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; - test->ifobj_rx->xsk->batch_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; - - ret = testapp_validate_traffic(test); - if (ret) - return ret; - - /* Set batch_size to 8152 for testing, as the ice HW ignores the 3 lowest bits when - * updating the Rx HW tail register. 
- */ - test->ifobj_tx->xsk->batch_size = test->ifobj_tx->ring.tx_max_pending - 8; - test->ifobj_rx->xsk->batch_size = test->ifobj_tx->ring.tx_max_pending - 8; - pkt_stream_replace(test, max_descs, MIN_PKT_SIZE); - return testapp_validate_traffic(test); -} - -static int testapp_xdp_adjust_tail(struct test_spec *test, int adjust_value) -{ - struct xsk_xdp_progs *skel_rx = test->ifobj_rx->xdp_progs; - struct xsk_xdp_progs *skel_tx = test->ifobj_tx->xdp_progs; - - test_spec_set_xdp_prog(test, skel_rx->progs.xsk_xdp_adjust_tail, - skel_tx->progs.xsk_xdp_adjust_tail, - skel_rx->maps.xsk, skel_tx->maps.xsk); - - skel_rx->bss->adjust_value = adjust_value; - - return testapp_validate_traffic(test); -} - -static int testapp_adjust_tail(struct test_spec *test, u32 value, u32 pkt_len) -{ - int ret; - - test->adjust_tail_support = true; - test->adjust_tail = true; - test->total_steps = 1; - - pkt_stream_replace_ifobject(test->ifobj_tx, DEFAULT_BATCH_SIZE, pkt_len); - pkt_stream_replace_ifobject(test->ifobj_rx, DEFAULT_BATCH_SIZE, pkt_len + value); - - ret = testapp_xdp_adjust_tail(test, value); - if (ret) - return ret; - - if (!test->adjust_tail_support) { - ksft_test_result_skip("%s %sResize pkt with bpf_xdp_adjust_tail() not supported\n", - mode_string(test), busy_poll_string(test)); - return TEST_SKIP; - } - - return 0; -} - -static int testapp_adjust_tail_shrink(struct test_spec *test) -{ - /* Shrink by 4 bytes for testing purpose */ - return testapp_adjust_tail(test, -4, MIN_PKT_SIZE * 2); -} - -static int testapp_adjust_tail_shrink_mb(struct test_spec *test) -{ - test->mtu = MAX_ETH_JUMBO_SIZE; - /* Shrink by the frag size */ - return testapp_adjust_tail(test, -XSK_UMEM__MAX_FRAME_SIZE, XSK_UMEM__LARGE_FRAME_SIZE * 2); -} - -static int testapp_adjust_tail_grow(struct test_spec *test) -{ - /* Grow by 4 bytes for testing purpose */ - return testapp_adjust_tail(test, 4, MIN_PKT_SIZE * 2); -} - -static int testapp_adjust_tail_grow_mb(struct test_spec *test) -{ - test->mtu = MAX_ETH_JUMBO_SIZE; - /* Grow by (frag_size - last_frag_Size) - 1 to stay inside the last fragment */ - return testapp_adjust_tail(test, (XSK_UMEM__MAX_FRAME_SIZE / 2) - 1, - XSK_UMEM__LARGE_FRAME_SIZE * 2); -} - -static int testapp_tx_queue_consumer(struct test_spec *test) -{ - int nr_packets; - - if (test->mode == TEST_MODE_ZC) { - ksft_test_result_skip("Can not run TX_QUEUE_CONSUMER test for ZC mode\n"); - return TEST_SKIP; - } - - nr_packets = MAX_TX_BUDGET_DEFAULT + 1; - pkt_stream_replace(test, nr_packets, MIN_PKT_SIZE); - test->ifobj_tx->xsk->batch_size = nr_packets; - test->ifobj_tx->xsk->check_consumer = true; - - return testapp_validate_traffic(test); -} - static void run_pkt_test(struct test_spec *test) { int ret; ret = test->test_func(test); - if (ret == TEST_PASS) + switch (ret) { + case TEST_PASS: ksft_test_result_pass("PASS: %s %s%s\n", mode_string(test), busy_poll_string(test), test->name); - pkt_stream_restore_default(test); -} - -static struct ifobject *ifobject_create(void) -{ - struct ifobject *ifobj; - - ifobj = calloc(1, sizeof(struct ifobject)); - if (!ifobj) - return NULL; - - ifobj->xsk_arr = calloc(MAX_SOCKETS, sizeof(*ifobj->xsk_arr)); - if (!ifobj->xsk_arr) - goto out_xsk_arr; - - ifobj->umem = calloc(1, sizeof(*ifobj->umem)); - if (!ifobj->umem) - goto out_umem; - - return ifobj; - -out_umem: - free(ifobj->xsk_arr); -out_xsk_arr: - free(ifobj); - return NULL; -} + break; + case TEST_SKIP: + ksft_test_result_skip("SKIP: %s %s%s\n", mode_string(test), busy_poll_string(test), + test->name); + break; + 
case TEST_FAILURE: + ksft_test_result_fail("FAIL: %s %s%s\n", mode_string(test), busy_poll_string(test), + test->name); + break; + default: + ksft_test_result_fail("FAIL: %s %s%s -- Unexpected returned value (%d)\n", + mode_string(test), busy_poll_string(test), test->name, ret); + } -static void ifobject_delete(struct ifobject *ifobj) -{ - free(ifobj->umem); - free(ifobj->xsk_arr); - free(ifobj); + pkt_stream_restore_default(test); } static bool is_xdp_supported(int ifindex) @@ -2726,47 +319,6 @@ static bool is_xdp_supported(int ifindex) return true; } -static const struct test_spec tests[] = { - {.name = "SEND_RECEIVE", .test_func = testapp_send_receive}, - {.name = "SEND_RECEIVE_2K_FRAME", .test_func = testapp_send_receive_2k_frame}, - {.name = "SEND_RECEIVE_SINGLE_PKT", .test_func = testapp_single_pkt}, - {.name = "POLL_RX", .test_func = testapp_poll_rx}, - {.name = "POLL_TX", .test_func = testapp_poll_tx}, - {.name = "POLL_RXQ_FULL", .test_func = testapp_poll_rxq_tmout}, - {.name = "POLL_TXQ_FULL", .test_func = testapp_poll_txq_tmout}, - {.name = "SEND_RECEIVE_UNALIGNED", .test_func = testapp_send_receive_unaligned}, - {.name = "ALIGNED_INV_DESC", .test_func = testapp_aligned_inv_desc}, - {.name = "ALIGNED_INV_DESC_2K_FRAME_SIZE", .test_func = testapp_aligned_inv_desc_2k_frame}, - {.name = "UNALIGNED_INV_DESC", .test_func = testapp_unaligned_inv_desc}, - {.name = "UNALIGNED_INV_DESC_4001_FRAME_SIZE", - .test_func = testapp_unaligned_inv_desc_4001_frame}, - {.name = "UMEM_HEADROOM", .test_func = testapp_headroom}, - {.name = "TEARDOWN", .test_func = testapp_teardown}, - {.name = "BIDIRECTIONAL", .test_func = testapp_bidirectional}, - {.name = "STAT_RX_DROPPED", .test_func = testapp_stats_rx_dropped}, - {.name = "STAT_TX_INVALID", .test_func = testapp_stats_tx_invalid_descs}, - {.name = "STAT_RX_FULL", .test_func = testapp_stats_rx_full}, - {.name = "STAT_FILL_EMPTY", .test_func = testapp_stats_fill_empty}, - {.name = "XDP_PROG_CLEANUP", .test_func = testapp_xdp_prog_cleanup}, - {.name = "XDP_DROP_HALF", .test_func = testapp_xdp_drop}, - {.name = "XDP_SHARED_UMEM", .test_func = testapp_xdp_shared_umem}, - {.name = "XDP_METADATA_COPY", .test_func = testapp_xdp_metadata}, - {.name = "XDP_METADATA_COPY_MULTI_BUFF", .test_func = testapp_xdp_metadata_mb}, - {.name = "SEND_RECEIVE_9K_PACKETS", .test_func = testapp_send_receive_mb}, - {.name = "SEND_RECEIVE_UNALIGNED_9K_PACKETS", - .test_func = testapp_send_receive_unaligned_mb}, - {.name = "ALIGNED_INV_DESC_MULTI_BUFF", .test_func = testapp_aligned_inv_desc_mb}, - {.name = "UNALIGNED_INV_DESC_MULTI_BUFF", .test_func = testapp_unaligned_inv_desc_mb}, - {.name = "TOO_MANY_FRAGS", .test_func = testapp_too_many_frags}, - {.name = "HW_SW_MIN_RING_SIZE", .test_func = testapp_hw_sw_min_ring_size}, - {.name = "HW_SW_MAX_RING_SIZE", .test_func = testapp_hw_sw_max_ring_size}, - {.name = "XDP_ADJUST_TAIL_SHRINK", .test_func = testapp_adjust_tail_shrink}, - {.name = "XDP_ADJUST_TAIL_SHRINK_MULTI_BUFF", .test_func = testapp_adjust_tail_shrink_mb}, - {.name = "XDP_ADJUST_TAIL_GROW", .test_func = testapp_adjust_tail_grow}, - {.name = "XDP_ADJUST_TAIL_GROW_MULTI_BUFF", .test_func = testapp_adjust_tail_grow_mb}, - {.name = "TX_QUEUE_CONSUMER", .test_func = testapp_tx_queue_consumer}, - }; - static void print_tests(void) { u32 i; @@ -2774,10 +326,13 @@ static void print_tests(void) printf("Tests:\n"); for (i = 0; i < ARRAY_SIZE(tests); i++) printf("%u: %s\n", i, tests[i].name); + for (i = ARRAY_SIZE(tests); i < ARRAY_SIZE(tests) + ARRAY_SIZE(ci_skip_tests); 
i++) + printf("%u: %s\n", i, ci_skip_tests[i - ARRAY_SIZE(tests)].name); } int main(int argc, char **argv) { + const size_t total_tests = ARRAY_SIZE(tests) + ARRAY_SIZE(ci_skip_tests); struct pkt_stream *rx_pkt_stream_default; struct pkt_stream *tx_pkt_stream_default; struct ifobject *ifobj_tx, *ifobj_rx; @@ -2805,7 +360,7 @@ int main(int argc, char **argv) print_tests(); ksft_exit_xpass(); } - if (opt_run_test != RUN_ALL_TESTS && opt_run_test >= ARRAY_SIZE(tests)) { + if (opt_run_test != RUN_ALL_TESTS && opt_run_test >= total_tests) { ksft_print_msg("Error: test %u does not exist.\n", opt_run_test); ksft_exit_xfail(); } @@ -2830,10 +385,13 @@ int main(int argc, char **argv) ifobj_tx->set_ring.default_rx = ifobj_tx->ring.rx_pending; } - init_iface(ifobj_rx, worker_testapp_validate_rx); - init_iface(ifobj_tx, worker_testapp_validate_tx); + if (init_iface(ifobj_rx, worker_testapp_validate_rx) || + init_iface(ifobj_tx, worker_testapp_validate_tx)) { + ksft_print_msg("Error : can't initialize interfaces\n"); + ksft_exit_xfail(); + } - test_spec_init(&test, ifobj_tx, ifobj_rx, 0, &tests[0]); + test_init(&test, ifobj_tx, ifobj_rx, 0, &tests[0]); tx_pkt_stream_default = pkt_stream_generate(DEFAULT_PKT_CNT, MIN_PKT_SIZE); rx_pkt_stream_default = pkt_stream_generate(DEFAULT_PKT_CNT, MIN_PKT_SIZE); if (!tx_pkt_stream_default || !rx_pkt_stream_default) @@ -2842,7 +400,7 @@ int main(int argc, char **argv) test.rx_pkt_stream_default = rx_pkt_stream_default; if (opt_run_test == RUN_ALL_TESTS) - nb_tests = ARRAY_SIZE(tests); + nb_tests = total_tests; else nb_tests = 1; if (opt_mode == TEST_MODE_ALL) { @@ -2864,11 +422,15 @@ int main(int argc, char **argv) if (opt_mode != TEST_MODE_ALL && i != opt_mode) continue; - for (j = 0; j < ARRAY_SIZE(tests); j++) { + for (j = 0; j < total_tests; j++) { if (opt_run_test != RUN_ALL_TESTS && j != opt_run_test) continue; - test_spec_init(&test, ifobj_tx, ifobj_rx, i, &tests[j]); + if (j < ARRAY_SIZE(tests)) + test_init(&test, ifobj_tx, ifobj_rx, i, &tests[j]); + else + test_init(&test, ifobj_tx, ifobj_rx, i, + &ci_skip_tests[j - ARRAY_SIZE(tests)]); run_pkt_test(&test); usleep(USLEEP_MAX); diff --git a/tools/testing/selftests/bpf/xskxceiver.h b/tools/testing/selftests/bpf/xskxceiver.h index 4df3a5d329acf8..3ca518df23adaf 100644 --- a/tools/testing/selftests/bpf/xskxceiver.h +++ b/tools/testing/selftests/bpf/xskxceiver.h @@ -22,169 +22,13 @@ #define PF_XDP AF_XDP #endif -#ifndef SO_BUSY_POLL_BUDGET -#define SO_BUSY_POLL_BUDGET 70 -#endif - -#ifndef SO_PREFER_BUSY_POLL -#define SO_PREFER_BUSY_POLL 69 -#endif - -#define TEST_PASS 0 -#define TEST_FAILURE -1 -#define TEST_CONTINUE 1 -#define TEST_SKIP 2 -#define MAX_INTERFACES 2 -#define MAX_INTERFACE_NAME_CHARS 16 -#define MAX_TEST_NAME_SIZE 48 #define MAX_TEARDOWN_ITER 10 -#define PKT_HDR_SIZE (sizeof(struct ethhdr) + 2) /* Just to align the data in the packet */ -#define MIN_PKT_SIZE 64 -#define MAX_ETH_PKT_SIZE 1518 #define MAX_ETH_JUMBO_SIZE 9000 -#define USLEEP_MAX 10000 #define SOCK_RECONF_CTR 10 -#define DEFAULT_BATCH_SIZE 64 -#define POLL_TMOUT 1000 -#define THREAD_TMOUT 3 -#define DEFAULT_PKT_CNT (4 * 1024) -#define DEFAULT_UMEM_BUFFERS (DEFAULT_PKT_CNT / 4) #define RX_FULL_RXQSIZE 32 #define UMEM_HEADROOM_TEST_SIZE 128 #define XSK_UMEM__INVALID_FRAME_SIZE (MAX_ETH_JUMBO_SIZE + 1) -#define XSK_UMEM__LARGE_FRAME_SIZE (3 * 1024) -#define XSK_UMEM__MAX_FRAME_SIZE (4 * 1024) -#define XSK_DESC__INVALID_OPTION (0xffff) -#define HUGEPAGE_SIZE (2 * 1024 * 1024) -#define PKT_DUMP_NB_TO_PRINT 16 #define RUN_ALL_TESTS 
UINT_MAX #define NUM_MAC_ADDRESSES 4 -#define print_verbose(x...) do { if (opt_verbose) ksft_print_msg(x); } while (0) - -enum test_mode { - TEST_MODE_SKB, - TEST_MODE_DRV, - TEST_MODE_ZC, - TEST_MODE_ALL -}; - -struct xsk_umem_info { - struct xsk_ring_prod fq; - struct xsk_ring_cons cq; - struct xsk_umem *umem; - u64 next_buffer; - u32 num_frames; - u32 frame_headroom; - void *buffer; - u32 frame_size; - u32 base_addr; - u32 fill_size; - u32 comp_size; - bool unaligned_mode; -}; - -struct xsk_socket_info { - struct xsk_ring_cons rx; - struct xsk_ring_prod tx; - struct xsk_umem_info *umem; - struct xsk_socket *xsk; - struct pkt_stream *pkt_stream; - u32 outstanding_tx; - u32 rxqsize; - u32 batch_size; - u8 dst_mac[ETH_ALEN]; - u8 src_mac[ETH_ALEN]; - bool check_consumer; -}; - -struct pkt { - int offset; - u32 len; - u32 pkt_nb; - bool valid; - u16 options; -}; - -struct pkt_stream { - u32 nb_pkts; - u32 current_pkt_nb; - struct pkt *pkts; - u32 max_pkt_len; - u32 nb_rx_pkts; - u32 nb_valid_entries; - bool verbatim; -}; - -struct set_hw_ring { - u32 default_tx; - u32 default_rx; -}; - -struct ifobject; -struct test_spec; -typedef int (*validation_func_t)(struct ifobject *ifobj); -typedef void *(*thread_func_t)(void *arg); -typedef int (*test_func_t)(struct test_spec *test); - -struct ifobject { - char ifname[MAX_INTERFACE_NAME_CHARS]; - struct xsk_socket_info *xsk; - struct xsk_socket_info *xsk_arr; - struct xsk_umem_info *umem; - thread_func_t func_ptr; - validation_func_t validation_func; - struct xsk_xdp_progs *xdp_progs; - struct bpf_map *xskmap; - struct bpf_program *xdp_prog; - struct ethtool_ringparam ring; - struct set_hw_ring set_ring; - enum test_mode mode; - int ifindex; - int mtu; - u32 bind_flags; - u32 xdp_zc_max_segs; - bool tx_on; - bool rx_on; - bool use_poll; - bool busy_poll; - bool use_fill_ring; - bool release_rx; - bool shared_umem; - bool use_metadata; - bool unaligned_supp; - bool multi_buff_supp; - bool multi_buff_zc_supp; - bool hw_ring_size_supp; -}; - -struct test_spec { - struct ifobject *ifobj_tx; - struct ifobject *ifobj_rx; - struct pkt_stream *tx_pkt_stream_default; - struct pkt_stream *rx_pkt_stream_default; - struct bpf_program *xdp_prog_rx; - struct bpf_program *xdp_prog_tx; - struct bpf_map *xskmap_rx; - struct bpf_map *xskmap_tx; - test_func_t test_func; - int mtu; - u16 total_steps; - u16 current_step; - u16 nb_sockets; - bool fail; - bool set_ring; - bool adjust_tail; - bool adjust_tail_support; - enum test_mode mode; - char name[MAX_TEST_NAME_SIZE]; -}; - -pthread_barrier_t barr; -pthread_mutex_t pacing_mutex = PTHREAD_MUTEX_INITIALIZER; - -int pkts_in_flight; - -static const u8 g_mac[ETH_ALEN] = {0x55, 0x44, 0x33, 0x22, 0x11, 0x00}; - #endif /* XSKXCEIVER_H_ */ |
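A few illustrative sketches of the mechanisms exercised by the diff above follow; none of them are part of the patch.

The removed feature-probing hunk at the top decides whether the multi-buffer and zero-copy multi-buffer tests may run by inspecting the driver's advertised XDP feature flags. A minimal stand-alone probe doing the same thing with libbpf's bpf_xdp_query() could look as follows (probe_xdp_multibuf is a hypothetical name; this assumes a libbpf recent enough for bpf_xdp_query_opts to expose xdp_zc_max_segs, and the <linux/netdev.h> UAPI header for the NETDEV_XDP_ACT_* flags):

#include <stdio.h>
#include <bpf/libbpf.h>
#include <linux/netdev.h>

/* Hypothetical stand-alone probe mirroring the removed selftest helper:
 * report scatter-gather (multi-buffer) and zero-copy multi-buffer support
 * for a given interface index.
 */
int probe_xdp_multibuf(int ifindex)
{
	LIBBPF_OPTS(bpf_xdp_query_opts, opts);
	int err;

	err = bpf_xdp_query(ifindex, 0, &opts);
	if (err)
		return err;

	if (opts.feature_flags & NETDEV_XDP_ACT_RX_SG)
		printf("multi-buffer (RX_SG) supported\n");

	if ((opts.feature_flags & NETDEV_XDP_ACT_XSK_ZEROCOPY) &&
	    opts.xdp_zc_max_segs > 1)
		printf("zero-copy multi-buffer supported, up to %u segments\n",
		       opts.xdp_zc_max_segs);

	return 0;
}

As in the removed helper, zero-copy multi-buffer is only treated as usable when the driver reports more than one supported segment.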
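The UNALIGNED_INV_DESC_4001_FRAME_SIZE test carries its reasoning in the comment above: with an odd 4001-byte frame size the UMEM ends partway into a page, so an invalid descriptor can straddle the end of the UMEM without straddling a page boundary. A worked check of the two asserts, assuming 4 KiB pages and the default UMEM size of DEFAULT_UMEM_BUFFERS = DEFAULT_PKT_CNT / 4 = 1024 frames (the defaults removed from xskxceiver.h above):

#include <assert.h>
#include <unistd.h>

int main(void)
{
	const unsigned long min_pkt_size = 64;   /* MIN_PKT_SIZE */
	const unsigned long num_frames = 1024;   /* DEFAULT_UMEM_BUFFERS, assumed default */
	const unsigned long frame_size = 4001;   /* odd on purpose */
	unsigned long page_size = sysconf(_SC_PAGESIZE);   /* typically 4096 */
	unsigned long umem_size = num_frames * frame_size; /* 4,097,024 */
	unsigned long tail = umem_size % page_size;        /* 1,024 with 4 KiB pages */

	/* The UMEM must end more than one minimal packet after the last page
	 * boundary and more than one minimal packet before the next one, so a
	 * descriptor can straddle the end of the UMEM without straddling a page.
	 */
	assert(tail > min_pkt_size);
	assert(tail < page_size - min_pkt_size);
	return 0;
}

With these defaults the UMEM ends 1,024 bytes into its last page, comfortably more than MIN_PKT_SIZE (64) away from both surrounding page boundaries, so both asserts hold.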
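testapp_xdp_adjust_tail() drives the XDP_ADJUST_TAIL_* tests by loading the selftest's xsk_xdp_adjust_tail program and writing the resize delta into its .bss. The sketch below is not that program's source, only a minimal illustration of the mechanism: a global adjust_value set from user space, bpf_xdp_adjust_tail() to resize the frame, and a redirect into an XSKMAP.

// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_XSKMAP);
	__uint(max_entries, 2);
	__uint(key_size, sizeof(int));
	__uint(value_size, sizeof(int));
} xsk SEC(".maps");

int adjust_value;	/* written from user space through the skeleton's .bss */

SEC("xdp")
int xsk_adjust_tail_sketch(struct xdp_md *xdp)
{
	/* Negative delta shrinks the frame, positive delta grows it. */
	if (bpf_xdp_adjust_tail(xdp, adjust_value))
		return XDP_DROP;	/* resize rejected by the driver */

	return bpf_redirect_map(&xsk, xdp->rx_queue_index, XDP_DROP);
}

char _license[] SEC("license") = "GPL";

A negative delta shrinks the frame and a positive one grows it within the tailroom left in the last fragment, which is what the SHRINK and GROW variants above exercise in both single-buffer and multi-buffer mode.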
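Finally, main() now sizes its loop with total_tests = ARRAY_SIZE(tests) + ARRAY_SIZE(ci_skip_tests) and selects the entry from one of the two tables based on the index. The same branch could be expressed as a small lookup helper (hypothetical, not in the patch), which also documents that the numbering matches what print_tests() displays:

/* Hypothetical helper equivalent to the branch in main(): indices below
 * ARRAY_SIZE(tests) select the regular table, the rest select ci_skip_tests,
 * matching the numbering printed by print_tests().
 */
static const struct test_spec *lookup_test(u32 j)
{
	if (j < ARRAY_SIZE(tests))
		return &tests[j];
	return &ci_skip_tests[j - ARRAY_SIZE(tests)];
}

With such a helper the loop body would reduce to test_init(&test, ifobj_tx, ifobj_rx, i, lookup_test(j)).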
