Skip to content

Latest commit

 

History

History
1385 lines (1257 loc) · 59.5 KB

bpf-prog-type.md

File metadata and controls

1385 lines (1257 loc) · 59.5 KB

BPF 程序类型

[TOC]

BPF 程序类型定义

BPF 相关的程序,首先需要设置为相对应的程序类型。截至 Linux 内核 5.8,程序类型定义有 29 个(不含 BPF_PROG_TYPE_UNSPEC),而且还在持续增加中,完整的列表可参见 bpf.h

Linux 5.9 版本 30 个, 分析代码基于linux v5.8 tag

BPF 程序类型(prog_type)决定了程序可以调用的内核辅助函数的子集。BPF 程序类型也决定了程序输入上下文 -- bpf_context 结构的格式,其作为 BPF 程序中的第一个输入参数(数据 blob)。例如,跟踪程序与套接字过滤器程序的辅助函数子集并不完全相同(尽管它们可能具有一些共同的帮助函数)。同样,跟踪程序的输入上下文(context)是一组寄存器值,而套接字过滤器的输入是一个网络数据包。更加详细的描述参见 man 2 bpf。

特定类型的eBPF程序可用的功能集将来可能会增加。

include/uapi/linux/bpf.h

 161 enum bpf_prog_type {
 162         BPF_PROG_TYPE_UNSPEC,
 163         BPF_PROG_TYPE_SOCKET_FILTER,
 164         BPF_PROG_TYPE_KPROBE,
 165         BPF_PROG_TYPE_SCHED_CLS,
 166         BPF_PROG_TYPE_SCHED_ACT,
 167         BPF_PROG_TYPE_TRACEPOINT,
 168         BPF_PROG_TYPE_XDP,
 169         BPF_PROG_TYPE_PERF_EVENT,
 170         BPF_PROG_TYPE_CGROUP_SKB,
 171         BPF_PROG_TYPE_CGROUP_SOCK,
 172         BPF_PROG_TYPE_LWT_IN,
 173         BPF_PROG_TYPE_LWT_OUT,
 174         BPF_PROG_TYPE_LWT_XMIT,
 175         BPF_PROG_TYPE_SOCK_OPS,
 176         BPF_PROG_TYPE_SK_SKB,
 177         BPF_PROG_TYPE_CGROUP_DEVICE,
 178         BPF_PROG_TYPE_SK_MSG,
 179         BPF_PROG_TYPE_RAW_TRACEPOINT,
 180         BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
 181         BPF_PROG_TYPE_LWT_SEG6LOCAL,
 182         BPF_PROG_TYPE_LIRC_MODE2,
 183         BPF_PROG_TYPE_SK_REUSEPORT,
 184         BPF_PROG_TYPE_FLOW_DISSECTOR,
 185         BPF_PROG_TYPE_CGROUP_SYSCTL,
 186         BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
 187         BPF_PROG_TYPE_CGROUP_SOCKOPT,
 188         BPF_PROG_TYPE_TRACING,
 189         BPF_PROG_TYPE_STRUCT_OPS,
 190         BPF_PROG_TYPE_EXT,
 191         BPF_PROG_TYPE_LSM,
 192 };

BPF 程序类型确定

程序在载入的过程中会根据 BPF 程序所在 ELF section 的名称来判断程序类型,load_bpf_file 是用户态程序加载 BPF 程序的入口函数,loader.c 文件中加载 BPF 程序的使用样例如下:

以 “Hello World ” BPF 程序为例,完整代码参见 bpf_program.c

loader.c 程序

#include "bpf_load.h"
#include <stdio.h>

int main(int argc, char **argv) {
  if (load_bpf_file("bpf_program.o") != 0) { // 用于加载 ELF 格式的 BPF 程序
    printf("The kernel didn't load the BPF program\n");
    return -1;
  }

  read_trace_pipe();

  return 0;
}

bpf_program.c 程序源码

#include <linux/bpf.h>
/* Places the annotated symbol into the ELF section called NAME and marks it
 * `used` so it is kept even though nothing references it. The loader shown in
 * this document selects programs and metadata by section name. */
#define SEC(NAME) __attribute__((section(NAME), used))

/* Call stub for the trace_printk helper: a function pointer whose value is the
 * helper id BPF_FUNC_trace_printk rather than a real address. */
static int (*bpf_trace_printk)(const char *fmt, int fmt_size,
                               ...) = (void *)BPF_FUNC_trace_printk;

/* The "tracepoint/" section-name prefix is what the loader matches to pick
 * BPF_PROG_TYPE_TRACEPOINT; the remainder names the tracepoint to attach to. */
SEC("tracepoint/syscalls/sys_enter_execve")
int bpf_prog(void *ctx) {
  /* Format string lives in a local array and is passed with its full size
   * (including the terminating NUL). */
  char msg[] = "Hello, BPF World!";
  bpf_trace_printk(msg, sizeof(msg));
  return 0;
}

/* Contents of the "license" section are copied by the loader and passed along
 * when the program is loaded. */
char _license[] SEC("license") = "GPL";

load_bpf_file 函数

参见文件 samples/bpf/bpf_load.c 中的 load_bpf_file 函数:

此三个变量为全局

36 int map_fd[MAX_MAPS];
37 int prog_fd[MAX_PROGS];
38 int event_fd[MAX_PROGS];

30 #define DEBUGFS "/sys/kernel/debug/tracing/"  
31 // 加载过程中需用到的全局变量
32 static char license[128];
33 static int kern_version;
34 static bool processed_sec[128];
35 char bpf_log_buf[BPF_LOG_BUF_SIZE];
36 int map_fd[MAX_MAPS];
37 int prog_fd[MAX_PROGS];
38 int event_fd[MAX_PROGS];
39 int prog_cnt;
40 int prog_array_fd = -1;
41
42 struct bpf_map_data map_data[MAX_MAPS];
43 int map_data_count;

// load_bpf_file 为主程序调用的主入口
659 int load_bpf_file(char *path)
660 {
661         return do_load_bpf_file(path, NULL);
662 }

// main -> load_bpf_file -> do_load_bpf_file
508 static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
509 {
510         int fd, i, ret, maps_shndx = -1, strtabidx = -1;
511         Elf *elf;
512         GElf_Ehdr ehdr;
513         GElf_Shdr shdr, shdr_prog;
514         Elf_Data *data, *data_prog, *data_maps = NULL, *symbols = NULL;
515         char *shname, *shname_prog;
516         int nr_maps = 0;

526         fd = open(path, O_RDONLY, 0);
527         if (fd < 0)
528                 return 1;
529
530         elf = elf_begin(fd, ELF_C_READ, NULL);
534
535         if (gelf_getehdr(elf, &ehdr) != &ehdr)
536                 return 1;

            // 循环读取 elf 中的所有 sections,获取 license 和 map 相关信息
541         /* scan over all elf sections to get license and map info */
542         for (i = 1; i < ehdr.e_shnum; i++) {
543
544                 if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
545                         continue;
                    
  									// 读取 license 信息
552                 if (strcmp(shname, "license") == 0) {
553                         processed_sec[i] = true;
554                         memcpy(license, data->d_buf, data->d_size);
555                 } else if (strcmp(shname, "version") == 0) { //  读取 version 信息
556                         processed_sec[i] = true;
557                         if (data->d_size != sizeof(int)) {
558                                 printf("invalid size of version section %zd\n",
559                                        data->d_size);
560                                 return 1;
561                         }
562                         memcpy(&kern_version, data->d_buf, sizeof(int));
563                 } else if (strcmp(shname, "maps") == 0) {  //  读取 maps 信息
564                         int j;
565
566                         maps_shndx = i;
567                         data_maps = data;
568                         for (j = 0; j < MAX_MAPS; j++)
569                                 map_data[j].fd = -1;
570                 } else if (shdr.sh_type == SHT_SYMTAB) {
571                         strtabidx = shdr.sh_link;
572                         symbols = data;
573                 }
574         }
            // 如果存在 maps 数据,则初始化 map 
583         if (data_maps) {
584                 nr_maps = load_elf_maps_section(map_data, maps_shndx,
585                                                 elf, symbols, strtabidx);

591                 if (load_maps(map_data, nr_maps, fixup_map))
592                         goto done;
593                 map_data_count = nr_maps;
594
595                 processed_sec[maps_shndx] = true;
596         }
597
            // 处理所有重定位(relocation)section,并针对 maps 重写 BPF 指令(insns)
598         /* process all relo sections, and rewrite bpf insns for maps */
599         for (i = 1; i < ehdr.e_shnum; i++) {
600                 if (processed_sec[i])
601                         continue;
602
603                 if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
604                         continue;
605
606                 if (shdr.sh_type == SHT_REL) {
607                         struct bpf_insn *insns;
608
609                         /* locate prog sec that need map fixup (relocations) */
610                         if (get_sec(elf, shdr.sh_info, &ehdr, &shname_prog,
611                                     &shdr_prog, &data_prog))
612                                 continue;
613
614                         if (shdr_prog.sh_type != SHT_PROGBITS ||
615                             !(shdr_prog.sh_flags & SHF_EXECINSTR))
616                                 continue;
617
618                         insns = (struct bpf_insn *) data_prog->d_buf;
619                         processed_sec[i] = true; /* relo section */
620
621                         if (parse_relo_and_apply(data, symbols, &shdr, insns,
622                                                  map_data, nr_maps))
623                                 continue;
624                 }
625         }
626
627         /* load programs */
628         for (i = 1; i < ehdr.e_shnum; i++) {
629
630                 if (processed_sec[i])
631                         continue;
632
633                 if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
634                         continue;
635									// 此处对 BPF 程序类型进行判断,并调用 load_and_attach 进行加载
636                 if (memcmp(shname, "kprobe/", 7) == 0 ||
637                     memcmp(shname, "kretprobe/", 10) == 0 ||
638                     memcmp(shname, "tracepoint/", 11) == 0 ||
639                     memcmp(shname, "raw_tracepoint/", 15) == 0 ||
640                     memcmp(shname, "xdp", 3) == 0 ||
641                     memcmp(shname, "perf_event", 10) == 0 ||
642                     memcmp(shname, "socket", 6) == 0 ||
643                     memcmp(shname, "cgroup/", 7) == 0 ||
644                     memcmp(shname, "sockops", 7) == 0 ||
645                     memcmp(shname, "sk_skb", 6) == 0 ||
646                     memcmp(shname, "sk_msg", 6) == 0) {
647                         ret = load_and_attach(shname, data->d_buf,
648                                               data->d_size);
649                         if (ret != 0)
650                                 goto done;
651                 }
652         }
653
654 done:
655         close(fd);
656         return ret;
657 }

其中 get_sec 函数完整定义如下:

// main -> load_bpf_file -> do_load_bpf_file -> get_sec
316 static int get_sec(Elf *elf, int i, GElf_Ehdr *ehdr, char **shname,
317                    GElf_Shdr *shdr, Elf_Data **data)
318 {
319         Elf_Scn *scn;
320
321         scn = elf_getscn(elf, i);
322         if (!scn)
323                 return 1;
324
325         if (gelf_getshdr(scn, shdr) != shdr)
326                 return 2;
327
328         *shname = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name);
329         if (!*shname || !shdr->sh_size)
330                 return 3;
331
332         *data = elf_getdata(scn, 0);
333         if (!*data || elf_getdata(scn, *data) != NULL)
334                 return 4;
335
336         return 0;
337 }

load_and_attach 函数

samples/bpf/bpf_load.c 中的 load_and_attach 函数负责确定程序类型:函数的 78 - 126 行通过 section 名称的前缀来确定 BPF 程序类型,例如先计算 bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0;,再通过 if (is_tracepoint) { prog_type = BPF_PROG_TYPE_TRACEPOINT; } 设置对应的类型。

在确定程序类型以后,调用 sys_perf_event_open 和 ioctl 两个函数完成 BPF 程序注册到 trace 数据结构中,255 行代码 ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd) 完成了 BPF 程序与 TracePoint 的关联。

 76 static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 77 {
 78         bool is_socket = strncmp(event, "socket", 6) == 0;
 79         bool is_kprobe = strncmp(event, "kprobe/", 7) == 0;
 80         bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0;
 81         bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0;
 82         bool is_raw_tracepoint = strncmp(event, "raw_tracepoint/", 15) == 0;
 83         bool is_xdp = strncmp(event, "xdp", 3) == 0;
 84         bool is_perf_event = strncmp(event, "perf_event", 10) == 0;
 85         bool is_cgroup_skb = strncmp(event, "cgroup/skb", 10) == 0;
 86         bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0;
 87         bool is_sockops = strncmp(event, "sockops", 7) == 0;
 88         bool is_sk_skb = strncmp(event, "sk_skb", 6) == 0;
 89         bool is_sk_msg = strncmp(event, "sk_msg", 6) == 0;
 90         size_t insns_cnt = size / sizeof(struct bpf_insn);
 91         enum bpf_prog_type prog_type;
 92         char buf[256];
 93         int fd, efd, err, id;
 94         struct perf_event_attr attr = {};
 95
 96         attr.type = PERF_TYPE_TRACEPOINT;
 97         attr.sample_type = PERF_SAMPLE_RAW;
 98         attr.sample_period = 1;
 99         attr.wakeup_events = 1;
100
101         if (is_socket) {
102                 prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
103         } else if (is_kprobe || is_kretprobe) {
104                 prog_type = BPF_PROG_TYPE_KPROBE;
105         } else if (is_tracepoint) {
106                 prog_type = BPF_PROG_TYPE_TRACEPOINT;
107         } else if (is_raw_tracepoint) {
108                 prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT;
109         } else if (is_xdp) {
110                 prog_type = BPF_PROG_TYPE_XDP;
111         } else if (is_perf_event) {
112                 prog_type = BPF_PROG_TYPE_PERF_EVENT;
113         } else if (is_cgroup_skb) {
114                 prog_type = BPF_PROG_TYPE_CGROUP_SKB;
115         } else if (is_cgroup_sk) {
116                 prog_type = BPF_PROG_TYPE_CGROUP_SOCK;
117         } else if (is_sockops) {
118                 prog_type = BPF_PROG_TYPE_SOCK_OPS;
119         } else if (is_sk_skb) {
120                 prog_type = BPF_PROG_TYPE_SK_SKB;
121         } else if (is_sk_msg) {
122                 prog_type = BPF_PROG_TYPE_SK_MSG;
123         } else {
124                 printf("Unknown event '%s'\n", event);
125                 return -1;
126         }
127
128         if (prog_cnt == MAX_PROGS) /*#define MAX_PROGS 32 samples/bpf/bpf_load.h*/
129                 return -1;
130
131         fd = bpf_load_program(prog_type, prog, insns_cnt, license, kern_version,
132                               bpf_log_buf, BPF_LOG_BUF_SIZE);
				    //....
   
168         if (is_kprobe || is_kretprobe) {
                  // ...
212         } else if (is_tracepoint) {  // 如果为 tracepoint,获取到 tracepoint id 的完整目录
213                 event += 11;
214
215                 if (*event == 0) {
216                         printf("event name cannot be empty\n");
217                         return -1;
218                 }
219                 strcpy(buf, DEBUGFS); 
220                 strcat(buf, "events/"); 
221                 strcat(buf, event);
222                 strcat(buf, "/id"); // "/sys/kernel/debug/tracing/events/syscalls/sys_enter_execve/id"
223         }
224         // 1. 打开文件 "/sys/kernel/debug/tracing/events/syscalls/sys_enter_execve/id",文件内容 “685”
225         efd = open(buf, O_RDONLY, 0);
226         if (efd < 0) {
227                 printf("failed to open event %s\n", event);
228                 return -1;
229         }
230         // 2. 获取到 event id  “685”
231         err = read(efd, buf, sizeof(buf));
232         if (err < 0 || err >= sizeof(buf)) {
233                 printf("read from '%s' failed '%s'\n", event, strerror(errno));
234                 return -1;
235         }
236
237         close(efd);
238
239         buf[err] = 0;
240         id = atoi(buf);
241         attr.config = id;
242         // 3. 使用 sys_perf_event_open 函数开启  
243         efd = sys_perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0);
244         if (efd < 0) {
245                 printf("event %d fd %d err %s\n", id, efd, strerror(errno));
246                 return -1;
247         }
248         event_fd[prog_cnt - 1] = efd;

            // 4. 开启该 event 的追踪
249         err = ioctl(efd, PERF_EVENT_IOC_ENABLE, 0);
250         if (err < 0) {
251                 printf("ioctl PERF_EVENT_IOC_ENABLE failed err %s\n",
252                        strerror(errno));
253                 return -1;
254         }

            // 5. 设置当前追踪所对应的 BPF 程序,fd 即为我们本次打开的 BPF 程序 fd
            // 相关说明参见 https://lwn.net/Articles/683504/
255         err = ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd);
256         if (err < 0) {
257                 printf("ioctl PERF_EVENT_IOC_SET_BPF failed err %s\n",
258                        strerror(errno));
259                 return -1;
260         }
261
262         return 0;

在 perf 底层注册的结构为 perf_event(位于文件 include/linux/perf_event.h),其成员 tp_event 指向 trace_event_call(位于文件 include/linux/trace_events.h)

611 struct perf_event {
    // ...
749 #ifdef CONFIG_EVENT_TRACING
750         struct trace_event_call         *tp_event;   // -> trace_event_call
751         struct event_filter             *filter;
752 #ifdef CONFIG_FUNCTION_TRACER
    // ...
} 

278 struct trace_event_call {
		// ..
305         struct bpf_prog_array __rcu     *prog_array;  // 保存 bpf 程序

310 };

在 event 触发后的代码逻辑

 // perf event 触发的时候调用 bpf
 9240 void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
 9241                                struct trace_event_call *call, u64 count,
 9242                                struct pt_regs *regs, struct hlist_head *head,
 9243                                struct task_struct *task)
 9244 {
 9245         if (bpf_prog_array_valid(call)) {
 9246                 *(struct pt_regs **)raw_data = regs;
 9247                 if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) { // 调用 bpf 程序
 9248                         perf_swevent_put_recursion_context(rctx);
 9249                         return;
 9250                 }
 9251         }
 9252         perf_tp_event(call->event.type, count, raw_data, size, regs, head,
 9253                       rctx, task);
 9254 }
 9255 EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);

样例程序 ELF 和加载分析

使用工具 readelf 查看 bpf_program.o 获取相关信息

# readelf -h bpf_program.o
ELF Header:
  Magic:   7f 45 4c 46 02 01 01 00 00 00 00 00 00 00 00 00
  Class:                             ELF64
  Data:                              2's complement, little endian
  Version:                           1 (current)
  OS/ABI:                            UNIX - System V
  ABI Version:                       0
  Type:                              REL (Relocatable file)
  Machine:                           Linux BPF                  # Linux BPF 程序类型
  Version:                           0x1
  Entry point address:               0x0
  Start of program headers:          0 (bytes into file)
  Start of section headers:          424 (bytes into file)
  Flags:                             0x0
  Size of this header:               64 (bytes)
  Size of program headers:           0 (bytes)
  Number of program headers:         0
  Size of section headers:           64 (bytes)
  Number of section headers:         8
  Section header string table index: 1
  
# readelf -S bpf_program.o
There are 8 section headers, starting at offset 0x1a8:

Section Headers:
  [Nr] Name              Type             Address           Offset
       Size              EntSize          Flags  Link  Info  Align
  [ 0]                   NULL             0000000000000000  00000000
       0000000000000000  0000000000000000           0     0     0
  [ 1] .strtab           STRTAB           0000000000000000  0000012a
       0000000000000079  0000000000000000           0     0     1
  [ 2] .text             PROGBITS         0000000000000000  00000040
       0000000000000000  0000000000000000  AX       0     0     4
  [ 3] tracepoint/syscal PROGBITS         0000000000000000  00000040  # 此处决定了 BPF 的程序类型
       0000000000000070  0000000000000000  AX       0     0     8
  [ 4] .rodata.str1.1    PROGBITS         0000000000000000  000000b0
       0000000000000012  0000000000000001 AMS       0     0     1
  [ 5] license           PROGBITS         0000000000000000  000000c2
       0000000000000004  0000000000000000  WA       0     0     1
  [ 6] .llvm_addrsig     LOOS+0xfff4c03   0000000000000000  00000128
       0000000000000002  0000000000000000   E       7     0     1
  [ 7] .symtab           SYMTAB           0000000000000000  000000c8
       0000000000000060  0000000000000018           1     2     8

BPF 的程序类型由加载器根据 section 名称确定,并在程序加载时通过 bpf(BPF_PROG_LOAD) 系统调用的 prog_type 字段传递给内核;一旦程序类型确定,也就确定了 BPF 程序能够访问到的 BPF 内核帮助函数。

# strace -ebpf ./bpfload
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=37366, si_uid=0, si_status=0, si_utime=0, si_stime=0} ---
bpf(BPF_PROG_LOAD, {prog_type=BPF_PROG_TYPE_TRACEPOINT, insn_cnt=14, insns=0x6fecd0, license="GPL", log_level=0, log_size=0, log_buf=NULL, kern_version=KERNEL_VERSION(0, 0, 0), prog_flags=0, prog_name="", prog_ifindex=0, expected_attach_type=BPF_CGROUP_INET_INGRESS}, 72) = 4

一个程序中可以包含多个 section 区域和不同的 BPF 程序类型

/* First program of the object: the "tracepoint/" section prefix makes the
 * loader treat it as BPF_PROG_TYPE_TRACEPOINT. */
SEC("tracepoint/syscalls/sys_enter_execve")
int bpf_prog(void *ctx) {
  /* Format string is a local array, passed with its full size (NUL included). */
  char msg[] = "Hello, BPF World!";
  bpf_trace_printk(msg, sizeof(msg));
  return 0;
}

/* Second program of the same object: the "socket" section prefix makes the
 * loader treat it as BPF_PROG_TYPE_SOCKET_FILTER. It ignores skb and always
 * returns 0.
 * NOTE(review): for a socket filter a return value of 0 presumably means the
 * packet is dropped for that socket -- confirm before attaching it. */
SEC("socket")
int socket_prog(struct __sk_buff *skb) {
  return 0;
}

使用 strace 的结果如下

# strace -ebpf ./bpfload
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=37806, si_uid=0, si_status=0, si_utime=0, si_stime=0} ---

# BPF_PROG_TYPE_TRACEPOINT
bpf(BPF_PROG_LOAD, {prog_type=BPF_PROG_TYPE_TRACEPOINT, insn_cnt=14, insns=0x1527df0, license="GPL", log_level=0, log_size=0, log_buf=NULL, kern_version=KERNEL_VERSION(0, 0, 0), prog_flags=0, prog_name="", prog_ifindex=0, expected_attach_type=BPF_CGROUP_INET_INGRESS}, 72) = 4

# BPF_PROG_TYPE_SOCKET_FILTER
bpf(BPF_PROG_LOAD, {prog_type=BPF_PROG_TYPE_SOCKET_FILTER, insn_cnt=2, insns=0x1527e70, license="GPL", log_level=0, log_size=0, log_buf=NULL, kern_version=KERNEL_VERSION(0, 0, 0), prog_flags=0, prog_name="", prog_ifindex=0, expected_attach_type=BPF_CGROUP_INET_INGRESS}, 72) = 6
^Cstrace: Process 37805 detached

一个 ELF 文件中可以包含多个 BPF 程序,加载后的程序 fd 会保存在全局变量 prog_fd[] 数组中,按照顺序进行加载。

BPF Verifier 检测(TODO)

BPF Verifier 会根据底层的程序类型进行对应的检查:

/*
 * Example of a program-type dependent check.
 *
 * Returns true only for BPF_PROG_TYPE_SOCKET_FILTER, BPF_PROG_TYPE_SCHED_CLS
 * and BPF_PROG_TYPE_SCHED_ACT; every other program type yields false.
 * NOTE(review): judging by the name, this gates skb (packet buffer) access in
 * the verifier -- the call sites are not shown here.
 */
static bool may_access_skb(enum bpf_prog_type type)
{
        switch (type) {
        case BPF_PROG_TYPE_SOCKET_FILTER:
        case BPF_PROG_TYPE_SCHED_CLS:
        case BPF_PROG_TYPE_SCHED_ACT:
                return true;
        default:
                return false;
        }
}

BPF 程序类型支持 Helper 函数列表

每种 BPF 程序类型可以使用的函数列表参见:program-types。程序类型对应的函数关系可以通过以下命令来进行获取:

git grep -W 'func_proto(enum bpf_func_id func_id' kernel/ net/ drivers/

完整的程序类型对应的帮助函数表格如下:

Program Type Helper Functions
BPF_PROG_TYPE_SOCKET_FILTER BPF_FUNC_skb_load_bytes()
BPF_FUNC_skb_load_bytes_relative()
BPF_FUNC_get_socket_cookie()
BPF_FUNC_get_socket_uid()
BPF_FUNC_perf_event_output()
Base functions
BPF_PROG_TYPE_KPROBE BPF_FUNC_perf_event_output()
BPF_FUNC_get_stackid()
BPF_FUNC_get_stack()
BPF_FUNC_perf_event_read_value()
BPF_FUNC_override_return()
Tracing functions
BPF_PROG_TYPE_SCHED_CLS
BPF_PROG_TYPE_SCHED_ACT
BPF_FUNC_skb_store_bytes()
BPF_FUNC_skb_load_bytes()
BPF_FUNC_skb_load_bytes_relative()
BPF_FUNC_skb_pull_data()
BPF_FUNC_csum_diff()
BPF_FUNC_csum_update()
BPF_FUNC_l3_csum_replace()
BPF_FUNC_l4_csum_replace()
BPF_FUNC_clone_redirect()
BPF_FUNC_get_cgroup_classid()
BPF_FUNC_skb_vlan_push()
BPF_FUNC_skb_vlan_pop()
BPF_FUNC_skb_change_proto()
BPF_FUNC_skb_change_type()
BPF_FUNC_skb_adjust_room()
BPF_FUNC_skb_change_tail()
BPF_FUNC_skb_get_tunnel_key()
BPF_FUNC_skb_set_tunnel_key()
BPF_FUNC_skb_get_tunnel_opt()
BPF_FUNC_skb_set_tunnel_opt()
BPF_FUNC_redirect()
BPF_FUNC_get_route_realm()
BPF_FUNC_get_hash_recalc()
BPF_FUNC_set_hash_invalid()
BPF_FUNC_set_hash()
BPF_FUNC_perf_event_output()
BPF_FUNC_get_smp_processor_id()
BPF_FUNC_skb_under_cgroup()
BPF_FUNC_get_socket_cookie()
BPF_FUNC_get_socket_uid()
BPF_FUNC_fib_lookup()
BPF_FUNC_skb_get_xfrm_state()
BPF_FUNC_skb_cgroup_id()
Base functions
BPF_PROG_TYPE_TRACEPOINT BPF_FUNC_perf_event_output()
BPF_FUNC_get_stackid()
BPF_FUNC_get_stack()
Tracing functions
BPF_PROG_TYPE_XDP BPF_FUNC_perf_event_output()
BPF_FUNC_get_smp_processor_id()
BPF_FUNC_csum_diff()
BPF_FUNC_xdp_adjust_head()
BPF_FUNC_xdp_adjust_meta()
BPF_FUNC_redirect()
BPF_FUNC_redirect_map()
BPF_FUNC_xdp_adjust_tail()
BPF_FUNC_fib_lookup()
Base functions
BPF_PROG_TYPE_PERF_EVENT BPF_FUNC_perf_event_output()
BPF_FUNC_get_stackid()
BPF_FUNC_get_stack()
BPF_FUNC_perf_prog_read_value()
Tracing functions
BPF_PROG_TYPE_CGROUP_SKB BPF_FUNC_skb_load_bytes()
BPF_FUNC_skb_load_bytes_relative()
BPF_FUNC_get_socket_cookie()
BPF_FUNC_get_socket_uid()
Base functions
BPF_PROG_TYPE_CGROUP_SOCK BPF_FUNC_get_current_uid_gid()
Base functions
BPF_PROG_TYPE_LWT_IN BPF_FUNC_lwt_push_encap()
LWT functions
Base functions
BPF_PROG_TYPE_LWT_OUT LWT functions
Base functions
BPF_PROG_TYPE_LWT_XMIT BPF_FUNC_skb_get_tunnel_key()
BPF_FUNC_skb_set_tunnel_key()
BPF_FUNC_skb_get_tunnel_opt()
BPF_FUNC_skb_set_tunnel_opt()
BPF_FUNC_redirect()
BPF_FUNC_clone_redirect()
BPF_FUNC_skb_change_tail()
BPF_FUNC_skb_change_head()
BPF_FUNC_skb_store_bytes()
BPF_FUNC_csum_update()
BPF_FUNC_l3_csum_replace()
BPF_FUNC_l4_csum_replace()
BPF_FUNC_set_hash_invalid()
LWT functions
BPF_PROG_TYPE_SOCK_OPS BPF_FUNC_setsockopt()
BPF_FUNC_getsockopt()
BPF_FUNC_sock_ops_cb_flags_set()
BPF_FUNC_sock_map_update()
BPF_FUNC_sock_hash_update()
BPF_FUNC_get_socket_cookie()
Base functions
BPF_PROG_TYPE_SK_SKB BPF_FUNC_skb_store_bytes()
BPF_FUNC_skb_load_bytes()
BPF_FUNC_skb_pull_data()
BPF_FUNC_skb_change_tail()
BPF_FUNC_skb_change_head()
BPF_FUNC_get_socket_cookie()
BPF_FUNC_get_socket_uid()
BPF_FUNC_sk_redirect_map()
BPF_FUNC_sk_redirect_hash()
BPF_FUNC_sk_lookup_tcp()
BPF_FUNC_sk_lookup_udp()
BPF_FUNC_sk_release()
Base functions
BPF_PROG_TYPE_CGROUP_DEVICE BPF_FUNC_map_lookup_elem()
BPF_FUNC_map_update_elem()
BPF_FUNC_map_delete_elem()
BPF_FUNC_get_current_uid_gid()
BPF_FUNC_trace_printk()
BPF_PROG_TYPE_SK_MSG BPF_FUNC_msg_redirect_map()
BPF_FUNC_msg_redirect_hash()
BPF_FUNC_msg_apply_bytes()
BPF_FUNC_msg_cork_bytes()
BPF_FUNC_msg_pull_data()
BPF_FUNC_msg_push_data()
BPF_FUNC_msg_pop_data()
Base functions
BPF_PROG_TYPE_RAW_TRACEPOINT BPF_FUNC_perf_event_output()
BPF_FUNC_get_stackid()
BPF_FUNC_get_stack()
BPF_FUNC_skb_output()
Tracing functions
BPF_PROG_TYPE_CGROUP_SOCK_ADDR BPF_FUNC_get_current_uid_gid()
BPF_FUNC_bind()
BPF_FUNC_get_socket_cookie()
Base functions
BPF_PROG_TYPE_LWT_SEG6LOCAL BPF_FUNC_lwt_seg6_store_bytes()
BPF_FUNC_lwt_seg6_action()
BPF_FUNC_lwt_seg6_adjust_srh()
LWT functions
BPF_PROG_TYPE_LIRC_MODE2 BPF_FUNC_rc_repeat()
BPF_FUNC_rc_keydown()
BPF_FUNC_rc_pointer_rel()
BPF_FUNC_map_lookup_elem()
BPF_FUNC_map_update_elem()
BPF_FUNC_map_delete_elem()
BPF_FUNC_ktime_get_ns()
BPF_FUNC_tail_call()
BPF_FUNC_get_prandom_u32()
BPF_FUNC_trace_printk()
BPF_PROG_TYPE_SK_REUSEPORT BPF_FUNC_sk_select_reuseport()
BPF_FUNC_skb_load_bytes()
BPF_FUNC_skb_load_bytes_relative()
Base functions
BPF_PROG_TYPE_FLOW_DISSECTOR BPF_FUNC_skb_load_bytes()
Base functions

BPF Helper 函数分组

Function Group Functions
Base functions BPF_FUNC_map_lookup_elem()
BPF_FUNC_map_update_elem()
BPF_FUNC_map_delete_elem()
BPF_FUNC_map_peek_elem()
BPF_FUNC_map_pop_elem()
BPF_FUNC_map_push_elem()
BPF_FUNC_get_prandom_u32()
BPF_FUNC_get_smp_processor_id()
BPF_FUNC_get_numa_node_id()
BPF_FUNC_tail_call()
BPF_FUNC_ktime_get_boot_ns()
BPF_FUNC_ktime_get_ns()
BPF_FUNC_trace_printk()
BPF_FUNC_spin_lock()
BPF_FUNC_spin_unlock()
Tracing functions BPF_FUNC_map_lookup_elem()
BPF_FUNC_map_update_elem()
BPF_FUNC_map_delete_elem()
BPF_FUNC_probe_read()
BPF_FUNC_ktime_get_boot_ns()
BPF_FUNC_ktime_get_ns()
BPF_FUNC_tail_call()
BPF_FUNC_get_current_pid_tgid()
BPF_FUNC_get_current_task()
BPF_FUNC_get_current_uid_gid()
BPF_FUNC_get_current_comm()
BPF_FUNC_trace_printk()
BPF_FUNC_get_smp_processor_id()
BPF_FUNC_get_numa_node_id()
BPF_FUNC_perf_event_read()
BPF_FUNC_probe_write_user()
BPF_FUNC_current_task_under_cgroup()
BPF_FUNC_get_prandom_u32()
BPF_FUNC_probe_read_str()
BPF_FUNC_get_current_cgroup_id()
BPF_FUNC_send_signal()
BPF_FUNC_probe_read_kernel()
BPF_FUNC_probe_read_kernel_str()
BPF_FUNC_probe_read_user()
BPF_FUNC_probe_read_user_str()
BPF_FUNC_send_signal_thread()
BPF_FUNC_get_ns_current_pid_tgid()
BPF_FUNC_xdp_output()
BPF_FUNC_get_task_stack()
LWT functions BPF_FUNC_skb_load_bytes()
BPF_FUNC_skb_pull_data()
BPF_FUNC_csum_diff()
BPF_FUNC_get_cgroup_classid()
BPF_FUNC_get_route_realm()
BPF_FUNC_get_hash_recalc()
BPF_FUNC_perf_event_output()
BPF_FUNC_get_smp_processor_id()
BPF_FUNC_skb_under_cgroup()

运行 BPF 程序类型查看

BPF 程序类型可以使用工具 bpftool 进行查看

yum install bpftool -y

或者自己从源码编译

# bpftool prog help
Usage: bpftool prog { show | list } [PROG]
       bpftool prog dump xlated PROG [{ file FILE | opcodes | visual | linum }]
       bpftool prog dump jited  PROG [{ file FILE | opcodes | linum }]
       bpftool prog pin   PROG FILE
       bpftool prog { load | loadall } OBJ  PATH \
                         [type TYPE] [dev NAME] \
                         [map { idx IDX | name NAME } MAP]\
                         [pinmaps MAP_DIR]
       bpftool prog attach PROG ATTACH_TYPE [MAP]
       bpftool prog detach PROG ATTACH_TYPE [MAP]
       bpftool prog tracelog
       bpftool prog help
       MAP := { id MAP_ID | pinned FILE }
       PROG := { id PROG_ID | pinned FILE | tag PROG_TAG }
       TYPE := { socket | kprobe | kretprobe | classifier | action |
[...]

查看运行了 Hello World BPF 程序的机器,结果如下:

# bpftool prog show
7: tracepoint  tag c6e8e35bea53af79  gpl
	loaded_at 2020-09-26T13:19:22+0000  uid 0
	xlated 112B  jited 86B  memlock 4096B

bpftool 源码中也可以看到名字和类型的对应关系。

/*
 * Display name for each program type, indexed by the enum bpf_prog_type value
 * (designated initializers, so the order of the entries does not matter).
 * The "tracepoint" string is what `bpftool prog show` prints for the
 * Hello World program above.
 * NOTE(review): BPF_PROG_TYPE_SK_LOOKUP is not part of the v5.8 enum listed
 * earlier in this document, so this table appears to come from a newer tree.
 */
const char * const prog_type_name[] = {
        [BPF_PROG_TYPE_UNSPEC]                  = "unspec",
        [BPF_PROG_TYPE_SOCKET_FILTER]           = "socket_filter",
        [BPF_PROG_TYPE_KPROBE]                  = "kprobe",
        [BPF_PROG_TYPE_SCHED_CLS]               = "sched_cls",
        [BPF_PROG_TYPE_SCHED_ACT]               = "sched_act",
        [BPF_PROG_TYPE_TRACEPOINT]              = "tracepoint",
        [BPF_PROG_TYPE_XDP]                     = "xdp",
        [BPF_PROG_TYPE_PERF_EVENT]              = "perf_event",
        [BPF_PROG_TYPE_CGROUP_SKB]              = "cgroup_skb",
        [BPF_PROG_TYPE_CGROUP_SOCK]             = "cgroup_sock",
        [BPF_PROG_TYPE_LWT_IN]                  = "lwt_in",
        [BPF_PROG_TYPE_LWT_OUT]                 = "lwt_out",
        [BPF_PROG_TYPE_LWT_XMIT]                = "lwt_xmit",
        [BPF_PROG_TYPE_SOCK_OPS]                = "sock_ops",
        [BPF_PROG_TYPE_SK_SKB]                  = "sk_skb",
        [BPF_PROG_TYPE_CGROUP_DEVICE]           = "cgroup_device",
        [BPF_PROG_TYPE_SK_MSG]                  = "sk_msg",
        [BPF_PROG_TYPE_RAW_TRACEPOINT]          = "raw_tracepoint",
        [BPF_PROG_TYPE_CGROUP_SOCK_ADDR]        = "cgroup_sock_addr",
        [BPF_PROG_TYPE_LWT_SEG6LOCAL]           = "lwt_seg6local",
        [BPF_PROG_TYPE_LIRC_MODE2]              = "lirc_mode2",
        [BPF_PROG_TYPE_SK_REUSEPORT]            = "sk_reuseport",
        [BPF_PROG_TYPE_FLOW_DISSECTOR]          = "flow_dissector",
        [BPF_PROG_TYPE_CGROUP_SYSCTL]           = "cgroup_sysctl",
        [BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE] = "raw_tracepoint_writable",
        [BPF_PROG_TYPE_CGROUP_SOCKOPT]          = "cgroup_sockopt",
        [BPF_PROG_TYPE_TRACING]                 = "tracing",
        [BPF_PROG_TYPE_STRUCT_OPS]              = "struct_ops",
        [BPF_PROG_TYPE_EXT]                     = "ext",
        [BPF_PROG_TYPE_LSM]                     = "lsm",
        [BPF_PROG_TYPE_SK_LOOKUP]               = "sk_lookup",
};

TracePoint bpf 程序函数参数读取

# 可以参考到所有的 tracepoint
$ perf list  

# bcc 工具 tplist 可以查看到 tracepoint 的结构体参数
$ ./tplist -v syscalls:sys_enter_execve
syscalls:sys_enter_execve
    int __syscall_nr;
    const char * filename;
    const char *const * argv;
    const char *const * envp;

HelloWorld 改进版本 bpf_program_tp_args.c

#include <linux/bpf.h>
/* Places the annotated symbol into the ELF section called NAME and marks it
 * `used` so it is kept even though nothing references it. */
#define SEC(NAME) __attribute__((section(NAME), used))

/* Call stub for the trace_printk helper: a function pointer whose value is the
 * helper id BPF_FUNC_trace_printk rather than a real address. */
static int (*bpf_trace_printk)(const char *fmt, int fmt_size,
                               ...) = (void *)BPF_FUNC_trace_printk;

/*
 * Layout of the context handed to a tracepoint program:
 *
 * +---------+
 * | 8 bytes | hidden 'struct pt_regs *' (inaccessible to bpf program)
 * +---------+
 * | N bytes | static tracepoint fields defined in tracepoint/format (bpf readonly)
 * +---------+
 * | dynamic | __dynamic_array bytes of tracepoint (inaccessible to bpf yet)
 * +---------+
 */

/* Context of syscalls:sys_enter_execve. The fields after the hidden pad mirror
 * the `tplist -v syscalls:sys_enter_execve` output shown above. */
struct execve_params {
    __u64 hidden_pad;  // hidden 'struct pt_regs *' (inaccessible to bpf program)
                       // https://lore.kernel.org/patchwork/patch/664886/
    int syscall_nr;
    const char * filename;
    const char *const * argv;
    const char *const * envp;
};

/*
 * Tracepoint program for sys_enter_execve: prints the syscall number taken
 * from the tracepoint context. Always returns 0.
 */
SEC("tracepoint/syscalls/sys_enter_execve")
int bpf_prog(struct execve_params *ctx) {
  /* Only the syscall number is passed to the helper, so the format must not
   * advertise a filename; terminate the message with a newline. */
  char fmt[] = "sysnr %d\n";
  int nr = ctx->syscall_nr;
  bpf_trace_printk(fmt, sizeof(fmt), nr);
  return 0;
}

/* Contents of the "license" section are read by the loader and passed along
 * when the program is loaded. */
char _license[] SEC("license") = "GPL";

参考

完整函数代码

samples/bpf/bpf_load.h 头文件

  7 #define MAX_MAPS 32      // capacity of the map_fd[] / map_data[] arrays declared below
  8 #define MAX_PROGS 32     // capacity of the prog_fd[] / event_fd[] arrays declared below
    
  1 /* SPDX-License-Identifier: GPL-2.0 */
  2 #ifndef __BPF_LOAD_H
  3 #define __BPF_LOAD_H
  4
  5 #include <bpf/bpf.h>
  6
  7 #define MAX_MAPS 32
  8 #define MAX_PROGS 32
  9
    /* Description of one map; embedded in struct bpf_map_data as 'def'.
     * NOTE(review): presumably mirrors the entries of the ELF 'maps' section - confirm in bpf_load.c.
     */
 10 struct bpf_load_map_def {
 11         unsigned int type;
 12         unsigned int key_size;
 13         unsigned int value_size;
 14         unsigned int max_entries;
 15         unsigned int map_flags;
 16         unsigned int inner_map_idx;
 17         unsigned int numa_node;
 18 };
 19
    /* Per-map record: fd, name, offset within the ELF file and the map definition. */
 20 struct bpf_map_data {
 21         int fd;
 22         char *name;
 23         size_t elf_offset;
 24         struct bpf_load_map_def def;
 25 };
 26
    /* Callback type accepted by load_bpf_file_fixup_map(); receives a map record and its index. */
 27 typedef void (*fixup_map_cb)(struct bpf_map_data *map, int idx);
 28
 29 extern int prog_fd[MAX_PROGS];
 30 extern int event_fd[MAX_PROGS];
 31 extern char bpf_log_buf[BPF_LOG_BUF_SIZE];
 32 extern int prog_cnt;
 33
 34 /* There is a one-to-one mapping between map_fd[] and map_data[].
 35  * The map_data[] just contains more rich info on the given map.
 36  */
 37 extern int map_fd[MAX_MAPS];
 38 extern struct bpf_map_data map_data[MAX_MAPS];
 39 extern int map_data_count;
 40
 41 /* parses elf file compiled by llvm .c->.o
 42  * . parses 'maps' section and creates maps via BPF syscall
 43  * . parses 'license' section and passes it to syscall
 44  * . parses elf relocations for BPF maps and adjusts BPF_LD_IMM64 insns by
 45  *   storing map_fd into insn->imm and marking such insns as BPF_PSEUDO_MAP_FD
 46  * . loads eBPF programs via BPF syscall
 47  *
 48  * One ELF file can contain multiple BPF programs which will be loaded
 49  * and their FDs stored in prog_fd array
 50  *
 51  * returns zero on success
 52  */
 53 int load_bpf_file(char *path);
 54 int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map);
 55
 56 int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags);
 57 #endif

load_and_attach 完整定义

 /*
  * Load one BPF program and, depending on its kind, attach it.
  *
  * event: section name; its prefix ("socket", "kprobe/", "tracepoint/", ...)
  *        selects the program type
  * prog:  instructions of the program
  * size:  size of prog in bytes (divided by sizeof(struct bpf_insn) below)
  *
  * The program fd is appended to prog_fd[]. Returns 0 on success and -1 on
  * failure.
  */
 76 static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 77 {
 78         bool is_socket = strncmp(event, "socket", 6) == 0;
 79         bool is_kprobe = strncmp(event, "kprobe/", 7) == 0;
 80         bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0;
 81         bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0;
 82         bool is_raw_tracepoint = strncmp(event, "raw_tracepoint/", 15) == 0;
 83         bool is_xdp = strncmp(event, "xdp", 3) == 0;
 84         bool is_perf_event = strncmp(event, "perf_event", 10) == 0;
 85         bool is_cgroup_skb = strncmp(event, "cgroup/skb", 10) == 0;
 86         bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0;
 87         bool is_sockops = strncmp(event, "sockops", 7) == 0;
 88         bool is_sk_skb = strncmp(event, "sk_skb", 6) == 0;
 89         bool is_sk_msg = strncmp(event, "sk_msg", 6) == 0;
 90         size_t insns_cnt = size / sizeof(struct bpf_insn);
 91         enum bpf_prog_type prog_type;
 92         char buf[256];
 93         int fd, efd, err, id;
 94         struct perf_event_attr attr = {};
 95
 96         attr.type = PERF_TYPE_TRACEPOINT;
 97         attr.sample_type = PERF_SAMPLE_RAW;
 98         attr.sample_period = 1;
 99         attr.wakeup_events = 1;
100
            /* Map the section-name prefix to a BPF program type. */
101         if (is_socket) {
102                 prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
103         } else if (is_kprobe || is_kretprobe) {
104                 prog_type = BPF_PROG_TYPE_KPROBE;
105         } else if (is_tracepoint) {
106                 prog_type = BPF_PROG_TYPE_TRACEPOINT;
107         } else if (is_raw_tracepoint) {
108                 prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT;
109         } else if (is_xdp) {
110                 prog_type = BPF_PROG_TYPE_XDP;
111         } else if (is_perf_event) {
112                 prog_type = BPF_PROG_TYPE_PERF_EVENT;
113         } else if (is_cgroup_skb) {
114                 prog_type = BPF_PROG_TYPE_CGROUP_SKB;
115         } else if (is_cgroup_sk) {
116                 prog_type = BPF_PROG_TYPE_CGROUP_SOCK;
117         } else if (is_sockops) {
118                 prog_type = BPF_PROG_TYPE_SOCK_OPS;
119         } else if (is_sk_skb) {
120                 prog_type = BPF_PROG_TYPE_SK_SKB;
121         } else if (is_sk_msg) {
122                 prog_type = BPF_PROG_TYPE_SK_MSG;
123         } else {
124                 printf("Unknown event '%s'\n", event);
125                 return -1;
126         }
127
128         if (prog_cnt == MAX_PROGS) /*#define MAX_PROGS 32 samples/bpf/bpf_load.h*/
129                 return -1;
130
131         fd = bpf_load_program(prog_type, prog, insns_cnt, license, kern_version,
132                               bpf_log_buf, BPF_LOG_BUF_SIZE);
133         if (fd < 0) {
134                 printf("bpf_load_program() err=%d\n%s", errno, bpf_log_buf);
135                 return -1;
136         }
137
138         prog_fd[prog_cnt++] = fd;
139
            /* These kinds are only loaded here; this function does not attach them. */
140         if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk)
141                 return 0;
142
            /* Socket-style programs: an optional "/<digit>..." suffix is handed to populate_prog_array(). */
            /* NOTE(review): "sk_skb" and "sk_msg" are 6 bytes long but 7 bytes are skipped here, */
            /* so the '/' test below looks one byte past the separator for them - confirm intent. */
143         if (is_socket || is_sockops || is_sk_skb || is_sk_msg) {
144                 if (is_socket)
145                         event += 6;
146                 else
147                         event += 7;
148                 if (*event != '/')
149                         return 0;
150                 event++;
151                 if (!isdigit(*event)) {
152                         printf("invalid prog number\n");
153                         return -1;
154                 }
155                 return populate_prog_array(event, fd);
156         }
157
            /* Raw tracepoints: the name after "raw_tracepoint/" goes to bpf_raw_tracepoint_open(). */
158         if (is_raw_tracepoint) {
159                 efd = bpf_raw_tracepoint_open(event + 15, fd);
160                 if (efd < 0) {
161                         printf("tracepoint %s %s\n", event + 15, strerror(errno));
162                         return -1;
163                 }
164                 event_fd[prog_cnt - 1] = efd;
165                 return 0;
166         }
167
168         if (is_kprobe || is_kretprobe) {
169                 bool need_normal_check = true;
170                 const char *event_prefix = "";
171
172                 if (is_kprobe)
173                         event += 7;
174                 else
175                         event += 10;
176
177                 if (*event == 0) {
178                         printf("event name cannot be empty\n");
179                         return -1;
180                 }
181
182                 if (isdigit(*event))
183                         return populate_prog_array(event, fd);
184
185 #ifdef __x86_64__
                    /* On x86-64, "sys_*" names are first tried with the "__x64_" prefix. */
186                 if (strncmp(event, "sys_", 4) == 0) {
187                         snprintf(buf, sizeof(buf), "%c:__x64_%s __x64_%s",
188                                 is_kprobe ? 'p' : 'r', event, event);
189                         err = write_kprobe_events(buf);
190                         if (err >= 0) {
191                                 need_normal_check = false;
192                                 event_prefix = "__x64_";
193                         }
194                 }
195 #endif
                    /* Otherwise (or if the prefixed attempt failed) register the probe under the plain name. */
196                 if (need_normal_check) {
197                         snprintf(buf, sizeof(buf), "%c:%s %s",
198                                 is_kprobe ? 'p' : 'r', event, event);
199                         err = write_kprobe_events(buf);
200                         if (err < 0) {
201                                 printf("failed to create kprobe '%s' error '%s'\n",
202                                        event, strerror(errno));
203                                 return -1;
204                         }
205                 }
206
                    /* Build the path DEBUGFS "events/kprobes/<prefix><event>/id". */
207                 strcpy(buf, DEBUGFS);
208                 strcat(buf, "events/kprobes/");
209                 strcat(buf, event_prefix);
210                 strcat(buf, event);
211                 strcat(buf, "/id");
212         } else if (is_tracepoint) { // tracepoint: build the path DEBUGFS "events/<event>/id"
213                 event += 11;
214
215                 if (*event == 0) {
216                         printf("event name cannot be empty\n");
217                         return -1;
218                 }
219                 strcpy(buf, DEBUGFS); // "/sys/kernel/debug/tracing/"
220                 strcat(buf, "events/");
221                 strcat(buf, event);
222                 strcat(buf, "/id");
223         }
224
            /* Read the numeric event id from the "id" file whose path was built in buf. */
225         efd = open(buf, O_RDONLY, 0);
226         if (efd < 0) {
227                 printf("failed to open event %s\n", event);
228                 return -1;
229         }
230
231         err = read(efd, buf, sizeof(buf));
            /* NOTE(review): efd is not closed on this error path. */
232         if (err < 0 || err >= sizeof(buf)) {
233                 printf("read from '%s' failed '%s'\n", event, strerror(errno));
234                 return -1;
235         }
236
237         close(efd);
238
239         buf[err] = 0;
240         id = atoi(buf);
241         attr.config = id;
242
            /* Open a perf event for that id, enable it, then attach the BPF program fd to it. */
243         efd = sys_perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0);
244         if (efd < 0) {
245                 printf("event %d fd %d err %s\n", id, efd, strerror(errno));
246                 return -1;
247         }
248         event_fd[prog_cnt - 1] = efd;
249         err = ioctl(efd, PERF_EVENT_IOC_ENABLE, 0);
250         if (err < 0) {
251                 printf("ioctl PERF_EVENT_IOC_ENABLE failed err %s\n",
252                        strerror(errno));
253                 return -1;
254         }
255         err = ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd);
256         if (err < 0) {
257                 printf("ioctl PERF_EVENT_IOC_SET_BPF failed err %s\n",
258                        strerror(errno));
259                 return -1;
260         }
261
262         return 0;
263 }

perf event 文件描述符的 ioctl 处理函数定义在 kernel/events/core.c 文件中

 /* File operations table for perf events; ioctl requests are routed to perf_ioctl. */
 6199 static const struct file_operations perf_fops = {
 6200         .llseek                 = no_llseek,
 6201         .release                = perf_release,
 6202         .read                   = perf_read,
 6203         .poll                   = perf_poll,
 6204         .unlocked_ioctl         = perf_ioctl,
 6205         .compat_ioctl           = perf_compat_ioctl,
 6206         .mmap                   = perf_mmap,
 6207         .fasync                 = perf_fasync,
 6208 };

 /*
  * ioctl entry point for a perf event file: runs the security hook, then
  * calls _perf_ioctl() with the event context locked.
  * Returns the hook's non-zero result, or else whatever _perf_ioctl() returns.
  */
 5517 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 5518 {
 5519         struct perf_event *event = file->private_data;
 5520         struct perf_event_context *ctx;
 5521         long ret;
 5522
 5523         /* Treat ioctl like writes as it is likely a mutating operation. */
 5524         ret = security_perf_event_write(event);
 5525         if (ret)
 5526                 return ret;
 5527
              /* The context lock is held only around the _perf_ioctl() call. */
 5528         ctx = perf_event_ctx_lock(event);
 5529         ret = _perf_ioctl(event, cmd, arg);
 5530         perf_event_ctx_unlock(event, ctx);
 5531
 5532         return ret;
 5533 }

 5417 static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
 5418 {
         // ...
 5475         case PERF_EVENT_IOC_SET_BPF:
 5476                 return perf_event_set_bpf_prog(event, arg);
        // ...
 }
 
 // perf_event_set_bpf_prog 定义如下
 /*
  * Attach the BPF program referred to by prog_fd to a perf event.
  *
  * Non-tracing events are delegated to perf_event_set_bpf_handler(). For
  * tracing events the program type must match the event kind, and the program
  * must not access the context beyond trace_event_get_offsets().
  * Returns 0 on success or a negative errno (-EINVAL, -EACCES, or the error
  * from bpf_prog_get() / perf_event_attach_bpf_prog()).
  */
 9593 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 9594 {
 9595         bool is_kprobe, is_tracepoint, is_syscall_tp;
 9596         struct bpf_prog *prog;
 9597         int ret;
 9598
 9599         if (!perf_event_is_tracing(event))
 9600                 return perf_event_set_bpf_handler(event, prog_fd);
 9601
              /* Classify the event from its tp_event flags. */
 9602         is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
 9603         is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
 9604         is_syscall_tp = is_syscall_trace_event(event->tp_event);
 9605         if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
 9606                 /* bpf programs can only be attached to u/kprobe or tracepoint */
 9607                 return -EINVAL;
 9608
              /* Takes a reference on the program; dropped with bpf_prog_put() on every failure below. */
 9609         prog = bpf_prog_get(prog_fd);
 9610         if (IS_ERR(prog))
 9611                 return PTR_ERR(prog);
 9612
 9613         if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
 9614             (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
 9615             (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
 9616                 /* valid fd, but invalid bpf program type */
 9617                 bpf_prog_put(prog);
 9618                 return -EINVAL;
 9619         }
 9620
 9621         /* Kprobe override only works for kprobes, not uprobes. */
 9622         if (prog->kprobe_override &&
 9623             !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) {
 9624                 bpf_prog_put(prog);
 9625                 return -EINVAL;
 9626         }
 9627
              /* Reject programs whose largest context access exceeds the event's offsets. */
 9628         if (is_tracepoint || is_syscall_tp) {
 9629                 int off = trace_event_get_offsets(event->tp_event);
 9630
 9631                 if (prog->aux->max_ctx_offset > off) {
 9632                         bpf_prog_put(prog);
 9633                         return -EACCES;
 9634                 }
 9635         }
 9636
 9637         ret = perf_event_attach_bpf_prog(event, prog);
 9638         if (ret)
 9639                 bpf_prog_put(prog);
 9640         return ret;
 9641 }
  // 函数最终调用 perf_event_attach_bpf_prog

该函数 perf_event_attach_bpf_prog 定义在 kernel/trace/bpf_trace.c 中

/* Upper bound on the length of one trace event's prog_array, enforced below. */
1687 #define BPF_TRACE_MAX_PROGS 64
1688
/*
 * Add prog to the prog_array of event->tp_event and record it in event->prog.
 *
 * Returns 0 on success, -EINVAL if a kprobe-override program is attached to
 * an unsuitable probe, -EEXIST if the event already has a program, -E2BIG if
 * the array already holds BPF_TRACE_MAX_PROGS entries, or the negative result
 * of bpf_prog_array_copy().
 */
1689 int perf_event_attach_bpf_prog(struct perf_event *event,
1690                                struct bpf_prog *prog)
1691 {
1692         struct bpf_prog_array *old_array;
1693         struct bpf_prog_array *new_array;
1694         int ret = -EEXIST;
1695
1696         /*
1697          * Kprobe override only works if they are on the function entry,
1698          * and only if they are on the opt-in list.
1699          */
1700         if (prog->kprobe_override &&
1701             (!trace_kprobe_on_func_entry(event->tp_event) ||
1702              !trace_kprobe_error_injectable(event->tp_event)))
1703                 return -EINVAL;
1704
1705         mutex_lock(&bpf_event_mutex);
1706
             /* ret is still -EEXIST here: the event already has a program attached. */
1707         if (event->prog)
1708                 goto unlock;
1709
1710         old_array = bpf_event_rcu_dereference(event->tp_event->prog_array);
1711         if (old_array &&
1712             bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) {
1713                 ret = -E2BIG;
1714                 goto unlock;
1715         }
1716
             /* Build new_array from old_array with prog included; old_array is freed only after the new one is published. */
1717         ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array);
1718         if (ret < 0)
1719                 goto unlock;
1720
1721         /* set the new array to event->tp_event and set event->prog */
1722         event->prog = prog;
1723         rcu_assign_pointer(event->tp_event->prog_array, new_array);
1724         bpf_prog_array_free(old_array);
1725
1726 unlock:
1727         mutex_unlock(&bpf_event_mutex);
1728         return ret;
1729 }
1730

最终触发的逻辑:

 // Called when the perf trace event fires: runs the attached BPF programs
 // first, then submits the sample with perf_tp_event() unless the BPF path bails out.
 9240 void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
 9241                                struct trace_event_call *call, u64 count,
 9242                                struct pt_regs *regs, struct hlist_head *head,
 9243                                struct task_struct *task)
 9244 {
 9245         if (bpf_prog_array_valid(call)) {
                      // The first pointer-sized slot of raw_data is overwritten with the regs pointer.
 9246                 *(struct pt_regs **)raw_data = regs;
 9247                 if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) { // run the BPF programs; stop if they return 0 or head is empty
 9248                         perf_swevent_put_recursion_context(rctx);
 9249                         return;
 9250                 }
 9251         }
 9252         perf_tp_event(call->event.type, count, raw_data, size, regs, head,
 9253                       rctx, task);
 9254 }
 9255 EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);

函数 bpf_prog_array_valid 位于文件 include/linux/trace_events.h

312 #ifdef CONFIG_PERF_EVENTS
/* Returns true when call->prog_array is currently non-NULL. */
313 static inline bool bpf_prog_array_valid(struct trace_event_call *call)
314 {
315         /*
316          * This inline function checks whether call->prog_array
317          * is valid or not. The function is called in various places,
318          * outside rcu_read_lock/unlock, as a heuristic to speed up execution.
319          *
320          * If this function returns true, and later call->prog_array
321          * becomes false inside rcu_read_lock/unlock region,
322          * we bail out then. If this function returns false,
323          * there is a risk that we might miss a few events if the checking
324          * were delayed until inside rcu_read_lock/unlock region and
325          * call->prog_array happened to become non-NULL then.
326          *
327          * Here, READ_ONCE() is used instead of rcu_access_pointer().
328          * rcu_access_pointer() requires the actual definition of
329          * "struct bpf_prog_array" while READ_ONCE() only needs
330          * a declaration of the same type.
331          */
332         return !!READ_ONCE(call->prog_array);
333 }
334 #endif

结构 struct perf_event 定义在 include/linux/perf_event.h 中,其 tp_event 字段指向 struct trace_event_call;struct trace_event_call 定义在 include/linux/trace_events.h 中

611 struct perf_event {
    // ...
749 #ifdef CONFIG_EVENT_TRACING
750         struct trace_event_call         *tp_event;   // -> trace_event_call
751         struct event_filter             *filter;
752 #ifdef CONFIG_FUNCTION_TRACER
    // ...
}   

/* Trace event descriptor; struct perf_event refers to it through its tp_event field. */
278 struct trace_event_call {
279         struct list_head        list;
280         struct trace_event_class *class;
281         union {
282                 char                    *name;
283                 /* Set TRACE_EVENT_FL_TRACEPOINT flag when using "tp" */
284                 struct tracepoint       *tp;
285         };
286         struct trace_event      event;
287         char                    *print_fmt;
288         struct event_filter     *filter;
289         void                    *mod;
290         void                    *data;
291         /*
292          *   bit 0:             filter_active
293          *   bit 1:             allow trace by non root (cap any)
294          *   bit 2:             failed to apply filter
295          *   bit 3:             trace internal event (do not enable)
296          *   bit 4:             Event was enabled by module
297          *   bit 5:             use call filter rather than file filter
298          *   bit 6:             Event is a tracepoint
299          */
300         int                     flags; /* static flags of different events */
301
302 #ifdef CONFIG_PERF_EVENTS
303         int                             perf_refcount;
304         struct hlist_head __percpu      *perf_events;
305         struct bpf_prog_array __rcu     *prog_array;  // BPF programs attached to this event
306
307         int     (*perf_perm)(struct trace_event_call *,
308                              struct perf_event *);
309 #endif
310 };

bpf_load_program 函数定义在 tools/lib/bpf/bpf.c 文件中

/*
 * Convenience wrapper around bpf_load_program_xattr(): fills a zeroed
 * struct bpf_load_program_attr with the program type, instructions, license
 * and kernel version (no name, expected_attach_type 0) and returns that
 * function's result.
 */
332 int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns,
333                      size_t insns_cnt, const char *license,
334                      __u32 kern_version, char *log_buf,
335                      size_t log_buf_sz)
336 {
337         struct bpf_load_program_attr load_attr;
338
339         memset(&load_attr, 0, sizeof(struct bpf_load_program_attr));
340         load_attr.prog_type = type;
341         load_attr.expected_attach_type = 0;
342         load_attr.name = NULL;
343         load_attr.insns = insns;
344         load_attr.insns_cnt = insns_cnt;
345         load_attr.license = license;
346         load_attr.kern_version = kern_version;
347
348         return bpf_load_program_xattr(&load_attr, log_buf, log_buf_sz);
349 }

/*
 * Translate a struct bpf_load_program_attr into a union bpf_attr and load the
 * program through sys_bpf_prog_load().
 *
 * load_attr:  load parameters; must not be NULL
 * log_buf:    optional buffer for the log; must be given together with a
 *             non-zero log_buf_sz, or both omitted
 * log_buf_sz: size of log_buf
 *
 * Returns the program fd (>= 0) on success. Returns -EINVAL for invalid
 * arguments; otherwise returns the result of the last sys_bpf_prog_load()
 * attempt.
 */
220 int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr,
221                            char *log_buf, size_t log_buf_sz)
222 {
223         void *finfo = NULL, *linfo = NULL;
224         union bpf_attr attr;
225         __u32 log_level;
226         int fd;
227
            /* log_buf and log_buf_sz must be both set or both unset. */
228         if (!load_attr || !log_buf != !log_buf_sz)
229                 return -EINVAL;
230
231         log_level = load_attr->log_level;
232         if (log_level > (4 | 2 | 1) || (log_level && !log_buf))
233                 return -EINVAL;
234
235         memset(&attr, 0, sizeof(attr));
236         attr.prog_type = load_attr->prog_type;
237         attr.expected_attach_type = load_attr->expected_attach_type;
            /* Which attach/ifindex/version fields are copied depends on the program type. */
238         if (attr.prog_type == BPF_PROG_TYPE_STRUCT_OPS ||
239             attr.prog_type == BPF_PROG_TYPE_LSM) {
240                 attr.attach_btf_id = load_attr->attach_btf_id;
241         } else if (attr.prog_type == BPF_PROG_TYPE_TRACING ||
242                    attr.prog_type == BPF_PROG_TYPE_EXT) {
243                 attr.attach_btf_id = load_attr->attach_btf_id;
244                 attr.attach_prog_fd = load_attr->attach_prog_fd;
245         } else {
246                 attr.prog_ifindex = load_attr->prog_ifindex;
247                 attr.kern_version = load_attr->kern_version;
248         }
249         attr.insn_cnt = (__u32)load_attr->insns_cnt;
250         attr.insns = ptr_to_u64(load_attr->insns);
251         attr.license = ptr_to_u64(load_attr->license);
252
            /* The log buffer is passed on the first attempt only when a log level was requested. */
253         attr.log_level = log_level;
254         if (log_level) {
255                 attr.log_buf = ptr_to_u64(log_buf);
256                 attr.log_size = log_buf_sz;
257         } else {
258                 attr.log_buf = ptr_to_u64(NULL);
259                 attr.log_size = 0;
260         }
261
262         attr.prog_btf_fd = load_attr->prog_btf_fd;
263         attr.func_info_rec_size = load_attr->func_info_rec_size;
264         attr.func_info_cnt = load_attr->func_info_cnt;
265         attr.func_info = ptr_to_u64(load_attr->func_info);
266         attr.line_info_rec_size = load_attr->line_info_rec_size;
267         attr.line_info_cnt = load_attr->line_info_cnt;
268         attr.line_info = ptr_to_u64(load_attr->line_info);
            /* attr was zeroed above, so copying at most BPF_OBJ_NAME_LEN - 1 bytes leaves the name NUL-terminated. */
269         if (load_attr->name)
270                 memcpy(attr.prog_name, load_attr->name,
271                        min(strlen(load_attr->name), BPF_OBJ_NAME_LEN - 1));
272         attr.prog_flags = load_attr->prog_flags;
273
274         fd = sys_bpf_prog_load(&attr, sizeof(attr));
275         if (fd >= 0)
276                 return fd;
277
278         /* After bpf_prog_load, the kernel may modify certain attributes
279          * to give user space a hint how to deal with loading failure.
280          * Check to see whether we can make some changes and load again.
281          */
282         while (errno == E2BIG && (!finfo || !linfo)) {
283                 if (!finfo && attr.func_info_cnt &&
284                     attr.func_info_rec_size < load_attr->func_info_rec_size) {
285                         /* try with corrected func info records */
286                         finfo = alloc_zero_tailing_info(load_attr->func_info,
287                                                         load_attr->func_info_cnt,
288                                                         load_attr->func_info_rec_size,
289                                                         attr.func_info_rec_size);
290                         if (!finfo)
291                                 goto done;
292
293                         attr.func_info = ptr_to_u64(finfo);
294                         attr.func_info_rec_size = load_attr->func_info_rec_size;
295                 } else if (!linfo && attr.line_info_cnt &&
296                            attr.line_info_rec_size <
297                            load_attr->line_info_rec_size) {
                            /* same retry for the line info records */
298                         linfo = alloc_zero_tailing_info(load_attr->line_info,
299                                                         load_attr->line_info_cnt,
300                                                         load_attr->line_info_rec_size,
301                                                         attr.line_info_rec_size);
302                         if (!linfo)
303                                 goto done;
304
305                         attr.line_info = ptr_to_u64(linfo);
306                         attr.line_info_rec_size = load_attr->line_info_rec_size;
307                 } else {
308                         break;
309                 }
310
311                 fd = sys_bpf_prog_load(&attr, sizeof(attr));
312
313                 if (fd >= 0)
314                         goto done;
315         }
316
            /* Retry with logging only if it was not already on and the caller supplied a buffer. */
317         if (log_level || !log_buf)
318                 goto done;
319
320         /* Try again with log */
321         attr.log_buf = ptr_to_u64(log_buf);
322         attr.log_size = log_buf_sz;
323         attr.log_level = 1;
324         log_buf[0] = 0;
325         fd = sys_bpf_prog_load(&attr, sizeof(attr));
326 done:
327         free(finfo);
328         free(linfo);
329         return fd;
330 }