1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
|
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net>
*/
#include <sysdep/stub.h>
#include <linux/futex.h>
#include <sys/socket.h>
#include <errno.h>
/*
* Known security issues
*
* Userspace can jump to this address to execute *any* syscall that is
* permitted by the stub. As we will return afterwards, it can do
* whatever it likes, including:
* - Tricking the kernel into handing out the memory FD
* - Using this memory FD to read/write all physical memory
* - Running in parallel to the kernel processing a syscall
* (possibly creating data races?)
* - Blocking e.g. SIGALRM to avoid time based scheduling
*
* To avoid this, the permitted location for each syscall needs to be
* checked in the SECCOMP filter (which is reasonably simple). Also,
* more care needs to go into considering how the code might be
* tricked by using a prepared stack (or even by modifying the stack
* from another thread in case SMP support is added).
*
* As for SIGALRM, the best countermeasure will be to check in the
* kernel that the process reports back the SIGALRM in a timely
* fashion.
*/
static __always_inline int syscall_handler(int fd_map[STUB_MAX_FDS])
{
struct stub_data *d = get_stub_data();
int i;
unsigned long res;
int fd;
for (i = 0; i < d->syscall_data_len; i++) {
struct stub_syscall *sc = &d->syscall_data[i];
switch (sc->syscall) {
case STUB_SYSCALL_MMAP:
if (fd_map)
fd = fd_map[sc->mem.fd];
else
fd = sc->mem.fd;
res = stub_syscall6(STUB_MMAP_NR,
sc->mem.addr, sc->mem.length,
sc->mem.prot,
MAP_SHARED | MAP_FIXED,
fd, sc->mem.offset);
if (res != sc->mem.addr) {
d->err = res;
d->syscall_data_len = i;
return -1;
}
break;
case STUB_SYSCALL_MUNMAP:
res = stub_syscall2(__NR_munmap,
sc->mem.addr, sc->mem.length);
if (res) {
d->err = res;
d->syscall_data_len = i;
return -1;
}
break;
default:
d->err = -95; /* EOPNOTSUPP */
d->syscall_data_len = i;
return -1;
}
}
d->err = 0;
d->syscall_data_len = 0;
return 0;
}
/*
 * Stub entry point: run the queued syscalls without any FD translation
 * (NULL fd_map) and then call trap_myself().
 */
void __section(".__syscall_stub") stub_syscall_handler(void)
{
	/* The outcome is recorded in the stub data by syscall_handler(). */
	(void)syscall_handler(NULL);

	trap_myself();
}
/*
 * Signal handler executed inside the stub.
 *
 * @sig:  signal number; stored in d->signal
 * @info: siginfo of the signal; its offset from d->sigstack is stored in
 *        d->si_offset
 * @p:    ucontext of the interrupted code; the offset of its mcontext
 *        from d->sigstack is stored in d->mctx_offset
 *
 * After recording the above, the handler sets d->futex to FUTEX_IN_KERN,
 * wakes one waiter on it and sleeps until the value changes. If syscalls
 * were queued in the meantime (d->syscall_data_len != 0), file descriptors
 * passed over FD 0 are received, the queue is run with them as fd_map and
 * the received descriptors are closed again. When running the queue fails
 * or d->restart_wait is set, SIGSYS is reported and the wait is repeated.
 * Unexpected futex or recvmsg errors terminate the process via
 * exit_group(1).
 */
void __section(".__syscall_stub")
stub_signal_interrupt(int sig, siginfo_t *info, void *p)
{
	struct stub_data *d = get_stub_data();
	char rcv_data;
	union {
		char data[CMSG_SPACE(sizeof(int) * STUB_MAX_FDS)];
		struct cmsghdr align;
	} ctrl = {};
	struct iovec iov = {
		.iov_base = &rcv_data,
		.iov_len = 1,
	};
	struct msghdr msghdr = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = &ctrl,
		.msg_controllen = sizeof(ctrl),
	};
	ucontext_t *uc = p;
	struct cmsghdr *fd_msg;
	int *fd_map;
	int num_fds;
	long res;

	d->signal = sig;
	d->si_offset = (unsigned long)info - (unsigned long)&d->sigstack[0];
	d->mctx_offset = (unsigned long)&uc->uc_mcontext - (unsigned long)&d->sigstack[0];

restart_wait:
	d->futex = FUTEX_IN_KERN;
	do {
		res = stub_syscall3(__NR_futex, (unsigned long)&d->futex,
				    FUTEX_WAKE, 1);
	} while (res == -EINTR);

	/* Sleep until d->futex no longer reads FUTEX_IN_KERN. */
	do {
		res = stub_syscall4(__NR_futex, (unsigned long)&d->futex,
				    FUTEX_WAIT, FUTEX_IN_KERN, 0);
	} while (res == -EINTR || d->futex == FUTEX_IN_KERN);

	if (res < 0 && res != -EAGAIN)
		stub_syscall1(__NR_exit_group, 1);

	if (d->syscall_data_len) {
		/* Read passed FDs (if any) */
		do {
			/*
			 * A successful recvmsg() replaces msg_controllen with
			 * the length that was actually received. Restore the
			 * full buffer size before every call, as we may get
			 * here again through restart_wait and would otherwise
			 * risk truncating the control message.
			 */
			msghdr.msg_controllen = sizeof(ctrl);
			res = stub_syscall3(__NR_recvmsg, 0, (unsigned long)&msghdr, 0);
		} while (res == -EINTR);

		/* We should never have a receive error (other than -EAGAIN) */
		if (res < 0 && res != -EAGAIN)
			stub_syscall1(__NR_exit_group, 1);

		/* Receive the FDs */
		num_fds = 0;
		fd_msg = msghdr.msg_control;
		/*
		 * CMSG_DATA() already evaluates to a pointer to the payload;
		 * taking its address only compiles when the libc happens to
		 * implement it as an array member.
		 */
		fd_map = (void *)CMSG_DATA(fd_msg);
		if (res == iov.iov_len && msghdr.msg_controllen > sizeof(struct cmsghdr))
			num_fds = (fd_msg->cmsg_len - CMSG_LEN(0)) / sizeof(int);

		/* Try running queued syscalls. */
		res = syscall_handler(fd_map);

		/* The received FDs are only needed for the queued syscalls. */
		while (num_fds)
			stub_syscall1(__NR_close, fd_map[--num_fds]);
	} else {
		res = 0;
	}

	if (res < 0 || d->restart_wait) {
		/* Report SIGSYS if we restart. */
		d->signal = SIGSYS;
		d->restart_wait = 0;

		goto restart_wait;
	}

	/* Restore arch dependent state that is not part of the mcontext */
	stub_seccomp_restore_state(&d->arch_data);

	/* Return so that the host modified mcontext is restored. */
}
/*
 * Perform the rt_sigreturn syscall from within the stub section.
 *
 * NOTE(review): presumably installed as the restorer for
 * stub_signal_interrupt(); the registration is not visible in this file.
 */
void __section(".__syscall_stub")
stub_signal_restorer(void)
{
/*
 * We must not have anything on the stack when doing rt_sigreturn, so the
 * body is nothing but the raw syscall.
 */
stub_syscall0(__NR_rt_sigreturn);
}
|