Source file
src/syscall/exec_linux.go
Documentation: syscall
1
2
3
4
5
6
7 package syscall
8
9 import (
10 "runtime"
11 "unsafe"
12 )
13
14
15
// SysProcIDMap holds one ID-mapping entry for a user namespace.
// writeIDMappings emits each entry as a single line of the form
// "ContainerID HostID Size\n" into /proc/<pid>/uid_map or
// /proc/<pid>/gid_map.
16 type SysProcIDMap struct {
17 ContainerID int // First field of the map line; presumably the first ID as seen inside the namespace (see user_namespaces(7)) — confirm.
18 HostID int // Second field of the map line; presumably the first ID as seen outside the namespace — confirm.
19 Size int // Third field of the map line; presumably the number of consecutive IDs mapped — confirm.
20 }
21
// SysProcAttr holds the Linux-specific settings that forkAndExecInChild1
// applies in the child process between the clone and the exec.
22 type SysProcAttr struct {
23 Chroot string // NOTE(review): not read in this file's visible code; presumably converted by the caller into the chroot argument — confirm.
24 Credential *Credential // If non-nil, the child calls setgroups (unless suppressed), setgid and setuid with its Groups, Gid and Uid.
25 Ptrace bool // Issue ptrace(PTRACE_TRACEME) in the child just before exec.
26 Setsid bool // Call setsid in the child.
27 Setpgid bool // Call setpgid(0, Pgid) in the child.
28 Setctty bool // Make fd Ctty the controlling terminal via ioctl(TIOCSCTTY).
29 Noctty bool // Issue ioctl(TIOCNOTTY) on fd 0.
30 Ctty int // File descriptor used for the TIOCSPGRP and TIOCSCTTY ioctls.
31 Foreground bool // Call setpgid as for Setpgid, then make that process group the foreground group of Ctty via ioctl(TIOCSPGRP).
32 Pgid int // Process group ID passed to setpgid; when 0, the child's own PID is used for TIOCSPGRP.
33 Pdeathsig Signal // If non-zero, set with prctl(PR_SET_PDEATHSIG); also sent to the child itself if its parent PID already changed.
34 Cloneflags uintptr // Extra flags OR'ed into the clone system call.
35 Unshareflags uintptr // If non-zero, flags passed to unshare in the child.
36 UidMappings []SysProcIDMap // If non-nil, written by the parent to /proc/<pid>/uid_map.
37 GidMappings []SysProcIDMap // If non-nil, written by the parent to /proc/<pid>/gid_map.
38
39
40
41
// GidMappingsEnableSetgroups is only consulted when GidMappings is non-nil:
// the parent writes "allow" (true) or "deny" (false) to /proc/<pid>/setgroups
// before writing gid_map, and when it is false the child skips the
// setgroups call if Credential.Groups is empty.
42 GidMappingsEnableSetgroups bool
43 AmbientCaps []uintptr // Each value is raised in the child with prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap).
44 }
45
// NUL-terminated byte strings whose addresses forkAndExecInChild1 passes
// to the mount system call (source "none", target "/") after unsharing
// the mount namespace. They are package-level arrays so the child does
// not have to build them after the fork.
46 var (
47 none = [...]byte{'n', 'o', 'n', 'e', 0}
48 slash = [...]byte{'/', 0}
49 )
50
51
// Fork hooks. They are declared here without bodies; the implementations
// are not in this file (presumably supplied by the runtime package —
// confirm).
52 func runtime_BeforeFork() // Called in the parent immediately before the clone system call.
53 func runtime_AfterFork() // Called in the parent by forkAndExecInChild once forkAndExecInChild1 returns with locked set.
54 func runtime_AfterForkInChild() // Called in the child as its first action after a successful clone.
55
56
57
58
59
60
61
62
63
64
65
66 func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err Errno) {
67
68
69 r1, err1, p, locked := forkAndExecInChild1(argv0, argv, envv, chroot, dir, attr, sys, pipe)
70 if locked {
71 runtime_AfterFork()
72 }
73 if err1 != 0 {
74 return 0, err1
75 }
76
77
78 pid = int(r1)
79
80 if sys.UidMappings != nil || sys.GidMappings != nil {
81 Close(p[0])
82 err := writeUidGidMappings(pid, sys)
83 var err2 Errno
84 if err != nil {
85 err2 = err.(Errno)
86 }
87 RawSyscall(SYS_WRITE, uintptr(p[1]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
88 Close(p[1])
89 }
90
91 return pid, 0
92 }
93
94
95
96
97
98
99
100
101
102
103
104 func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (r1 uintptr, err1 Errno, p [2]int, locked bool) {
105
106 const (
107 PR_CAP_AMBIENT = 0x2f
108 PR_CAP_AMBIENT_RAISE = 0x2
109 )
110
111
112
113
114
115
116
117
118 var (
119 err2 Errno
120 nextfd int
121 i int
122 )
123
124
125 ppid, _, _ := RawSyscall(SYS_GETPID, 0, 0, 0)
126
127
128
129
130 fd := make([]int, len(attr.Files))
131 nextfd = len(attr.Files)
132 for i, ufd := range attr.Files {
133 if nextfd < int(ufd) {
134 nextfd = int(ufd)
135 }
136 fd[i] = int(ufd)
137 }
138 nextfd++
139
140
141
142 if sys.UidMappings != nil || sys.GidMappings != nil {
143 if err := forkExecPipe(p[:]); err != nil {
144 err1 = err.(Errno)
145 return
146 }
147 }
148
149
150
151 runtime_BeforeFork()
152 locked = true
153 switch {
154 case runtime.GOARCH == "amd64" && sys.Cloneflags&CLONE_NEWUSER == 0:
155 r1, err1 = rawVforkSyscall(SYS_CLONE, uintptr(SIGCHLD|CLONE_VFORK|CLONE_VM)|sys.Cloneflags)
156 case runtime.GOARCH == "s390x":
157 r1, _, err1 = RawSyscall6(SYS_CLONE, 0, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0)
158 default:
159 r1, _, err1 = RawSyscall6(SYS_CLONE, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0, 0)
160 }
161 if err1 != 0 || r1 != 0 {
162
163
164
165
166
167
168 return
169 }
170
171
172
173 runtime_AfterForkInChild()
174
175
176 if len(sys.AmbientCaps) > 0 {
177 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_KEEPCAPS, 1, 0, 0, 0, 0)
178 if err1 != 0 {
179 goto childerror
180 }
181 }
182
183
184 if sys.UidMappings != nil || sys.GidMappings != nil {
185 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(p[1]), 0, 0); err1 != 0 {
186 goto childerror
187 }
188 r1, _, err1 = RawSyscall(SYS_READ, uintptr(p[0]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
189 if err1 != 0 {
190 goto childerror
191 }
192 if r1 != unsafe.Sizeof(err2) {
193 err1 = EINVAL
194 goto childerror
195 }
196 if err2 != 0 {
197 err1 = err2
198 goto childerror
199 }
200 }
201
202
203 if sys.Setsid {
204 _, _, err1 = RawSyscall(SYS_SETSID, 0, 0, 0)
205 if err1 != 0 {
206 goto childerror
207 }
208 }
209
210
211 if sys.Setpgid || sys.Foreground {
212
213 _, _, err1 = RawSyscall(SYS_SETPGID, 0, uintptr(sys.Pgid), 0)
214 if err1 != 0 {
215 goto childerror
216 }
217 }
218
219 if sys.Foreground {
220 pgrp := int32(sys.Pgid)
221 if pgrp == 0 {
222 r1, _, err1 = RawSyscall(SYS_GETPID, 0, 0, 0)
223 if err1 != 0 {
224 goto childerror
225 }
226
227 pgrp = int32(r1)
228 }
229
230
231 _, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSPGRP), uintptr(unsafe.Pointer(&pgrp)))
232 if err1 != 0 {
233 goto childerror
234 }
235 }
236
237
238 if sys.Unshareflags != 0 {
239 _, _, err1 = RawSyscall(SYS_UNSHARE, sys.Unshareflags, 0, 0)
240 if err1 != 0 {
241 goto childerror
242 }
243
244
245
246
247
248
249
250 if sys.Unshareflags&CLONE_NEWNS == CLONE_NEWNS {
251 _, _, err1 = RawSyscall6(SYS_MOUNT, uintptr(unsafe.Pointer(&none[0])), uintptr(unsafe.Pointer(&slash[0])), 0, MS_REC|MS_PRIVATE, 0, 0)
252 if err1 != 0 {
253 goto childerror
254 }
255 }
256 }
257
258
259 if chroot != nil {
260 _, _, err1 = RawSyscall(SYS_CHROOT, uintptr(unsafe.Pointer(chroot)), 0, 0)
261 if err1 != 0 {
262 goto childerror
263 }
264 }
265
266
267 if cred := sys.Credential; cred != nil {
268 ngroups := uintptr(len(cred.Groups))
269 groups := uintptr(0)
270 if ngroups > 0 {
271 groups = uintptr(unsafe.Pointer(&cred.Groups[0]))
272 }
273 if !(sys.GidMappings != nil && !sys.GidMappingsEnableSetgroups && ngroups == 0) && !cred.NoSetGroups {
274 _, _, err1 = RawSyscall(_SYS_setgroups, ngroups, groups, 0)
275 if err1 != 0 {
276 goto childerror
277 }
278 }
279 _, _, err1 = RawSyscall(sys_SETGID, uintptr(cred.Gid), 0, 0)
280 if err1 != 0 {
281 goto childerror
282 }
283 _, _, err1 = RawSyscall(sys_SETUID, uintptr(cred.Uid), 0, 0)
284 if err1 != 0 {
285 goto childerror
286 }
287 }
288
289 for _, c := range sys.AmbientCaps {
290 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_CAP_AMBIENT, uintptr(PR_CAP_AMBIENT_RAISE), c, 0, 0, 0)
291 if err1 != 0 {
292 goto childerror
293 }
294 }
295
296
297 if dir != nil {
298 _, _, err1 = RawSyscall(SYS_CHDIR, uintptr(unsafe.Pointer(dir)), 0, 0)
299 if err1 != 0 {
300 goto childerror
301 }
302 }
303
304
305 if sys.Pdeathsig != 0 {
306 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_PDEATHSIG, uintptr(sys.Pdeathsig), 0, 0, 0, 0)
307 if err1 != 0 {
308 goto childerror
309 }
310
311
312
313
314 r1, _, _ = RawSyscall(SYS_GETPPID, 0, 0, 0)
315 if r1 != ppid {
316 pid, _, _ := RawSyscall(SYS_GETPID, 0, 0, 0)
317 _, _, err1 := RawSyscall(SYS_KILL, pid, uintptr(sys.Pdeathsig), 0)
318 if err1 != 0 {
319 goto childerror
320 }
321 }
322 }
323
324
325
326 if pipe < nextfd {
327 _, _, err1 = RawSyscall(_SYS_dup, uintptr(pipe), uintptr(nextfd), 0)
328 if err1 != 0 {
329 goto childerror
330 }
331 RawSyscall(SYS_FCNTL, uintptr(nextfd), F_SETFD, FD_CLOEXEC)
332 pipe = nextfd
333 nextfd++
334 }
335 for i = 0; i < len(fd); i++ {
336 if fd[i] >= 0 && fd[i] < int(i) {
337 if nextfd == pipe {
338 nextfd++
339 }
340 _, _, err1 = RawSyscall(_SYS_dup, uintptr(fd[i]), uintptr(nextfd), 0)
341 if err1 != 0 {
342 goto childerror
343 }
344 RawSyscall(SYS_FCNTL, uintptr(nextfd), F_SETFD, FD_CLOEXEC)
345 fd[i] = nextfd
346 nextfd++
347 }
348 }
349
350
351 for i = 0; i < len(fd); i++ {
352 if fd[i] == -1 {
353 RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
354 continue
355 }
356 if fd[i] == int(i) {
357
358
359 _, _, err1 = RawSyscall(SYS_FCNTL, uintptr(fd[i]), F_SETFD, 0)
360 if err1 != 0 {
361 goto childerror
362 }
363 continue
364 }
365
366
367 _, _, err1 = RawSyscall(_SYS_dup, uintptr(fd[i]), uintptr(i), 0)
368 if err1 != 0 {
369 goto childerror
370 }
371 }
372
373
374
375
376
377 for i = len(fd); i < 3; i++ {
378 RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
379 }
380
381
382 if sys.Noctty {
383 _, _, err1 = RawSyscall(SYS_IOCTL, 0, uintptr(TIOCNOTTY), 0)
384 if err1 != 0 {
385 goto childerror
386 }
387 }
388
389
390 if sys.Setctty {
391 _, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSCTTY), 1)
392 if err1 != 0 {
393 goto childerror
394 }
395 }
396
397
398
399
400 if sys.Ptrace {
401 _, _, err1 = RawSyscall(SYS_PTRACE, uintptr(PTRACE_TRACEME), 0, 0)
402 if err1 != 0 {
403 goto childerror
404 }
405 }
406
407
408 _, _, err1 = RawSyscall(SYS_EXECVE,
409 uintptr(unsafe.Pointer(argv0)),
410 uintptr(unsafe.Pointer(&argv[0])),
411 uintptr(unsafe.Pointer(&envv[0])))
412
413 childerror:
414
415 RawSyscall(SYS_WRITE, uintptr(pipe), uintptr(unsafe.Pointer(&err1)), unsafe.Sizeof(err1))
416 for {
417 RawSyscall(SYS_EXIT, 253, 0, 0)
418 }
419 }
420
421
422 func forkExecPipe(p []int) (err error) {
423 err = Pipe2(p, O_CLOEXEC)
424
425
426 if err == ENOSYS {
427 if err = Pipe(p); err != nil {
428 return
429 }
430 if _, err = fcntl(p[0], F_SETFD, FD_CLOEXEC); err != nil {
431 return
432 }
433 _, err = fcntl(p[1], F_SETFD, FD_CLOEXEC)
434 }
435 return
436 }
437
438
439 func writeIDMappings(path string, idMap []SysProcIDMap) error {
440 fd, err := Open(path, O_RDWR, 0)
441 if err != nil {
442 return err
443 }
444
445 data := ""
446 for _, im := range idMap {
447 data = data + itoa(im.ContainerID) + " " + itoa(im.HostID) + " " + itoa(im.Size) + "\n"
448 }
449
450 bytes, err := ByteSliceFromString(data)
451 if err != nil {
452 Close(fd)
453 return err
454 }
455
456 if _, err := Write(fd, bytes); err != nil {
457 Close(fd)
458 return err
459 }
460
461 if err := Close(fd); err != nil {
462 return err
463 }
464
465 return nil
466 }
467
468
469
470
471
472 func writeSetgroups(pid int, enable bool) error {
473 sgf := "/proc/" + itoa(pid) + "/setgroups"
474 fd, err := Open(sgf, O_RDWR, 0)
475 if err != nil {
476 return err
477 }
478
479 var data []byte
480 if enable {
481 data = []byte("allow")
482 } else {
483 data = []byte("deny")
484 }
485
486 if _, err := Write(fd, data); err != nil {
487 Close(fd)
488 return err
489 }
490
491 return Close(fd)
492 }
493
494
495
496 func writeUidGidMappings(pid int, sys *SysProcAttr) error {
497 if sys.UidMappings != nil {
498 uidf := "/proc/" + itoa(pid) + "/uid_map"
499 if err := writeIDMappings(uidf, sys.UidMappings); err != nil {
500 return err
501 }
502 }
503
504 if sys.GidMappings != nil {
505
506 if err := writeSetgroups(pid, sys.GidMappingsEnableSetgroups); err != nil && err != ENOENT {
507 return err
508 }
509 gidf := "/proc/" + itoa(pid) + "/gid_map"
510 if err := writeIDMappings(gidf, sys.GidMappings); err != nil {
511 return err
512 }
513 }
514
515 return nil
516 }
517
View as plain text