package syscall
import (
"internal/itoa"
"runtime"
"unsafe"
)
// Linux clone flag bits. They are used in this file as SysProcAttr.Cloneflags
// (passed to clone/clone3) and SysProcAttr.Unshareflags (passed to unshare).
// The per-flag summaries below follow the Linux clone(2) manual page.
const (
// Parent and child share the same address space.
CLONE_VM = 0x00000100
// Parent and child share filesystem information (root, cwd, umask).
CLONE_FS = 0x00000200
// Parent and child share the file descriptor table.
CLONE_FILES = 0x00000400
// Parent and child share the table of signal handlers.
CLONE_SIGHAND = 0x00000800
// A pidfd referring to the child is returned to the parent.
CLONE_PIDFD = 0x00001000
// If the parent is being traced, trace the child as well.
CLONE_PTRACE = 0x00002000
// The parent is suspended until the child releases its memory (exec or exit).
CLONE_VFORK = 0x00004000
// The child gets the same parent as the caller.
CLONE_PARENT = 0x00008000
// The child is placed in the caller's thread group.
CLONE_THREAD = 0x00010000
// New mount namespace.
CLONE_NEWNS = 0x00020000
// Share System V semaphore undo values.
CLONE_SYSVSEM = 0x00040000
// Set thread-local storage for the child.
CLONE_SETTLS = 0x00080000
// Store the child thread ID in the parent's memory.
CLONE_PARENT_SETTID = 0x00100000
// Clear the child thread ID in child memory when the child exits.
CLONE_CHILD_CLEARTID = 0x00200000
// Historical flag; ignored by current kernels per clone(2).
CLONE_DETACHED = 0x00400000
// A tracing process cannot force CLONE_PTRACE on this child.
CLONE_UNTRACED = 0x00800000
// Store the child thread ID in the child's memory.
CLONE_CHILD_SETTID = 0x01000000
// New cgroup namespace.
CLONE_NEWCGROUP = 0x02000000
// New UTS (hostname) namespace.
CLONE_NEWUTS = 0x04000000
// New IPC namespace.
CLONE_NEWIPC = 0x08000000
// New user namespace.
CLONE_NEWUSER = 0x10000000
// New PID namespace.
CLONE_NEWPID = 0x20000000
// New network namespace.
CLONE_NEWNET = 0x40000000
// Share the I/O context with the parent.
CLONE_IO = 0x80000000
// The two flags below do not fit in 32 bits. In this file CLONE_INTO_CGROUP
// is only ever set in the 64-bit cloneArgs.flags used for clone3.
// Reset signal handlers to their defaults in the child.
CLONE_CLEAR_SIGHAND = 0x100000000
// Place the child in the cgroup identified by cloneArgs.cgroup.
CLONE_INTO_CGROUP = 0x200000000
// New time namespace. The value lies in the low byte, which the plain clone
// path in forkAndExecInChild1 also uses for the exit signal (SIGCHLD is OR-ed
// into flags there), so that function switches to clone3 when this bit is set.
CLONE_NEWTIME = 0x00000080
)
// SysProcIDMap describes one ID-mapping entry for a user namespace.
// formatIDMappings renders each entry as "ContainerID HostID Size\n",
// the text that is written to /proc/<pid>/uid_map or gid_map.
// See user_namespaces(7) for the meaning of the three columns.
type SysProcIDMap struct {
// First ID of the range as seen inside the namespace.
ContainerID int
// First ID of the range as seen outside the namespace.
HostID int
// Number of consecutive IDs covered by the range.
Size int
}
// SysProcAttr holds the Linux-specific options that forkAndExecInChild1
// applies in the child between the clone and the execve.
type SysProcAttr struct {
// Chroot is converted by the caller to the chroot argument; when that is
// non-nil the child calls chroot(2) with it.
Chroot string
// Credential, if non-nil, makes the child call setgroups (unless
// suppressed), setgid and setuid with the given values.
Credential *Credential
// Ptrace makes the child call ptrace(PTRACE_TRACEME) right before execve.
Ptrace bool
// Setsid makes the child call setsid(2).
Setsid bool
// Setpgid makes the child call setpgid(0, Pgid).
Setpgid bool
// Setctty makes the child issue TIOCSCTTY on descriptor Ctty. This happens
// after the descriptors have been shuffled, so Ctty is interpreted as a
// child descriptor number here.
Setctty bool
// Noctty makes the child issue TIOCNOTTY on descriptor 0.
Noctty bool
// Ctty is the terminal descriptor used by Setctty and Foreground.
Ctty int
// Foreground makes the child call setpgid (as Setpgid does) and then
// TIOCSPGRP on Ctty with the child's process group. This happens before the
// descriptors are shuffled, so Ctty is the descriptor inherited from the parent.
Foreground bool
// Pgid is the process group used by Setpgid/Foreground; when it is 0 the
// Foreground path uses the child's own PID as the group.
Pgid int
// Pdeathsig, if non-zero, is installed with prctl(PR_SET_PDEATHSIG). If the
// parent PID has already changed by then, the child sends the signal to itself.
Pdeathsig Signal
// Cloneflags are passed to clone/clone3 when creating the child.
Cloneflags uintptr
// Unshareflags, if non-zero, are passed to unshare(2) in the child.
Unshareflags uintptr
// UidMappings are the user ID mappings written to uid_map.
UidMappings []SysProcIDMap
// GidMappings are the group ID mappings written to gid_map.
GidMappings []SysProcIDMap
// GidMappingsEnableSetgroups selects whether "allow" (true) or "deny"
// (false) is written to the setgroups file before gid_map is written.
// It is only consulted when GidMappings is non-nil.
GidMappingsEnableSetgroups bool
// AmbientCaps lists capability numbers that the child adds to its
// permitted and inheritable sets and then raises as ambient capabilities.
AmbientCaps []uintptr
// UseCgroupFD selects clone3 with CLONE_INTO_CGROUP and CgroupFD.
UseCgroupFD bool
// CgroupFD is the cgroup descriptor placed in cloneArgs.cgroup; it is only
// read when UseCgroupFD is true.
CgroupFD int
}
// NUL-terminated byte strings "none" and "/". forkAndExecInChild1 passes
// their addresses as the source and target arguments of the mount syscall
// that marks / as MS_REC|MS_PRIVATE after unsharing the mount namespace.
// They are package-level, so the child does not have to build them after
// the fork.
var (
none = [...]byte {'n' , 'o' , 'n' , 'e' , 0 }
slash = [...]byte {'/' , 0 }
)
// Fork hooks declared without a body; their implementations live outside this
// file (presumably linked in from the runtime package — confirm).

// runtime_BeforeFork is called by forkAndExecInChild1 immediately before the
// clone syscall.
func runtime_BeforeFork ()
// runtime_AfterFork is called by forkAndExecInChild in the parent once
// forkAndExecInChild1 has returned with locked set.
func runtime_AfterFork ()
// runtime_AfterForkInChild is called by forkAndExecInChild1 in the child,
// after the session/process-group/foreground setup and before unshare.
func runtime_AfterForkInChild ()
// forkAndExecInChild creates the child process via forkAndExecInChild1 and
// performs the parent-side follow-up work.
//
// It returns the child's PID on success. If the clone itself fails, it
// returns 0 and the Errno. Errors that occur inside the child are not
// returned here; the child reports them on pipe.
//
// When ID mappings were requested, the parent closes the read end of the
// synchronization pipe, writes the mappings itself unless the child is going
// to create the user namespace with unshare (in which case the child writes
// them), and then sends the resulting Errno (0 on success) to the waiting
// child before closing the write end.
func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err Errno) {
	childPID, forkErr, mapPipe, locked := forkAndExecInChild1(argv0, argv, envv, chroot, dir, attr, sys, pipe)
	// forkAndExecInChild1 sets locked once runtime_BeforeFork has run, so the
	// matching runtime_AfterFork must be issued even when the clone failed.
	if locked {
		runtime_AfterFork()
	}
	if forkErr != 0 {
		return 0, forkErr
	}

	// From here on we are the parent of a live child.
	pid = int(childPID)

	wantsMappings := sys.UidMappings != nil || sys.GidMappings != nil
	if !wantsMappings {
		return pid, 0
	}

	Close(mapPipe[0])

	var status Errno
	childWritesMappings := sys.Unshareflags&CLONE_NEWUSER != 0
	if !childWritesMappings {
		if mapErr := writeUidGidMappings(pid, sys); mapErr != nil {
			status = mapErr.(Errno)
		}
	}

	// Release the child, which is blocked reading this status word.
	RawSyscall(SYS_WRITE, uintptr(mapPipe[1]), uintptr(unsafe.Pointer(&status)), unsafe.Sizeof(status))
	Close(mapPipe[1])

	return pid, 0
}
// _LINUX_CAPABILITY_VERSION_3 is the value stored in capHeader.version before
// the capget/capset calls in forkAndExecInChild1.
const _LINUX_CAPABILITY_VERSION_3 = 0x20080522
// capHeader is the header argument of the capget/capset syscalls.
// Only version is set in this file; pid is left at zero.
// NOTE(review): layout presumably mirrors the kernel's cap_user_header_t — confirm.
type capHeader struct {
version uint32
pid int32
}
// capData holds one 32-bit word of each capability set. forkAndExecInChild1
// ORs capability bits into permitted and inheritable; effective is only
// read from and written back to the kernel unchanged.
type capData struct {
effective uint32
permitted uint32
inheritable uint32
}
// caps bundles the header and data arguments passed to capget/capset.
// data has two 32-bit words per set and is indexed with capToIndex, so only
// capability numbers 0-63 are in range.
type caps struct {
hdr capHeader
data [2 ]capData
}
// capToIndex returns the index of the 32-bit capData word that contains the
// bit for capability number cap.
func capToIndex(cap uintptr) uintptr {
	const bitsPerWord = 32
	return cap / bitsPerWord
}
// capToMask returns the single-bit mask for capability number cap within its
// 32-bit capData word.
func capToMask(cap uintptr) uint32 {
	const bitsPerWord = 32
	bit := cap % bitsPerWord
	return uint32(1) << bit
}
// cloneArgs is the argument block whose address and size forkAndExecInChild1
// passes to the clone3 syscall. In this file only flags, exitSignal and
// cgroup are ever set; all other fields stay zero.
// NOTE(review): field order presumably mirrors the kernel's struct clone_args — confirm.
type cloneArgs struct {
// Clone flag bits (64-bit, so CLONE_INTO_CGROUP fits).
flags uint64
pidFD uint64
childTID uint64
parentTID uint64
// Signal delivered to the parent when the child exits (SIGCHLD here).
exitSignal uint64
stack uint64
stackSize uint64
tls uint64
setTID uint64
setTIDSize uint64
// Cgroup file descriptor, used together with CLONE_INTO_CGROUP.
cgroup uint64
}
// forkAndExecInChild1 creates the child process and, in the child, applies
// the settings from attr and sys before calling execve.
//
// In the parent it returns as soon as the clone syscall returns, reporting
// the child's PID in pid or the clone failure in err1. locked reports whether
// runtime_BeforeFork was called, in which case the caller must call
// runtime_AfterFork. mapPipe is only populated when sys.UidMappings or
// sys.GidMappings is non-nil; the caller uses it to tell the child the
// outcome of writing the ID mappings.
//
// In the child the function never returns: it either execs successfully or
// writes the failing Errno to pipe and exits with status 253.
func forkAndExecInChild1 (argv0 *byte , argv , envv []*byte , chroot , dir *byte , attr *ProcAttr , sys *SysProcAttr , pipe int ) (pid uintptr , err1 Errno , mapPipe [2 ]int , locked bool ) {
// prctl option numbers used for ambient capabilities.
const (
PR_CAP_AMBIENT = 0x2f
PR_CAP_AMBIENT_RAISE = 0x2
)
// Every local the child needs is declared here, before the fork.
// NOTE(review): presumably so that nothing has to be allocated after the
// clone, where the child may share memory with the parent (CLONE_VM) — confirm.
var (
err2 Errno
nextfd int
i int
caps caps
fd1 , flags uintptr
puid , psetgroups , pgid []byte
uidmap , setgroups , gidmap []byte
clone3 *cloneArgs
pgrp int32
dirfd int
cred *Credential
ngroups , groups uintptr
c uintptr
)
// Saved RLIMIT_NOFILE value; restored in the child just before exec.
rlim , rlimOK := origRlimitNofile .Load ().(Rlimit )
// Pre-build the NUL-terminated /proc paths and file contents that the child
// writes itself when the user namespace is created via unshare.
// NOTE(review): if UidMappings/GidMappings is non-nil but empty, uidmap/gidmap
// has length 0 and the &uidmap[0] / &gidmap[0] expressions in the unshare path
// below would panic in the child — confirm callers never pass an empty
// non-nil slice.
if sys .UidMappings != nil {
puid = []byte ("/proc/self/uid_map\000" )
uidmap = formatIDMappings (sys .UidMappings )
}
if sys .GidMappings != nil {
psetgroups = []byte ("/proc/self/setgroups\000" )
pgid = []byte ("/proc/self/gid_map\000" )
if sys .GidMappingsEnableSetgroups {
setgroups = []byte ("allow\000" )
} else {
setgroups = []byte ("deny\000" )
}
gidmap = formatIDMappings (sys .GidMappings )
}
// Remember our own PID so the child can later detect that its parent changed.
ppid , _ := rawSyscallNoError (SYS_GETPID , 0 , 0 , 0 )
// Copy the requested descriptors and pick nextfd strictly above both
// len(attr.Files) and every descriptor in it, so temporary dups made while
// shuffling cannot land on a descriptor that is still needed.
fd := make ([]int , len (attr .Files ))
nextfd = len (attr .Files )
for i , ufd := range attr .Files {
if nextfd < int (ufd ) {
nextfd = int (ufd )
}
fd [i ] = int (ufd )
}
nextfd ++
// Extra pipe used to make the child wait until the ID mappings are written.
if sys .UidMappings != nil || sys .GidMappings != nil {
if err := forkExecPipe (mapPipe [:]); err != nil {
err1 = err .(Errno )
return
}
}
// Use vfork semantics unless a new user namespace is requested. In that case
// the child blocks on mapPipe until the parent writes to it, so the parent
// must not be suspended until the child execs (which is what CLONE_VFORK
// does per clone(2)).
flags = sys .Cloneflags
if sys .Cloneflags &CLONE_NEWUSER == 0 && sys .Unshareflags &CLONE_NEWUSER == 0 {
flags |= CLONE_VFORK | CLONE_VM
}
// Decide whether clone3 is needed: either to clone directly into a cgroup,
// or because CLONE_NEWTIME is set, which occupies the low byte that the
// plain clone path below uses for SIGCHLD.
if sys .UseCgroupFD {
clone3 = &cloneArgs {
flags : uint64 (flags ) | CLONE_INTO_CGROUP ,
exitSignal : uint64 (SIGCHLD ),
cgroup : uint64 (sys .CgroupFD ),
}
} else if flags &CLONE_NEWTIME != 0 {
clone3 = &cloneArgs {
flags : uint64 (flags ),
exitSignal : uint64 (SIGCHLD ),
}
}
// About to fork. locked tells the caller that runtime_AfterFork is required.
runtime_BeforeFork ()
locked = true
if clone3 != nil {
pid , err1 = rawVforkSyscall (_SYS_clone3 , uintptr (unsafe .Pointer (clone3 )), unsafe .Sizeof (*clone3 ))
} else {
// For plain clone the exit signal is carried in the flags word.
flags |= uintptr (SIGCHLD )
if runtime .GOARCH == "s390x" {
// On s390x the flags are passed as the second argument instead of the first.
pid , err1 = rawVforkSyscall (SYS_CLONE , 0 , flags )
} else {
pid , err1 = rawVforkSyscall (SYS_CLONE , flags , 0 )
}
}
// Clone failed, or we are the parent (pid != 0): return immediately.
if err1 != 0 || pid != 0 {
return
}
// Everything below runs in the child. From here on, pid is reused as a
// scratch variable for syscall results.

// Set the keep-capabilities flag so the ambient capabilities can still be
// raised further down, after the credential change.
if len (sys .AmbientCaps ) > 0 {
_, _, err1 = RawSyscall6 (SYS_PRCTL , PR_SET_KEEPCAPS , 1 , 0 , 0 , 0 , 0 )
if err1 != 0 {
goto childerror
}
}
// Wait for the parent to report on the ID mappings: close our copy of the
// write end, then read one Errno from the pipe. A short read is treated as
// EINVAL; a non-zero value is the parent's error.
if sys .UidMappings != nil || sys .GidMappings != nil {
if _, _, err1 = RawSyscall (SYS_CLOSE , uintptr (mapPipe [1 ]), 0 , 0 ); err1 != 0 {
goto childerror
}
pid , _, err1 = RawSyscall (SYS_READ , uintptr (mapPipe [0 ]), uintptr (unsafe .Pointer (&err2 )), unsafe .Sizeof (err2 ))
if err1 != 0 {
goto childerror
}
if pid != unsafe .Sizeof (err2 ) {
err1 = EINVAL
goto childerror
}
if err2 != 0 {
err1 = err2
goto childerror
}
}
// New session.
if sys .Setsid {
_, _, err1 = RawSyscall (SYS_SETSID , 0 , 0 , 0 )
if err1 != 0 {
goto childerror
}
}
// Process group. Foreground implies the setpgid call as well.
if sys .Setpgid || sys .Foreground {
_, _, err1 = RawSyscall (SYS_SETPGID , 0 , uintptr (sys .Pgid ), 0 )
if err1 != 0 {
goto childerror
}
}
// Make the child's process group the foreground group of terminal Ctty.
// With Pgid == 0 the group is the child's own PID.
if sys .Foreground {
pgrp = int32 (sys .Pgid )
if pgrp == 0 {
pid , _ = rawSyscallNoError (SYS_GETPID , 0 , 0 , 0 )
pgrp = int32 (pid )
}
_, _, err1 = RawSyscall (SYS_IOCTL , uintptr (sys .Ctty ), uintptr (TIOCSPGRP ), uintptr (unsafe .Pointer (&pgrp )))
if err1 != 0 {
goto childerror
}
}
// Runtime hook for the child; deliberately placed after the TIOCSPGRP call.
// NOTE(review): presumably this restores the signal mask, and doing it later
// avoids a SIGTTOU from TIOCSPGRP — confirm against the runtime.
runtime_AfterForkInChild ()
// Unshare namespaces inside the child.
if sys .Unshareflags != 0 {
_, _, err1 = RawSyscall (SYS_UNSHARE , sys .Unshareflags , 0 , 0 )
if err1 != 0 {
goto childerror
}
// With a freshly unshared user namespace the child writes its own
// mappings via /proc/self: first setgroups, then gid_map, then uid_map.
if sys .Unshareflags &CLONE_NEWUSER != 0 && sys .GidMappings != nil {
dirfd = int (_AT_FDCWD )
if fd1 , _, err1 = RawSyscall6 (SYS_OPENAT , uintptr (dirfd ), uintptr (unsafe .Pointer (&psetgroups [0 ])), uintptr (O_WRONLY ), 0 , 0 , 0 ); err1 != 0 {
goto childerror
}
pid , _, err1 = RawSyscall (SYS_WRITE , uintptr (fd1 ), uintptr (unsafe .Pointer (&setgroups [0 ])), uintptr (len (setgroups )))
if err1 != 0 {
goto childerror
}
if _, _, err1 = RawSyscall (SYS_CLOSE , uintptr (fd1 ), 0 , 0 ); err1 != 0 {
goto childerror
}
if fd1 , _, err1 = RawSyscall6 (SYS_OPENAT , uintptr (dirfd ), uintptr (unsafe .Pointer (&pgid [0 ])), uintptr (O_WRONLY ), 0 , 0 , 0 ); err1 != 0 {
goto childerror
}
pid , _, err1 = RawSyscall (SYS_WRITE , uintptr (fd1 ), uintptr (unsafe .Pointer (&gidmap [0 ])), uintptr (len (gidmap )))
if err1 != 0 {
goto childerror
}
if _, _, err1 = RawSyscall (SYS_CLOSE , uintptr (fd1 ), 0 , 0 ); err1 != 0 {
goto childerror
}
}
if sys .Unshareflags &CLONE_NEWUSER != 0 && sys .UidMappings != nil {
dirfd = int (_AT_FDCWD )
if fd1 , _, err1 = RawSyscall6 (SYS_OPENAT , uintptr (dirfd ), uintptr (unsafe .Pointer (&puid [0 ])), uintptr (O_WRONLY ), 0 , 0 , 0 ); err1 != 0 {
goto childerror
}
pid , _, err1 = RawSyscall (SYS_WRITE , uintptr (fd1 ), uintptr (unsafe .Pointer (&uidmap [0 ])), uintptr (len (uidmap )))
if err1 != 0 {
goto childerror
}
if _, _, err1 = RawSyscall (SYS_CLOSE , uintptr (fd1 ), 0 , 0 ); err1 != 0 {
goto childerror
}
}
// After unsharing the mount namespace, remount / recursively as private
// (source "none", target "/", flags MS_REC|MS_PRIVATE).
// NOTE(review): presumably needed because mounts marked shared would
// otherwise still propagate to the parent namespace — confirm.
if sys .Unshareflags &CLONE_NEWNS == CLONE_NEWNS {
_, _, err1 = RawSyscall6 (SYS_MOUNT , uintptr (unsafe .Pointer (&none [0 ])), uintptr (unsafe .Pointer (&slash [0 ])), 0 , MS_REC |MS_PRIVATE , 0 , 0 )
if err1 != 0 {
goto childerror
}
}
}
// Chroot.
if chroot != nil {
_, _, err1 = RawSyscall (SYS_CHROOT , uintptr (unsafe .Pointer (chroot )), 0 , 0 )
if err1 != 0 {
goto childerror
}
}
// Credentials: supplementary groups first, then GID, then UID.
if cred = sys .Credential ; cred != nil {
ngroups = uintptr (len (cred .Groups ))
groups = uintptr (0 )
if ngroups > 0 {
groups = uintptr (unsafe .Pointer (&cred .Groups [0 ]))
}
// setgroups is skipped when the caller opted out (NoSetGroups), or when
// GID mappings are in use with setgroups denied and no groups were given.
if !(sys .GidMappings != nil && !sys .GidMappingsEnableSetgroups && ngroups == 0 ) && !cred .NoSetGroups {
_, _, err1 = RawSyscall (_SYS_setgroups , ngroups , groups , 0 )
if err1 != 0 {
goto childerror
}
}
_, _, err1 = RawSyscall (sys_SETGID , uintptr (cred .Gid ), 0 , 0 )
if err1 != 0 {
goto childerror
}
_, _, err1 = RawSyscall (sys_SETUID , uintptr (cred .Uid ), 0 , 0 )
if err1 != 0 {
goto childerror
}
}
// Ambient capabilities: read the current sets, add each requested
// capability to permitted and inheritable, write the sets back, then raise
// each capability as ambient via prctl.
if len (sys .AmbientCaps ) != 0 {
caps .hdr .version = _LINUX_CAPABILITY_VERSION_3
if _, _, err1 = RawSyscall (SYS_CAPGET , uintptr (unsafe .Pointer (&caps .hdr )), uintptr (unsafe .Pointer (&caps .data [0 ])), 0 ); err1 != 0 {
goto childerror
}
for _, c = range sys .AmbientCaps {
caps .data [capToIndex (c )].permitted |= capToMask (c )
caps .data [capToIndex (c )].inheritable |= capToMask (c )
}
if _, _, err1 = RawSyscall (SYS_CAPSET , uintptr (unsafe .Pointer (&caps .hdr )), uintptr (unsafe .Pointer (&caps .data [0 ])), 0 ); err1 != 0 {
goto childerror
}
for _, c = range sys .AmbientCaps {
_, _, err1 = RawSyscall6 (SYS_PRCTL , PR_CAP_AMBIENT , uintptr (PR_CAP_AMBIENT_RAISE ), c , 0 , 0 , 0 )
if err1 != 0 {
goto childerror
}
}
}
// Working directory.
if dir != nil {
_, _, err1 = RawSyscall (SYS_CHDIR , uintptr (unsafe .Pointer (dir )), 0 , 0 )
if err1 != 0 {
goto childerror
}
}
// Parent-death signal. If getppid no longer matches the PID recorded before
// the fork, the parent is already gone, so the child signals itself.
if sys .Pdeathsig != 0 {
_, _, err1 = RawSyscall6 (SYS_PRCTL , PR_SET_PDEATHSIG , uintptr (sys .Pdeathsig ), 0 , 0 , 0 , 0 )
if err1 != 0 {
goto childerror
}
pid , _ = rawSyscallNoError (SYS_GETPPID , 0 , 0 , 0 )
if pid != ppid {
pid , _ = rawSyscallNoError (SYS_GETPID , 0 , 0 , 0 )
_, _, err1 = RawSyscall (SYS_KILL , pid , uintptr (sys .Pdeathsig ), 0 )
if err1 != 0 {
goto childerror
}
}
}
// Descriptor shuffle, pass 1: move the error pipe and every fd[i] < i up to
// fresh close-on-exec descriptors at or above nextfd, so that pass 2 cannot
// overwrite a descriptor it still needs.
if pipe < nextfd {
_, _, err1 = RawSyscall (SYS_DUP3 , uintptr (pipe ), uintptr (nextfd ), O_CLOEXEC )
if err1 != 0 {
goto childerror
}
pipe = nextfd
nextfd ++
}
for i = 0 ; i < len (fd ); i ++ {
if fd [i ] >= 0 && fd [i ] < i {
// Skip over the slot occupied by the error pipe.
if nextfd == pipe {
nextfd ++
}
_, _, err1 = RawSyscall (SYS_DUP3 , uintptr (fd [i ]), uintptr (nextfd ), O_CLOEXEC )
if err1 != 0 {
goto childerror
}
fd [i ] = nextfd
nextfd ++
}
}
// Pass 2: make descriptor i refer to fd[i].
for i = 0 ; i < len (fd ); i ++ {
// -1 means "no file": close descriptor i, ignoring any error.
if fd [i ] == -1 {
RawSyscall (SYS_CLOSE , uintptr (i ), 0 , 0 )
continue
}
// Already in place: just clear the descriptor flags (close-on-exec).
if fd [i ] == i {
_, _, err1 = RawSyscall (fcntl64Syscall , uintptr (fd [i ]), F_SETFD , 0 )
if err1 != 0 {
goto childerror
}
continue
}
// dup3 with flags 0 creates descriptor i without close-on-exec.
_, _, err1 = RawSyscall (SYS_DUP3 , uintptr (fd [i ]), uintptr (i ), 0 )
if err1 != 0 {
goto childerror
}
}
// If fewer than three descriptors were supplied, close the remaining ones
// among 0, 1 and 2 (errors ignored).
for i = len (fd ); i < 3 ; i ++ {
RawSyscall (SYS_CLOSE , uintptr (i ), 0 , 0 )
}
// Detach descriptor 0 from its controlling terminal.
if sys .Noctty {
_, _, err1 = RawSyscall (SYS_IOCTL , 0 , uintptr (TIOCNOTTY ), 0 )
if err1 != 0 {
goto childerror
}
}
// Make Ctty the controlling terminal. This is after the shuffle, so Ctty is
// a child descriptor number here.
if sys .Setctty {
_, _, err1 = RawSyscall (SYS_IOCTL , uintptr (sys .Ctty ), uintptr (TIOCSCTTY ), 1 )
if err1 != 0 {
goto childerror
}
}
// Restore the saved RLIMIT_NOFILE, if one was recorded (result ignored).
if rlimOK && rlim .Cur != 0 {
rawSetrlimit (RLIMIT_NOFILE , &rlim )
}
// Request tracing last, immediately before the exec.
if sys .Ptrace {
_, _, err1 = RawSyscall (SYS_PTRACE , uintptr (PTRACE_TRACEME ), 0 , 0 )
if err1 != 0 {
goto childerror
}
}
// Exec. Control continues past this call only if execve failed.
_, _, err1 = RawSyscall (SYS_EXECVE ,
uintptr (unsafe .Pointer (argv0 )),
uintptr (unsafe .Pointer (&argv [0 ])),
uintptr (unsafe .Pointer (&envv [0 ])))
childerror :
// Report the Errno to the parent over the error pipe, then exit with
// status 253. The loop guards against the exit syscall returning.
RawSyscall (SYS_WRITE , uintptr (pipe ), uintptr (unsafe .Pointer (&err1 )), unsafe .Sizeof (err1 ))
for {
RawSyscall (SYS_EXIT , 253 , 0 , 0 )
}
}
// formatIDMappings renders idMap in the text form expected by the uid_map and
// gid_map files: one "ContainerID HostID Size" line per entry, each
// terminated by a newline. It returns nil when idMap has no entries.
func formatIDMappings(idMap []SysProcIDMap) []byte {
	var out []byte
	for i := range idMap {
		entry := idMap[i]
		line := itoa.Itoa(entry.ContainerID) + " " + itoa.Itoa(entry.HostID) + " " + itoa.Itoa(entry.Size) + "\n"
		out = append(out, line...)
	}
	return out
}
// writeIDMappings opens path read-write, writes the formatted idMap to it in
// a single Write call and closes it again. It returns the first error from
// Open, Write or Close; the descriptor is closed on a write failure too.
func writeIDMappings(path string, idMap []SysProcIDMap) error {
	fd, openErr := Open(path, O_RDWR, 0)
	if openErr != nil {
		return openErr
	}

	payload := formatIDMappings(idMap)
	if _, writeErr := Write(fd, payload); writeErr != nil {
		// The write error is the one reported; a Close error is ignored here.
		Close(fd)
		return writeErr
	}

	return Close(fd)
}
// writeSetgroups writes "allow" (enable true) or "deny" (enable false) to
// /proc/<pid>/setgroups. It returns the first error from Open, Write or
// Close; the descriptor is closed on a write failure too.
func writeSetgroups(pid int, enable bool) error {
	path := "/proc/" + itoa.Itoa(pid) + "/setgroups"
	fd, openErr := Open(path, O_RDWR, 0)
	if openErr != nil {
		return openErr
	}

	setting := "deny"
	if enable {
		setting = "allow"
	}

	_, writeErr := Write(fd, []byte(setting))
	if writeErr != nil {
		// The write error is the one reported; a Close error is ignored here.
		Close(fd)
		return writeErr
	}

	return Close(fd)
}
// writeUidGidMappings writes the ID mappings from sys for process pid via
// /proc/<pid>. uid_map is written first when sys.UidMappings is non-nil.
// When sys.GidMappings is non-nil, the setgroups file is written and then
// gid_map. An ENOENT from the setgroups step is tolerated; any other error
// stops processing and is returned.
func writeUidGidMappings(pid int, sys *SysProcAttr) error {
	procDir := "/proc/" + itoa.Itoa(pid)

	if sys.UidMappings != nil {
		if err := writeIDMappings(procDir+"/uid_map", sys.UidMappings); err != nil {
			return err
		}
	}

	if sys.GidMappings == nil {
		return nil
	}

	// A missing setgroups file (ENOENT) is not treated as a failure.
	err := writeSetgroups(pid, sys.GidMappingsEnableSetgroups)
	if err != nil && err != ENOENT {
		return err
	}

	if err := writeIDMappings(procDir+"/gid_map", sys.GidMappings); err != nil {
		return err
	}
	return nil
}
// The pages are generated with Golds v0.6.7. (GOOS=linux GOARCH=amd64)
// Golds is a Go 101 project developed by Tapir Liu.
// PRs and bug reports are welcome and can be submitted to the issue list.
// Please follow @Go100and1 (reachable from the left QR code) to get the latest news of Golds.