Andreas Christoforou         home     posts

Limiting syscalls with seccomp

Seccomp and Seccomp-BPF are used to limit the system calls available to a Linux process. System call filtering isn’t a sandbox. It provides a clearly defined mechanism for minimizing the exposed kernel surface.

The initial implementation, also known as “mode 1 seccomp” (strict mode), allows only the ‘read’, ‘write’, ‘exit’ and ‘sigreturn’ syscalls, so a process can only read from and write to already opened files and then exit.

Example of how to enable strict-mode seccomp:

#include <stdio.h>
#include <stdlib.h>
#include <sys/prctl.h>
#include <linux/seccomp.h>
#include <unistd.h>
/*
 * Demonstrates strict-mode ("mode 1") seccomp.
 *
 * Once strict mode is active only read, write, exit and sigreturn are
 * permitted, so the dup2() call below is expected to terminate the process
 * before printf() is ever reached.
 *
 * Returns 1 if strict mode could not be enabled. On success control is not
 * expected to reach the final return.
 */
int main(int argc, char* argv[])
{
    /* Without this check a failed prctl() would leave the process unfiltered
     * and the demo would misleadingly print "Nothing." */
    if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT) == -1) {
        perror("prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT)");
        return 1;
    }

    /* Redirect stderr to stdout -- dup2 is not one of the permitted syscalls */
    dup2(1, 2);
    printf("Nothing.\n");

    return 0;
}

Strict-mode seccomp is good for absolute restrictions, but a more fine-grained approach is required when attempting to lock down more complex applications. To solve this problem, Seccomp - Berkeley Packet Filter (Seccomp-BPF) was introduced.

A Seccomp-BPF program receives the following struct as its input argument, so we can filter based on the system call number and on the call's arguments.

/*
 * Input handed to a seccomp-BPF program: the system call number, the
 * architecture, the instruction pointer and the call's arguments.
 */
struct seccomp_data {
    /* System call number */
    int nr;
    /* AUDIT_ARCH_* value (see <linux/audit.h>) */
    __u32 arch;
    /* CPU instruction pointer */
    __u64 instruction_pointer;
    /* Up to 6 system call arguments */
    __u64 args[6];
};

Seccomp filters use Linux Socket Filtering, aka Berkeley Packet Filter (BPF), which is built on the following structures:

/*
 * Filter block: a single instruction of a BPF program.
 */
struct sock_filter {
    __u16 code;  /* Actual filter code */
    __u8  jt;    /* Jump true */
    __u8  jf;    /* Jump false */
    __u32 k;     /* Generic multiuse field */
};

/*
 * A BPF program: a pointer to its filter blocks together with their count.
 * Required for SO_ATTACH_FILTER.
 */
struct sock_fprog {
    /* Number of filter blocks */
    unsigned short len;
    /* The filter blocks themselves */
    struct sock_filter __user *filter;
};

So basically, what a BPF program does in seccomp is operate on this data and return a value that tells the kernel what to do next: allow the process to perform the call (SECCOMP_RET_ALLOW), kill it (SECCOMP_RET_KILL), or take one of the other available actions.

As per the seccomp documentation, the best practice is to check seccomp_data->arch before comparing system call numbers: if the syscall numbers of different calling conventions overlap, the checks in a filter may otherwise be abused.

/* Load the arch field of struct seccomp_data. */
BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, arch))),
/* If it equals AUDIT_ARCH_X86_64, jump-true (1) skips the kill below;
 * otherwise jump-false (0) falls through to it. */
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 1, 0),
/* Any other architecture: return SECCOMP_RET_KILL. */
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL),

Example filter that allows the open syscall and kills the process on any other:

/*
 * Minimal filter: allow the open syscall, return SECCOMP_RET_KILL for
 * everything else.
 */
struct sock_filter filter[] = {
    /* Load the system call number. */
    BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, nr)),
    /* open: fall through to ALLOW; anything else skips one block to KILL. */
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_open, 0, 1),
    BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
    BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL),
};

First we set PR_SET_NO_NEW_PRIVS, which prevents child processes from gaining more privileges than their parent. This is needed so that the following call to prctl, which installs the seccomp filter using the PR_SET_SECCOMP option, succeeds even when not running as root.

/* Set the no-new-privileges flag; on failure (-1) report the error and
 * leave with status 1. */
if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == -1) {
    perror("setting new privileges");
    return 1;
    }

Set seccomp

/* Install the filter in SECCOMP_MODE_FILTER; on failure (-1) report the
 * error and leave with status 1.
 * NOTE(review): prog is presumably the struct sock_fprog wrapping the
 * filter array -- it is not declared in this snippet. */
if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) == -1) {
    perror("seccomp filter error");
    return 1;
    }

Simple example

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <stddef.h>
#include <sys/prctl.h>
#include <linux/seccomp.h>
#include <linux/filter.h>
#include <linux/unistd.h>
#include <linux/audit.h>

/* Offsets of the seccomp_data fields the filter loads. */
#define syscall_nr (offsetof(struct seccomp_data, nr))
#define arch_nr (offsetof(struct seccomp_data, arch))

/* architecture x86_64 */
#define REG_SYSCALL REG_RAX /* not used by this example */
#define ARCH_NR AUDIT_ARCH_X86_64

/*
 * Expands to two filter blocks: compare the loaded syscall number with
 * __NR_<name>; on a match fall through to SECCOMP_RET_ALLOW, otherwise skip
 * that block and continue with the next entry.
 */
#define ALLOW_SYSCALL(name) \
    BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_##name, 0, 1), \
    BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW)

/*
 * Installs a seccomp-BPF allow list and then opens and closes /etc/passwd
 * to show that permitted syscalls still work.
 *
 * Returns 0 on success, EXIT_FAILURE if a prctl() call or the open() fails.
 */
int main(int argc, char* argv[])
{
    int fd;

    struct sock_filter filter[] = {
        /* Validate architecture. */
        BPF_STMT(BPF_LD+BPF_W+BPF_ABS, arch_nr),
        BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, ARCH_NR, 1, 0),
        BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL),
        /* Get system call number. */
        BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_nr),
        /* List allowed syscalls. */
        ALLOW_SYSCALL(rt_sigreturn),
        ALLOW_SYSCALL(exit_group),
        ALLOW_SYSCALL(exit),
        ALLOW_SYSCALL(read),
        ALLOW_SYSCALL(write),
        ALLOW_SYSCALL(open),
        /* glibc >= 2.26 implements open() via openat(); without this entry
         * the open() call below gets the process killed. */
        ALLOW_SYSCALL(openat),
        ALLOW_SYSCALL(dup),
        ALLOW_SYSCALL(fcntl),
        ALLOW_SYSCALL(brk),
        ALLOW_SYSCALL(fstat),
        ALLOW_SYSCALL(close),
        /* Kill Process */
        BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL),
    };

    struct sock_fprog prog = {
        .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
        .filter = filter,
    };

    /* perror() appends ": <errno text>" and a newline itself, so the
     * messages carry no trailing "\n". */
    if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == -1) {
        perror("prctl(PR_SET_NO_NEW_PRIVS)");
        return EXIT_FAILURE;
    }

    if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) == -1) {
        perror("Seccomp filter error");
        return EXIT_FAILURE;
    }

    /* Assign first, compare second: "fd = open(...) == -1" would store the
     * result of the comparison (0 on success) in fd, so close() would then
     * close stdin and leak the real descriptor. */
    fd = open("/etc/passwd", O_RDONLY);
    if (fd == -1) {
        perror("Error");
        return EXIT_FAILURE;
    }

    close(fd);

    return 0;
}

sources

https://outflux.net/teach-seccomp/
http://www.alfonsobeato.net/c/filter-and-modify-system-calls-with-seccomp-and-ptrace/
https://www.illogicalexpressions.com/linux/2016/08/31/seccomp-and-seccomp-bpf.html
https://www.kernel.org/doc/Documentation/networking/filter.txt
https://www.kernel.org/doc/Documentation/prctl/seccomp_filter.txt
http://elixir.free-electrons.com/linux/latest/source/include/uapi/linux/seccomp.h#L47