diff --git a/libcontainer/seccomp/patchbpf/enosys_linux.go b/libcontainer/seccomp/patchbpf/enosys_linux.go index efe6dca58b2..d459ba8792c 100644 --- a/libcontainer/seccomp/patchbpf/enosys_linux.go +++ b/libcontainer/seccomp/patchbpf/enosys_linux.go @@ -164,11 +164,11 @@ func disassembleFilter(filter *libseccomp.ScmpFilter) ([]bpf.Instruction, error) return program, nil } -type nativeArch uint32 +type linuxAuditArch uint32 -const invalidArch nativeArch = 0 +const invalidArch linuxAuditArch = 0 -func archToNative(arch libseccomp.ScmpArch) (nativeArch, error) { +func scmpArchToAuditArch(arch libseccomp.ScmpArch) (linuxAuditArch, error) { switch arch { case libseccomp.ArchNative: // Convert to actual native architecture. @@ -176,85 +176,89 @@ func archToNative(arch libseccomp.ScmpArch) (nativeArch, error) { if err != nil { return invalidArch, fmt.Errorf("unable to get native arch: %w", err) } - return archToNative(arch) + return scmpArchToAuditArch(arch) case libseccomp.ArchX86: - return nativeArch(C.C_AUDIT_ARCH_I386), nil + return linuxAuditArch(C.C_AUDIT_ARCH_I386), nil case libseccomp.ArchAMD64, libseccomp.ArchX32: // NOTE: x32 is treated like x86_64 except all x32 syscalls have the // 30th bit of the syscall number set to indicate that it's not a // normal x86_64 syscall. - return nativeArch(C.C_AUDIT_ARCH_X86_64), nil + return linuxAuditArch(C.C_AUDIT_ARCH_X86_64), nil case libseccomp.ArchARM: - return nativeArch(C.C_AUDIT_ARCH_ARM), nil + return linuxAuditArch(C.C_AUDIT_ARCH_ARM), nil case libseccomp.ArchARM64: - return nativeArch(C.C_AUDIT_ARCH_AARCH64), nil + return linuxAuditArch(C.C_AUDIT_ARCH_AARCH64), nil case libseccomp.ArchMIPS: - return nativeArch(C.C_AUDIT_ARCH_MIPS), nil + return linuxAuditArch(C.C_AUDIT_ARCH_MIPS), nil case libseccomp.ArchMIPS64: - return nativeArch(C.C_AUDIT_ARCH_MIPS64), nil + return linuxAuditArch(C.C_AUDIT_ARCH_MIPS64), nil case libseccomp.ArchMIPS64N32: - return nativeArch(C.C_AUDIT_ARCH_MIPS64N32), nil + return linuxAuditArch(C.C_AUDIT_ARCH_MIPS64N32), nil case libseccomp.ArchMIPSEL: - return nativeArch(C.C_AUDIT_ARCH_MIPSEL), nil + return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL), nil case libseccomp.ArchMIPSEL64: - return nativeArch(C.C_AUDIT_ARCH_MIPSEL64), nil + return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL64), nil case libseccomp.ArchMIPSEL64N32: - return nativeArch(C.C_AUDIT_ARCH_MIPSEL64N32), nil + return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL64N32), nil case libseccomp.ArchPPC: - return nativeArch(C.C_AUDIT_ARCH_PPC), nil + return linuxAuditArch(C.C_AUDIT_ARCH_PPC), nil case libseccomp.ArchPPC64: - return nativeArch(C.C_AUDIT_ARCH_PPC64), nil + return linuxAuditArch(C.C_AUDIT_ARCH_PPC64), nil case libseccomp.ArchPPC64LE: - return nativeArch(C.C_AUDIT_ARCH_PPC64LE), nil + return linuxAuditArch(C.C_AUDIT_ARCH_PPC64LE), nil case libseccomp.ArchS390: - return nativeArch(C.C_AUDIT_ARCH_S390), nil + return linuxAuditArch(C.C_AUDIT_ARCH_S390), nil case libseccomp.ArchS390X: - return nativeArch(C.C_AUDIT_ARCH_S390X), nil + return linuxAuditArch(C.C_AUDIT_ARCH_S390X), nil case libseccomp.ArchRISCV64: - return nativeArch(C.C_AUDIT_ARCH_RISCV64), nil + return linuxAuditArch(C.C_AUDIT_ARCH_RISCV64), nil default: return invalidArch, fmt.Errorf("unknown architecture: %v", arch) } } -type lastSyscallMap map[nativeArch]map[libseccomp.ScmpArch]libseccomp.ScmpSyscall +type lastSyscallMap map[linuxAuditArch]map[libseccomp.ScmpArch]libseccomp.ScmpSyscall // Figure out largest syscall number referenced in the filter for each // architecture. We will be generating code based on the native architecture // representation, but SCMP_ARCH_X32 means we have to track cases where the // same architecture has different largest syscalls based on the mode. func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) { - lastSyscalls := make(lastSyscallMap) - // Only loop over architectures which are present in the filter. Any other - // architectures will get the libseccomp bad architecture action anyway. + scmpArchs := make(map[libseccomp.ScmpArch]struct{}) for _, ociArch := range config.Architectures { arch, err := libseccomp.GetArchFromString(ociArch) if err != nil { return nil, fmt.Errorf("unable to validate seccomp architecture: %w", err) } + scmpArchs[arch] = struct{}{} + } + // On architectures like ppc64le, Docker inexplicably doesn't include the + // native architecture in the architecture list which results in no + // architectures being present in the list at all (rendering the ENOSYS + // stub a no-op). So, always include the native architecture. + if nativeScmpArch, err := libseccomp.GetNativeArch(); err != nil { + return nil, fmt.Errorf("unable to get native arch: %w", err) + } else if _, ok := scmpArchs[nativeScmpArch]; !ok { + logrus.Debugf("seccomp: adding implied native architecture %v to config set", nativeScmpArch) + scmpArchs[nativeScmpArch] = struct{}{} + } + logrus.Debugf("seccomp: configured architecture set: %s", scmpArchs) - // Map native architecture to a real architecture value to avoid - // doubling-up the lastSyscall mapping. - if arch == libseccomp.ArchNative { - nativeArch, err := libseccomp.GetNativeArch() - if err != nil { - return nil, fmt.Errorf("unable to get native architecture: %w", err) - } - arch = nativeArch - } - - // Figure out native architecture representation of the architecture. - nativeArch, err := archToNative(arch) + // Only loop over architectures which are present in the filter. Any other + // architectures will get the libseccomp bad architecture action anyway. + lastSyscalls := make(lastSyscallMap) + for arch := range scmpArchs { + auditArch, err := scmpArchToAuditArch(arch) if err != nil { return nil, fmt.Errorf("cannot map architecture %v to AUDIT_ARCH_ constant: %w", arch, err) } - if _, ok := lastSyscalls[nativeArch]; !ok { - lastSyscalls[nativeArch] = map[libseccomp.ScmpArch]libseccomp.ScmpSyscall{} + if _, ok := lastSyscalls[auditArch]; !ok { + lastSyscalls[auditArch] = map[libseccomp.ScmpArch]libseccomp.ScmpSyscall{} } - if _, ok := lastSyscalls[nativeArch][arch]; ok { + if _, ok := lastSyscalls[auditArch][arch]; ok { // Because of ArchNative we may hit the same entry multiple times. - // Just skip it if we've seen this (nativeArch, ScmpArch) + // Just skip it if we've seen this (linuxAuditArch, ScmpArch) // combination before. continue } @@ -272,10 +276,11 @@ func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) { } } if largestSyscall != 0 { - lastSyscalls[nativeArch][arch] = largestSyscall + logrus.Debugf("seccomp: largest syscall number for arch %v is %v", arch, largestSyscall) + lastSyscalls[auditArch][arch] = largestSyscall } else { - logrus.Warnf("could not find any syscalls for arch %s", ociArch) - delete(lastSyscalls[nativeArch], arch) + logrus.Warnf("could not find any syscalls for arch %v", arch) + delete(lastSyscalls[auditArch], arch) } } return lastSyscalls, nil @@ -293,10 +298,10 @@ func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) { // close_range(2) which were added out-of-order in the syscall table between // kernel releases. func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) { - // A jump-table for each nativeArch used to generate the initial + // A jump-table for each linuxAuditArch used to generate the initial // conditional jumps -- measured from the *END* of the program so they // remain valid after prepending to the tail. - archJumpTable := map[nativeArch]uint32{} + archJumpTable := map[linuxAuditArch]uint32{} // Generate our own -ENOSYS rules for each architecture. They have to be // generated in reverse (prepended to the tail of the program) because the @@ -309,7 +314,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) } // Generate the syscall -ENOSYS rules. - for nativeArch, maxSyscalls := range lastSyscalls { + for auditArch, maxSyscalls := range lastSyscalls { // The number of instructions from the tail of this section which need // to be jumped in order to reach the -ENOSYS return. If the section // does not jump, it will fall through to the actual filter. @@ -390,7 +395,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) // If we're on x86 we need to add a check for x32 and if we're in // the wrong mode we jump over the section. - if uint32(nativeArch) == uint32(C.C_AUDIT_ARCH_X86_64) { + if uint32(auditArch) == uint32(C.C_AUDIT_ARCH_X86_64) { // Generate a prefix to check the mode. switch scmpArch { case libseccomp.ArchAMD64: @@ -419,8 +424,8 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) section = append(section, sectionTail...) case 2: // x32 and x86_64 are a unique case, we can't handle any others. - if uint32(nativeArch) != uint32(C.C_AUDIT_ARCH_X86_64) { - return nil, fmt.Errorf("unknown architecture overlap on native arch %#x", nativeArch) + if uint32(auditArch) != uint32(C.C_AUDIT_ARCH_X86_64) { + return nil, fmt.Errorf("unknown architecture overlap on native arch %#x", auditArch) } x32sysno, ok := maxSyscalls[libseccomp.ArchX32] @@ -497,7 +502,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) programTail = append(section, programTail...) // Update jump table. - archJumpTable[nativeArch] = uint32(len(programTail)) + archJumpTable[auditArch] = uint32(len(programTail)) } // Add a dummy "jump to filter" for any architecture we might miss below. @@ -517,9 +522,9 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) // architectures based on how large the jumps are going to be, or // re-sort the candidate architectures each time to make sure that we // pick the largest jump which is going to be smaller than 255. - for nativeArch := range lastSyscalls { + for auditArch := range lastSyscalls { // We jump forwards but the jump table is calculated from the *END*. - jump := uint32(len(programTail)) - archJumpTable[nativeArch] + jump := uint32(len(programTail)) - archJumpTable[auditArch] // Same routine as above -- this is a basic jeq check, complicated // slightly if it turns out that we need to do a long jump. @@ -528,7 +533,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) // jeq [arch],[jump] bpf.JumpIf{ Cond: bpf.JumpEqual, - Val: uint32(nativeArch), + Val: uint32(auditArch), SkipTrue: uint8(jump), }, }, programTail...) @@ -537,7 +542,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) // jne [arch],1 bpf.JumpIf{ Cond: bpf.JumpNotEqual, - Val: uint32(nativeArch), + Val: uint32(auditArch), SkipTrue: 1, }, // ja [jump] diff --git a/libcontainer/seccomp/patchbpf/enosys_linux_test.go b/libcontainer/seccomp/patchbpf/enosys_linux_test.go index e2d363a43bd..3d442e1daa6 100644 --- a/libcontainer/seccomp/patchbpf/enosys_linux_test.go +++ b/libcontainer/seccomp/patchbpf/enosys_linux_test.go @@ -12,6 +12,7 @@ import ( "github.com/opencontainers/runc/libcontainer/configs" libseccomp "github.com/seccomp/libseccomp-golang" + "github.com/sirupsen/logrus" "golang.org/x/net/bpf" ) @@ -23,7 +24,7 @@ type seccompData struct { } // mockSyscallPayload creates a fake seccomp_data struct with the given data. -func mockSyscallPayload(t *testing.T, sysno libseccomp.ScmpSyscall, arch nativeArch, args ...uint64) []byte { +func mockSyscallPayload(t *testing.T, sysno libseccomp.ScmpSyscall, arch linuxAuditArch, args ...uint64) []byte { var buf bytes.Buffer data := seccompData{ @@ -105,6 +106,18 @@ var testArches = []string{ "ppc64le", "s390", "s390x", + // Dummy value to indicate a configuration with no architecture specified. + "native", +} + +var nativeArch string + +func init() { + scmpNativeArch, err := libseccomp.GetNativeArch() + if err != nil { + logrus.Panicf("get native arch: %v", err) + } + nativeArch = scmpNativeArch.String() } func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string) { @@ -150,17 +163,20 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string) for _, arch := range testArches { type syscallTest struct { - syscall string sysno libseccomp.ScmpSyscall + syscall string expected uint32 } + if arch == "native" { + arch = nativeArch + } scmpArch, err := libseccomp.GetArchFromString(arch) if err != nil { t.Fatalf("unknown libseccomp architecture %q: %v", arch, err) } - nativeArch, err := archToNative(scmpArch) + auditArch, err := scmpArchToAuditArch(scmpArch) if err != nil { t.Fatalf("unknown audit architecture %q: %v", arch, err) } @@ -179,9 +195,9 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string) t.Fatalf("unknown syscall %q on arch %q: %v", syscall, arch, err) } syscallTests = append(syscallTests, syscallTest{ - syscall, - sysno, - expected, + sysno: sysno, + syscall: syscall, + expected: expected, }) } @@ -228,12 +244,19 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string) // Test syscalls in the explicit list. for _, test := range syscallTests { - // Override the expected value in the two special cases. - if !archSet[arch] || isAllowAction(defaultAction) { + // Override the expected value in the two special cases: + // 1. If the default action is allow, the filter won't have + // the stub prepended so we expect a fallthrough. + // 2. If the executing architecture is not in the architecture + // set, then the architecture is not handled by the stub -- + // *except* in the case of the native architecture (which + // is always included in the stub). + if isAllowAction(defaultAction) || + (!archSet[arch] && arch != nativeArch) { test.expected = retFallthrough } - payload := mockSyscallPayload(t, test.sysno, nativeArch, 0x1337, 0xF00BA5) + payload := mockSyscallPayload(t, test.sysno, auditArch, 0x1337, 0xF00BA5) // NOTE: golang.org/x/net/bpf returns int here rather // than uint32. rawRet, err := filter.Run(payload) @@ -247,7 +270,7 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string) t.Logf(" [%4.1d] %s", idx, insn) } t.Logf("payload: %#v", payload) - t.Errorf("filter %s(%d) %q(%d): got %#x, want %#x", arch, nativeArch, test.syscall, test.sysno, ret, test.expected) + t.Errorf("filter %s(%d) %q(%d): got %#x, want %#x", arch, auditArch, test.syscall, test.sysno, ret, test.expected) } } } @@ -263,7 +286,14 @@ var testActions = map[string]configs.Action{ func TestEnosysStub_SingleArch(t *testing.T) { for _, arch := range testArches { - arches := []string{arch} + var arches []string + // "native" indicates a blank architecture field for seccomp, to test + // the case where the running architecture was not included in the + // architecture. Docker doesn't always set the architecture for some + // reason (namely for ppc64le). + if arch != "native" { + arches = append(arches, arch) + } t.Run("arch="+arch, func(t *testing.T) { for name, action := range testActions { t.Run("action="+name, func(t *testing.T) { @@ -277,7 +307,16 @@ func TestEnosysStub_SingleArch(t *testing.T) { func TestEnosysStub_MultiArch(t *testing.T) { for end := 0; end < len(testArches); end++ { for start := 0; start < end; start++ { - arches := testArches[start:end] + var arches []string + for _, arch := range testArches[start:end] { + // "native" indicates a blank architecture field for seccomp, to test + // the case where the running architecture was not included in the + // architecture. Docker doesn't always set the architecture for some + // reason (namely for ppc64le). + if arch != "native" { + arches = append(arches, arch) + } + } if len(arches) <= 1 { continue }