From d4f65b5d2497b2fd9c45f06b71deb4ab084a5b66 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 13 Sep 2012 13:06:29 +0100 Subject: KEYS: Add payload preparsing opportunity prior to key instantiate or update Give the key type the opportunity to preparse the payload prior to the instantiation and update routines being called. This is done with the provision of two new key type operations: int (*preparse)(struct key_preparsed_payload *prep); void (*free_preparse)(struct key_preparsed_payload *prep); If the first operation is present, then it is called before key creation (in the add/update case) or before the key semaphore is taken (in the update and instantiate cases). The second operation is called to clean up if the first was called. preparse() is given the opportunity to fill in the following structure: struct key_preparsed_payload { char *description; void *type_data[2]; void *payload; const void *data; size_t datalen; size_t quotalen; }; Before the preparser is called, the first three fields will have been cleared, the payload pointer and size will be stored in data and datalen and the default quota size from the key_type struct will be stored into quotalen. The preparser may parse the payload in any way it likes and may store data in the type_data[] and payload fields for use by the instantiate() and update() ops. The preparser may also propose a description for the key by attaching it as a string to the description field. This can be used by passing a NULL or "" description to the add_key() system call or the key_create_or_update() function. This cannot work with request_key() as that required the description to tell the upcall about the key to be created. This, for example permits keys that store PGP public keys to generate their own name from the user ID and public key fingerprint in the key. The instantiate() and update() operations are then modified to look like this: int (*instantiate)(struct key *key, struct key_preparsed_payload *prep); int (*update)(struct key *key, struct key_preparsed_payload *prep); and the new payload data is passed in *prep, whether or not it was preparsed. Signed-off-by: David Howells --- Documentation/security/keys.txt | 50 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/security/keys.txt b/Documentation/security/keys.txt index aa0dbd74b71..7d9ca92022d 100644 --- a/Documentation/security/keys.txt +++ b/Documentation/security/keys.txt @@ -412,6 +412,10 @@ The main syscalls are: to the keyring. In this case, an error will be generated if the process does not have permission to write to the keyring. + If the key type supports it, if the description is NULL or an empty + string, the key type will try and generate a description from the content + of the payload. + The payload is optional, and the pointer can be NULL if not required by the type. The payload is plen in size, and plen can be zero for an empty payload. @@ -1114,12 +1118,53 @@ The structure has a number of fields, some of which are mandatory: it should return 0. - (*) int (*instantiate)(struct key *key, const void *data, size_t datalen); + (*) int (*preparse)(struct key_preparsed_payload *prep); + + This optional method permits the key type to attempt to parse payload + before a key is created (add key) or the key semaphore is taken (update or + instantiate key). The structure pointed to by prep looks like: + + struct key_preparsed_payload { + char *description; + void *type_data[2]; + void *payload; + const void *data; + size_t datalen; + size_t quotalen; + }; + + Before calling the method, the caller will fill in data and datalen with + the payload blob parameters; quotalen will be filled in with the default + quota size from the key type and the rest will be cleared. + + If a description can be proposed from the payload contents, that should be + attached as a string to the description field. This will be used for the + key description if the caller of add_key() passes NULL or "". + + The method can attach anything it likes to type_data[] and payload. These + are merely passed along to the instantiate() or update() operations. + + The method should return 0 if success ful or a negative error code + otherwise. + + + (*) void (*free_preparse)(struct key_preparsed_payload *prep); + + This method is only required if the preparse() method is provided, + otherwise it is unused. It cleans up anything attached to the + description, type_data and payload fields of the key_preparsed_payload + struct as filled in by the preparse() method. + + + (*) int (*instantiate)(struct key *key, struct key_preparsed_payload *prep); This method is called to attach a payload to a key during construction. The payload attached need not bear any relation to the data passed to this function. + The prep->data and prep->datalen fields will define the original payload + blob. If preparse() was supplied then other fields may be filled in also. + If the amount of data attached to the key differs from the size in keytype->def_datalen, then key_payload_reserve() should be called. @@ -1135,6 +1180,9 @@ The structure has a number of fields, some of which are mandatory: If this type of key can be updated, then this method should be provided. It is called to update a key's payload from the blob of data provided. + The prep->data and prep->datalen fields will define the original payload + blob. If preparse() was supplied then other fields may be filled in also. + key_payload_reserve() should be called if the data length might change before any changes are actually made. Note that if this succeeds, the type is committed to changing the key because it's already been altered, so all -- cgit v1.2.3-70-g09d2 From 87b526d349b04c31d7b3a40b434eb3f825d22305 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 1 Oct 2012 11:40:45 -0700 Subject: seccomp: Make syscall skipping and nr changes more consistent This fixes two issues that could cause incompatibility between kernel versions: - If a tracer uses SECCOMP_RET_TRACE to select a syscall number higher than the largest known syscall, emulate the unknown vsyscall by returning -ENOSYS. (This is unlikely to make a noticeable difference on x86-64 due to the way the system call entry works.) - On x86-64 with vsyscall=emulate, skipped vsyscalls were buggy. This updates the documentation accordingly. Signed-off-by: Andy Lutomirski Acked-by: Will Drewry Signed-off-by: James Morris --- Documentation/prctl/seccomp_filter.txt | 74 ++++++++++++++++++++-- arch/x86/kernel/vsyscall_64.c | 110 ++++++++++++++++++--------------- kernel/seccomp.c | 13 +++- 3 files changed, 137 insertions(+), 60 deletions(-) (limited to 'Documentation') diff --git a/Documentation/prctl/seccomp_filter.txt b/Documentation/prctl/seccomp_filter.txt index 597c3c58137..1e469ef7577 100644 --- a/Documentation/prctl/seccomp_filter.txt +++ b/Documentation/prctl/seccomp_filter.txt @@ -95,12 +95,15 @@ SECCOMP_RET_KILL: SECCOMP_RET_TRAP: Results in the kernel sending a SIGSYS signal to the triggering - task without executing the system call. The kernel will - rollback the register state to just before the system call - entry such that a signal handler in the task will be able to - inspect the ucontext_t->uc_mcontext registers and emulate - system call success or failure upon return from the signal - handler. + task without executing the system call. siginfo->si_call_addr + will show the address of the system call instruction, and + siginfo->si_syscall and siginfo->si_arch will indicate which + syscall was attempted. The program counter will be as though + the syscall happened (i.e. it will not point to the syscall + instruction). The return value register will contain an arch- + dependent value -- if resuming execution, set it to something + sensible. (The architecture dependency is because replacing + it with -ENOSYS could overwrite some useful information.) The SECCOMP_RET_DATA portion of the return value will be passed as si_errno. @@ -123,6 +126,18 @@ SECCOMP_RET_TRACE: the BPF program return value will be available to the tracer via PTRACE_GETEVENTMSG. + The tracer can skip the system call by changing the syscall number + to -1. Alternatively, the tracer can change the system call + requested by changing the system call to a valid syscall number. If + the tracer asks to skip the system call, then the system call will + appear to return the value that the tracer puts in the return value + register. + + The seccomp check will not be run again after the tracer is + notified. (This means that seccomp-based sandboxes MUST NOT + allow use of ptrace, even of other sandboxed processes, without + extreme care; ptracers can use this mechanism to escape.) + SECCOMP_RET_ALLOW: Results in the system call being executed. @@ -161,3 +176,50 @@ architecture supports both ptrace_event and seccomp, it will be able to support seccomp filter with minor fixup: SIGSYS support and seccomp return value checking. Then it must just add CONFIG_HAVE_ARCH_SECCOMP_FILTER to its arch-specific Kconfig. + + + +Caveats +------- + +The vDSO can cause some system calls to run entirely in userspace, +leading to surprises when you run programs on different machines that +fall back to real syscalls. To minimize these surprises on x86, make +sure you test with +/sys/devices/system/clocksource/clocksource0/current_clocksource set to +something like acpi_pm. + +On x86-64, vsyscall emulation is enabled by default. (vsyscalls are +legacy variants on vDSO calls.) Currently, emulated vsyscalls will honor seccomp, with a few oddities: + +- A return value of SECCOMP_RET_TRAP will set a si_call_addr pointing to + the vsyscall entry for the given call and not the address after the + 'syscall' instruction. Any code which wants to restart the call + should be aware that (a) a ret instruction has been emulated and (b) + trying to resume the syscall will again trigger the standard vsyscall + emulation security checks, making resuming the syscall mostly + pointless. + +- A return value of SECCOMP_RET_TRACE will signal the tracer as usual, + but the syscall may not be changed to another system call using the + orig_rax register. It may only be changed to -1 order to skip the + currently emulated call. Any other change MAY terminate the process. + The rip value seen by the tracer will be the syscall entry address; + this is different from normal behavior. The tracer MUST NOT modify + rip or rsp. (Do not rely on other changes terminating the process. + They might work. For example, on some kernels, choosing a syscall + that only exists in future kernels will be correctly emulated (by + returning -ENOSYS). + +To detect this quirky behavior, check for addr & ~0x0C00 == +0xFFFFFFFFFF600000. (For SECCOMP_RET_TRACE, use rip. For +SECCOMP_RET_TRAP, use siginfo->si_call_addr.) Do not check any other +condition: future kernels may improve vsyscall emulation and current +kernels in vsyscall=native mode will behave differently, but the +instructions at 0xF...F600{0,4,8,C}00 will not be system calls in these +cases. + +Note that modern systems are unlikely to use vsyscalls at all -- they +are a legacy feature and they are considerably slower than standard +syscalls. New code will use the vDSO, and vDSO-issued system calls +are indistinguishable from normal system calls. diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 8d141b30904..b2e58a248b3 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -136,19 +136,6 @@ static int addr_to_vsyscall_nr(unsigned long addr) return nr; } -#ifdef CONFIG_SECCOMP -static int vsyscall_seccomp(struct task_struct *tsk, int syscall_nr) -{ - if (!seccomp_mode(&tsk->seccomp)) - return 0; - task_pt_regs(tsk)->orig_ax = syscall_nr; - task_pt_regs(tsk)->ax = syscall_nr; - return __secure_computing(syscall_nr); -} -#else -#define vsyscall_seccomp(_tsk, _nr) 0 -#endif - static bool write_ok_or_segv(unsigned long ptr, size_t size) { /* @@ -181,10 +168,9 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) { struct task_struct *tsk; unsigned long caller; - int vsyscall_nr; + int vsyscall_nr, syscall_nr, tmp; int prev_sig_on_uaccess_error; long ret; - int skip; /* * No point in checking CS -- the only way to get here is a user mode @@ -216,56 +202,84 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) } tsk = current; - /* - * With a real vsyscall, page faults cause SIGSEGV. We want to - * preserve that behavior to make writing exploits harder. - */ - prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; - current_thread_info()->sig_on_uaccess_error = 1; /* + * Check for access_ok violations and find the syscall nr. + * * NULL is a valid user pointer (in the access_ok sense) on 32-bit and * 64-bit, so we don't need to special-case it here. For all the * vsyscalls, NULL means "don't write anything" not "write it at * address 0". */ - ret = -EFAULT; - skip = 0; switch (vsyscall_nr) { case 0: - skip = vsyscall_seccomp(tsk, __NR_gettimeofday); - if (skip) - break; - if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || - !write_ok_or_segv(regs->si, sizeof(struct timezone))) - break; + !write_ok_or_segv(regs->si, sizeof(struct timezone))) { + ret = -EFAULT; + goto check_fault; + } + + syscall_nr = __NR_gettimeofday; + break; + + case 1: + if (!write_ok_or_segv(regs->di, sizeof(time_t))) { + ret = -EFAULT; + goto check_fault; + } + + syscall_nr = __NR_time; + break; + + case 2: + if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || + !write_ok_or_segv(regs->si, sizeof(unsigned))) { + ret = -EFAULT; + goto check_fault; + } + + syscall_nr = __NR_getcpu; + break; + } + + /* + * Handle seccomp. regs->ip must be the original value. + * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt. + * + * We could optimize the seccomp disabled case, but performance + * here doesn't matter. + */ + regs->orig_ax = syscall_nr; + regs->ax = -ENOSYS; + tmp = secure_computing(syscall_nr); + if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { + warn_bad_vsyscall(KERN_DEBUG, regs, + "seccomp tried to change syscall nr or ip"); + do_exit(SIGSYS); + } + if (tmp) + goto do_ret; /* skip requested */ + /* + * With a real vsyscall, page faults cause SIGSEGV. We want to + * preserve that behavior to make writing exploits harder. + */ + prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; + current_thread_info()->sig_on_uaccess_error = 1; + + ret = -EFAULT; + switch (vsyscall_nr) { + case 0: ret = sys_gettimeofday( (struct timeval __user *)regs->di, (struct timezone __user *)regs->si); break; case 1: - skip = vsyscall_seccomp(tsk, __NR_time); - if (skip) - break; - - if (!write_ok_or_segv(regs->di, sizeof(time_t))) - break; - ret = sys_time((time_t __user *)regs->di); break; case 2: - skip = vsyscall_seccomp(tsk, __NR_getcpu); - if (skip) - break; - - if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || - !write_ok_or_segv(regs->si, sizeof(unsigned))) - break; - ret = sys_getcpu((unsigned __user *)regs->di, (unsigned __user *)regs->si, NULL); @@ -274,12 +288,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; - if (skip) { - if ((long)regs->ax <= 0L) /* seccomp errno emulation */ - goto do_ret; - goto done; /* seccomp trace/trap */ - } - +check_fault: if (ret == -EFAULT) { /* Bad news -- userspace fed a bad pointer to a vsyscall. */ warn_bad_vsyscall(KERN_INFO, regs, @@ -302,7 +311,6 @@ do_ret: /* Emulate a ret instruction. */ regs->ip = caller; regs->sp += 8; -done: return true; sigsegv: diff --git a/kernel/seccomp.c b/kernel/seccomp.c index ee376beedaf..5af44b59377 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -396,25 +396,29 @@ int __secure_computing(int this_syscall) #ifdef CONFIG_SECCOMP_FILTER case SECCOMP_MODE_FILTER: { int data; + struct pt_regs *regs = task_pt_regs(current); ret = seccomp_run_filters(this_syscall); data = ret & SECCOMP_RET_DATA; ret &= SECCOMP_RET_ACTION; switch (ret) { case SECCOMP_RET_ERRNO: /* Set the low-order 16-bits as a errno. */ - syscall_set_return_value(current, task_pt_regs(current), + syscall_set_return_value(current, regs, -data, 0); goto skip; case SECCOMP_RET_TRAP: /* Show the handler the original registers. */ - syscall_rollback(current, task_pt_regs(current)); + syscall_rollback(current, regs); /* Let the filter pass back 16 bits of data. */ seccomp_send_sigsys(this_syscall, data); goto skip; case SECCOMP_RET_TRACE: /* Skip these calls if there is no tracer. */ - if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) + if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { + syscall_set_return_value(current, regs, + -ENOSYS, 0); goto skip; + } /* Allow the BPF to provide the event message */ ptrace_event(PTRACE_EVENT_SECCOMP, data); /* @@ -425,6 +429,9 @@ int __secure_computing(int this_syscall) */ if (fatal_signal_pending(current)) break; + if (syscall_get_nr(current, regs) < 0) + goto skip; /* Explicit request to skip. */ + return 0; case SECCOMP_RET_ALLOW: return 0; -- cgit v1.2.3-70-g09d2 From f8aa23a55f813c9bddec2a6176e0e67274e6e7c1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 2 Oct 2012 19:24:56 +0100 Subject: KEYS: Use keyring_alloc() to create special keyrings Use keyring_alloc() to create special keyrings now that it has a permissions parameter rather than using key_alloc() + key_instantiate_and_link(). Also document and export keyring_alloc() so that modules can use it too. Signed-off-by: David Howells --- Documentation/security/keys.txt | 17 +++++++++++++++++ fs/cifs/cifsacl.c | 12 ++++-------- fs/nfs/idmap.c | 12 ++++-------- net/dns_resolver/dns_key.c | 13 +++++-------- security/keys/keyring.c | 1 + 5 files changed, 31 insertions(+), 24 deletions(-) (limited to 'Documentation') diff --git a/Documentation/security/keys.txt b/Documentation/security/keys.txt index aa0dbd74b71..a4f9125c033 100644 --- a/Documentation/security/keys.txt +++ b/Documentation/security/keys.txt @@ -990,6 +990,23 @@ payload contents" for more information. reference pointer if successful. +(*) A keyring can be created by: + + struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid, + const struct cred *cred, + key_perm_t perm, + unsigned long flags, + struct key *dest); + + This creates a keyring with the given attributes and returns it. If dest + is not NULL, the new keyring will be linked into the keyring to which it + points. No permission checks are made upon the destination keyring. + + Error EDQUOT can be returned if the keyring would overload the quota (pass + KEY_ALLOC_NOT_IN_QUOTA in flags if the keyring shouldn't be accounted + towards the user's quota). Error ENOMEM can also be returned. + + (*) To check the validity of a key, this function can be called: int validate_key(struct key *key); diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 05f4dc263a2..a8a753c8fcd 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -537,19 +537,15 @@ init_cifs_idmap(void) if (!cred) return -ENOMEM; - keyring = key_alloc(&key_type_keyring, ".cifs_idmap", 0, 0, cred, - (KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ, - KEY_ALLOC_NOT_IN_QUOTA); + keyring = keyring_alloc(".cifs_idmap", 0, 0, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto failed_put_cred; } - ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL); - if (ret < 0) - goto failed_put_key; - ret = register_key_type(&cifs_idmap_key_type); if (ret < 0) goto failed_put_key; diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index a850079467d..957134b4c0f 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -192,19 +192,15 @@ static int nfs_idmap_init_keyring(void) if (!cred) return -ENOMEM; - keyring = key_alloc(&key_type_keyring, ".id_resolver", 0, 0, cred, - (KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ, - KEY_ALLOC_NOT_IN_QUOTA); + keyring = keyring_alloc(".id_resolver", 0, 0, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto failed_put_cred; } - ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL); - if (ret < 0) - goto failed_put_key; - ret = register_key_type(&key_type_id_resolver); if (ret < 0) goto failed_put_key; diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c index d9507dd0581..f2c379d835e 100644 --- a/net/dns_resolver/dns_key.c +++ b/net/dns_resolver/dns_key.c @@ -259,19 +259,15 @@ static int __init init_dns_resolver(void) if (!cred) return -ENOMEM; - keyring = key_alloc(&key_type_keyring, ".dns_resolver", 0, 0, cred, - (KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ, - KEY_ALLOC_NOT_IN_QUOTA); + keyring = keyring_alloc(".dns_resolver", 0, 0, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto failed_put_cred; } - ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL); - if (ret < 0) - goto failed_put_key; - ret = register_key_type(&key_type_dns_resolver); if (ret < 0) goto failed_put_key; @@ -303,3 +299,4 @@ static void __exit exit_dns_resolver(void) module_init(init_dns_resolver) module_exit(exit_dns_resolver) MODULE_LICENSE("GPL"); + diff --git a/security/keys/keyring.c b/security/keys/keyring.c index cf704a92083..8c25558da14 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -275,6 +275,7 @@ struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid, return keyring; } +EXPORT_SYMBOL(keyring_alloc); /** * keyring_search_aux - Search a keyring tree for a key matching some criteria -- cgit v1.2.3-70-g09d2