From 47c459beabe969c6751e2ea8d1f85c5fa1652d6c Mon Sep 17 00:00:00 2001 From: Ganapatrao Kulkarni Date: Thu, 7 Jul 2016 10:18:17 +0530 Subject: [PATCH 01/16] arm64: Enable workaround for Cavium erratum 27456 on thunderx-81xx Cavium erratum 27456 commit 104a0c02e8b1 ("arm64: Add workaround for Cavium erratum 27456") is applicable for thunderx-81xx pass1.0 SoC as well. Adding code to enable to 81xx. Signed-off-by: Ganapatrao Kulkarni Reviewed-by: Andrew Pinski Signed-off-by: Will Deacon --- arch/arm64/include/asm/cputype.h | 2 ++ arch/arm64/kernel/cpu_errata.c | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 87e1985f3be8..9d9fd4b9a72e 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -80,12 +80,14 @@ #define APM_CPU_PART_POTENZA 0x000 #define CAVIUM_CPU_PART_THUNDERX 0x0A1 +#define CAVIUM_CPU_PART_THUNDERX_81XX 0x0A2 #define BRCM_CPU_PART_VULCAN 0x516 #define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53) #define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57) #define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) +#define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX) #ifndef __ASSEMBLY__ diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index d42789499f17..af716b65110d 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -98,6 +98,12 @@ const struct arm64_cpu_capabilities arm64_errata[] = { MIDR_RANGE(MIDR_THUNDERX, 0x00, (1 << MIDR_VARIANT_SHIFT) | 1), }, + { + /* Cavium ThunderX, T81 pass 1.0 */ + .desc = "Cavium erratum 27456", + .capability = ARM64_WORKAROUND_CAVIUM_27456, + MIDR_RANGE(MIDR_THUNDERX_81XX, 0x00, 0x00), + }, #endif { } -- 2.34.1 From 78c4e172412de5d0456dc00d2b34050aa0b683b5 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 5 Jul 2016 17:32:29 -0400 Subject: [PATCH 02/16] Revert "ecryptfs: forbid opening files without mmap handler" This reverts commit 2f36db71009304b3f0b95afacd8eba1f9f046b87. It fixed a local root exploit but also introduced a dependency on the lower file system implementing an mmap operation just to open a file, which is a bit of a heavy hammer. The right fix is to have mmap depend on the existence of the mmap handler instead. Signed-off-by: Jeff Mahoney Cc: stable@vger.kernel.org Signed-off-by: Tyler Hicks --- fs/ecryptfs/kthread.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c index e818f5ac7a26..866bb18efefe 100644 --- a/fs/ecryptfs/kthread.c +++ b/fs/ecryptfs/kthread.c @@ -25,7 +25,6 @@ #include #include #include -#include #include "ecryptfs_kernel.h" struct ecryptfs_open_req { @@ -148,7 +147,7 @@ int ecryptfs_privileged_open(struct file **lower_file, flags |= IS_RDONLY(d_inode(lower_dentry)) ? 
O_RDONLY : O_RDWR; (*lower_file) = dentry_open(&req.path, flags, cred); if (!IS_ERR(*lower_file)) - goto have_file; + goto out; if ((flags & O_ACCMODE) == O_RDONLY) { rc = PTR_ERR((*lower_file)); goto out; @@ -166,16 +165,8 @@ int ecryptfs_privileged_open(struct file **lower_file, mutex_unlock(&ecryptfs_kthread_ctl.mux); wake_up(&ecryptfs_kthread_ctl.wait); wait_for_completion(&req.done); - if (IS_ERR(*lower_file)) { + if (IS_ERR(*lower_file)) rc = PTR_ERR(*lower_file); - goto out; - } -have_file: - if ((*lower_file)->f_op->mmap == NULL) { - fput(*lower_file); - *lower_file = NULL; - rc = -EMEDIUMTYPE; - } out: return rc; } -- 2.34.1 From 30a46a4647fd1df9cf52e43bf467f0d9265096ca Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Thu, 7 Jul 2016 13:41:11 -0700 Subject: [PATCH 03/16] apparmor: fix oops, validate buffer size in apparmor_setprocattr() When proc_pid_attr_write() was changed to use memdup_user apparmor's (interface violating) assumption that the setprocattr buffer was always a single page was violated. The size test is not strictly speaking needed as proc_pid_attr_write() will reject anything larger, but for the sake of robustness we can keep it in. SMACK and SELinux look safe to me, but somebody else should probably have a look just in case. Based on original patch from Vegard Nossum modified for the case that apparmor provides null termination. Fixes: bb646cdb12e75d82258c2f2e7746d5952d3e321a Reported-by: Vegard Nossum Cc: Al Viro Cc: John Johansen Cc: Paul Moore Cc: Stephen Smalley Cc: Eric Paris Cc: Casey Schaufler Cc: stable@kernel.org Signed-off-by: John Johansen Reviewed-by: Tyler Hicks Signed-off-by: James Morris --- security/apparmor/lsm.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 2660fbcf94d1..7798e1608f4f 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -500,34 +500,34 @@ static int apparmor_setprocattr(struct task_struct *task, char *name, { struct common_audit_data sa; struct apparmor_audit_data aad = {0,}; - char *command, *args = value; + char *command, *largs = NULL, *args = value; size_t arg_size; int error; if (size == 0) return -EINVAL; - /* args points to a PAGE_SIZE buffer, AppArmor requires that - * the buffer must be null terminated or have size <= PAGE_SIZE -1 - * so that AppArmor can null terminate them - */ - if (args[size - 1] != '\0') { - if (size == PAGE_SIZE) - return -EINVAL; - args[size] = '\0'; - } - /* task can only write its own attributes */ if (current != task) return -EACCES; - args = value; + /* AppArmor requires that the buffer must be null terminated atm */ + if (args[size - 1] != '\0') { + /* null terminate */ + largs = args = kmalloc(size + 1, GFP_KERNEL); + if (!args) + return -ENOMEM; + memcpy(args, value, size); + args[size] = '\0'; + } + + error = -EINVAL; args = strim(args); command = strsep(&args, " "); if (!args) - return -EINVAL; + goto out; args = skip_spaces(args); if (!*args) - return -EINVAL; + goto out; arg_size = size - (args - (char *) value); if (strcmp(name, "current") == 0) { @@ -553,10 +553,12 @@ static int apparmor_setprocattr(struct task_struct *task, char *name, goto fail; } else /* only support the "current" and "exec" process attributes */ - return -EINVAL; + goto fail; if (!error) error = size; +out: + kfree(largs); return error; fail: @@ -565,9 +567,9 @@ fail: aad.profile = aa_current_profile(); aad.op = OP_SETPROCATTR; aad.info = name; - aad.error = -EINVAL; + aad.error = 
error = -EINVAL; aa_audit_msg(AUDIT_APPARMOR_DENIED, &sa, NULL); - return -EINVAL; + goto out; } static int apparmor_task_setrlimit(struct task_struct *task, -- 2.34.1 From 7469be95a487319514adce2304ad2af3553d2fc9 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 7 Jul 2016 01:32:04 -0600 Subject: [PATCH 04/16] xenbus: don't bail early from xenbus_dev_request_and_reply() xenbus_dev_request_and_reply() needs to track whether a transaction is open. For XS_TRANSACTION_START messages it calls transaction_start() and for XS_TRANSACTION_END messages it calls transaction_end(). If sending an XS_TRANSACTION_START message fails or responds with an an error, the transaction is not open and transaction_end() must be called. If sending an XS_TRANSACTION_END message fails, the transaction is still open, but if an error response is returned the transaction is closed. Commit 027bd7e89906 ("xen/xenbus: Avoid synchronous wait on XenBus stalling shutdown/restart") introduced a regression where failed XS_TRANSACTION_START messages were leaving the transaction open. This can cause problems with suspend (and migration) as all transactions must be closed before suspending. It appears that the problematic change was added accidentally, so just remove it. Signed-off-by: Jan Beulich Cc: Konrad Rzeszutek Wilk Cc: Signed-off-by: David Vrabel --- drivers/xen/xenbus/xenbus_xs.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index 374b12af8812..0bd3d47ad24d 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -249,9 +249,6 @@ void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg) mutex_unlock(&xs_state.request_mutex); - if (IS_ERR(ret)) - return ret; - if ((msg->type == XS_TRANSACTION_END) || ((req_msg.type == XS_TRANSACTION_START) && (msg->type == XS_ERROR))) -- 2.34.1 From e5a79475a7ae171fef82608c6e11f51bb85a6745 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 7 Jul 2016 01:32:35 -0600 Subject: [PATCH 05/16] xenbus: simplify xenbus_dev_request_and_reply() No need to retain a local copy of the full request message, only the type is really needed. 
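Taken together, the two xenbus changes reduce the bookkeeping to the single check kept in xenbus_dev_request_and_reply(); expressed as a standalone predicate it would look roughly like this (a sketch with a hypothetical helper name, not code from the driver):

        /*
         * Hypothetical helper mirroring the condition retained in
         * xenbus_dev_request_and_reply(): "type" is the request type saved
         * before sending, "msg_type" is msg->type as checked afterwards.
         * Assumes the xsd_sockmsg_type enum from xen/interface/io/xs_wire.h.
         */
        static bool needs_transaction_end(enum xsd_sockmsg_type type,
                                          enum xsd_sockmsg_type msg_type)
        {
                return msg_type == XS_TRANSACTION_END ||
                       (type == XS_TRANSACTION_START && msg_type == XS_ERROR);
        }
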
Signed-off-by: Jan Beulich Signed-off-by: David Vrabel --- drivers/xen/xenbus/xenbus_xs.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index 0bd3d47ad24d..22f7cd711c57 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -232,10 +232,10 @@ static void transaction_resume(void) void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg) { void *ret; - struct xsd_sockmsg req_msg = *msg; + enum xsd_sockmsg_type type = msg->type; int err; - if (req_msg.type == XS_TRANSACTION_START) + if (type == XS_TRANSACTION_START) transaction_start(); mutex_lock(&xs_state.request_mutex); @@ -250,8 +250,7 @@ void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg) mutex_unlock(&xs_state.request_mutex); if ((msg->type == XS_TRANSACTION_END) || - ((req_msg.type == XS_TRANSACTION_START) && - (msg->type == XS_ERROR))) + ((type == XS_TRANSACTION_START) && (msg->type == XS_ERROR))) transaction_end(); return ret; -- 2.34.1 From 6f2d9d99213514360034c6d52d2c3919290b3504 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 8 Jul 2016 06:15:07 -0600 Subject: [PATCH 06/16] xen/acpi: allow xen-acpi-processor driver to load on Xen 4.7 As of Xen 4.7 PV CPUID doesn't expose either of CPUID[1].ECX[7] and CPUID[0x80000007].EDX[7] anymore, causing the driver to fail to load on both Intel and AMD systems. Doing any kind of hardware capability checks in the driver as a prerequisite was wrong anyway: With the hypervisor being in charge, all such checking should be done by it. If ACPI data gets uploaded despite some missing capability, the hypervisor is free to ignore part or all of that data. Ditch the entire check_prereq() function, and do the only valid check (xen_initial_domain()) in the caller in its place. Signed-off-by: Jan Beulich Cc: Signed-off-by: David Vrabel --- drivers/xen/xen-acpi-processor.c | 35 +++----------------------------- 1 file changed, 3 insertions(+), 32 deletions(-) diff --git a/drivers/xen/xen-acpi-processor.c b/drivers/xen/xen-acpi-processor.c index 076970a54f89..4ce10bcca18b 100644 --- a/drivers/xen/xen-acpi-processor.c +++ b/drivers/xen/xen-acpi-processor.c @@ -423,36 +423,7 @@ upload: return 0; } -static int __init check_prereq(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - if (!xen_initial_domain()) - return -ENODEV; - - if (!acpi_gbl_FADT.smi_command) - return -ENODEV; - - if (c->x86_vendor == X86_VENDOR_INTEL) { - if (!cpu_has(c, X86_FEATURE_EST)) - return -ENODEV; - return 0; - } - if (c->x86_vendor == X86_VENDOR_AMD) { - /* Copied from powernow-k8.h, can't include ../cpufreq/powernow - * as we get compile warnings for the static functions. - */ -#define CPUID_FREQ_VOLT_CAPABILITIES 0x80000007 -#define USE_HW_PSTATE 0x00000080 - u32 eax, ebx, ecx, edx; - cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); - if ((edx & USE_HW_PSTATE) != USE_HW_PSTATE) - return -ENODEV; - return 0; - } - return -ENODEV; -} /* acpi_perf_data is a pointer to percpu data. 
*/ static struct acpi_processor_performance __percpu *acpi_perf_data; @@ -509,10 +480,10 @@ struct notifier_block xen_acpi_processor_resume_nb = { static int __init xen_acpi_processor_init(void) { unsigned int i; - int rc = check_prereq(); + int rc; - if (rc) - return rc; + if (!xen_initial_domain()) + return -ENODEV; nr_acpi_bits = get_max_acpi_id() + 1; acpi_ids_done = kcalloc(BITS_TO_LONGS(nr_acpi_bits), sizeof(unsigned long), GFP_KERNEL); -- 2.34.1 From f0fe970df3838c202ef6c07a4c2b36838ef0a88b Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 5 Jul 2016 17:32:30 -0400 Subject: [PATCH 07/16] ecryptfs: don't allow mmap when the lower fs doesn't support it There are legitimate reasons to disallow mmap on certain files, notably in sysfs or procfs. We shouldn't emulate mmap support on file systems that don't offer support natively. CVE-2016-1583 Signed-off-by: Jeff Mahoney Cc: stable@vger.kernel.org [tyhicks: clean up f_op check by using ecryptfs_file_to_lower()] Signed-off-by: Tyler Hicks --- fs/ecryptfs/file.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 53d0141b9c20..ca4e83750214 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -169,6 +169,19 @@ out: return rc; } +static int ecryptfs_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct file *lower_file = ecryptfs_file_to_lower(file); + /* + * Don't allow mmap on top of file systems that don't support it + * natively. If FILESYSTEM_MAX_STACK_DEPTH > 2 or ecryptfs + * allows recursive mounting, this will need to be extended. + */ + if (!lower_file->f_op->mmap) + return -ENODEV; + return generic_file_mmap(file, vma); +} + /** * ecryptfs_open * @inode: inode specifying file to open @@ -403,7 +416,7 @@ const struct file_operations ecryptfs_main_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, #endif - .mmap = generic_file_mmap, + .mmap = ecryptfs_mmap, .open = ecryptfs_open, .flush = ecryptfs_flush, .release = ecryptfs_release, -- 2.34.1 From 7f556567036cb7f89aabe2f0954b08566b4efb53 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 10 Jul 2016 16:46:32 -0700 Subject: [PATCH 08/16] tmpfs: fix regression hang in fallocate undo The well-spotted fallocate undo fix is good in most cases, but not when fallocate failed on the very first page. index 0 then passes lend -1 to shmem_undo_range(), and that has two bad effects: (a) that it will undo every fallocation throughout the file, unrestricted by the current range; but more importantly (b) it can cause the undo to hang, because lend -1 is treated as truncation, which makes it keep on retrying until every page has gone, but those already fully instantiated will never go away. Big thank you to xfstests generic/269 which demonstrates this. 
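The failing case is easy to reproduce with the range arithmetic alone (a user-space sketch of the shifted offsets, not the shmem code itself; 4K pages are assumed for the illustration):

        #include <stdio.h>

        #define PAGE_SHIFT 12   /* assumed 4K pages, for illustration only */

        int main(void)
        {
                unsigned long start = 0, index = 0; /* very first page failed */
                long long lstart = (long long)start << PAGE_SHIFT;
                long long lend = ((long long)index << PAGE_SHIFT) - 1;

                /*
                 * lend comes out as -1, which shmem_undo_range() treats as
                 * "truncate to end of file" rather than an empty range; the
                 * fix below only performs the undo when index > start.
                 */
                printf("undo range: %lld .. %lld\n", lstart, lend);
                return 0;
        }
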
Fixes: b9b4bb26af01 ("tmpfs: don't undo fallocate past its last page") Cc: stable@vger.kernel.org Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- mm/shmem.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 24463b67b6ef..171dee7a131f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2225,9 +2225,11 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, error = shmem_getpage(inode, index, &page, SGP_FALLOC); if (error) { /* Remove the !PageUptodate pages we added */ - shmem_undo_range(inode, - (loff_t)start << PAGE_SHIFT, - ((loff_t)index << PAGE_SHIFT) - 1, true); + if (index > start) { + shmem_undo_range(inode, + (loff_t)start << PAGE_SHIFT, + ((loff_t)index << PAGE_SHIFT) - 1, true); + } goto undone; } -- 2.34.1 From 92d21ac74a9e3c09b0b01c764e530657e4c85c49 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 10 Jul 2016 20:24:59 -0700 Subject: [PATCH 09/16] Linux 4.7-rc7 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 0d504893df6e..81b22628025a 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 4 PATCHLEVEL = 7 SUBLEVEL = 0 -EXTRAVERSION = -rc6 +EXTRAVERSION = -rc7 NAME = Psychotic Stoned Sheep # *DOCUMENTATION* -- 2.34.1 From a4949d83eb08908ec1d40bf1237ae68a3ab65a6e Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 22 Dec 2015 08:29:56 -0500 Subject: [PATCH 10/16] Restartable sequences system call (v7) MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Expose a new system call allowing each thread to register one userspace memory area to be used as an ABI between kernel and user-space for two purposes: user-space restartable sequences and quick access to read the current CPU number value from user-space. * Restartable sequences (per-cpu atomics) The restartable critical sections (percpu atomics) work has been started by Paul Turner and Andrew Hunter. It lets the kernel handle restart of critical sections. [1] [2] The re-implementation proposed here brings a few simplifications to the ABI which facilitates porting to other architectures and speeds up the user-space fast path. A locking-based fall-back, purely implemented in user-space, is proposed here to deal with debugger single-stepping. This fallback interacts with rseq_start() and rseq_finish(), which force retries in response to concurrent lock-based activity. 
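As a concrete example of the fast path, a per-cpu counter increment built on this ABI looks roughly like the following (a sketch using the do_rseq() helper from the selftests added later in this series; rseq_lock, the counters array, and the assumption that the calling thread already ran rseq_init_current_thread() are taken from that test code, not from the kernel ABI itself):

        #define _GNU_SOURCE
        #include <sched.h>      /* CPU_SETSIZE */
        #include <stdint.h>
        #include <stdbool.h>
        #include "rseq.h"       /* selftest helper library from this series */

        static struct rseq_lock rseq_lock;
        static intptr_t counters[CPU_SETSIZE];

        /* Increment the counter of the CPU the thread runs on; returns that CPU. */
        static int percpu_counter_inc(void)
        {
                struct rseq_state rseq_state;
                intptr_t *targetptr, newval;
                int cpu;
                bool result;

                /* Assumes rseq_init_current_thread() was called by this thread. */
                do_rseq(&rseq_lock, rseq_state, cpu, result, targetptr, newval,
                        {
                                /* Prepare the new value; the final store is the commit. */
                                newval = counters[cpu] + 1;
                                targetptr = &counters[cpu];
                        });
                return cpu;
        }
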
Here are benchmarks of counter increment in various scenarios compared to restartable sequences: ARMv7 Processor rev 4 (v7l) Machine model: Cubietruck Counter increment speed (ns/increment) 1 thread 2 threads global increment (baseline) 6 N/A percpu rseq increment 50 52 percpu rseq spinlock 94 94 global atomic increment 48 74 (__sync_add_and_fetch_4) global atomic CAS 50 172 (__sync_val_compare_and_swap_4) global pthread mutex 148 862 ARMv7 Processor rev 10 (v7l) Machine model: Wandboard Counter increment speed (ns/increment) 1 thread 4 threads global increment (baseline) 7 N/A percpu rseq increment 50 50 percpu rseq spinlock 82 84 global atomic increment 44 262 (__sync_add_and_fetch_4) global atomic CAS 46 316 (__sync_val_compare_and_swap_4) global pthread mutex 146 1400 x86-64 Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz: Counter increment speed (ns/increment) 1 thread 8 threads global increment (baseline) 3.0 N/A percpu rseq increment 3.6 3.8 percpu rseq spinlock 5.6 6.2 global LOCK; inc 8.0 166.4 global LOCK; cmpxchg 13.4 435.2 global pthread mutex 25.2 1363.6 * Reading the current CPU number Speeding up reading the current CPU number on which the caller thread is running is done by keeping the current CPU number up do date within the cpu_id field of the memory area registered by the thread. This is done by making scheduler migration set the TIF_NOTIFY_RESUME flag on the current thread. Upon return to user-space, a notify-resume handler updates the current CPU value within the registered user-space memory area. User-space can then read the current CPU number directly from memory. Keeping the current cpu id in a memory area shared between kernel and user-space is an improvement over current mechanisms available to read the current CPU number, which has the following benefits over alternative approaches: - 35x speedup on ARM vs system call through glibc - 20x speedup on x86 compared to calling glibc, which calls vdso executing a "lsl" instruction, - 14x speedup on x86 compared to inlined "lsl" instruction, - Unlike vdso approaches, this cpu_id value can be read from an inline assembly, which makes it a useful building block for restartable sequences. - The approach of reading the cpu id through memory mapping shared between kernel and user-space is portable (e.g. ARM), which is not the case for the lsl-based x86 vdso. On x86, yet another possible approach would be to use the gs segment selector to point to user-space per-cpu data. This approach performs similarly to the cpu id cache, but it has two disadvantages: it is not portable, and it is incompatible with existing applications already using the gs segment selector for other purposes. 
Benchmarking various approaches for reading the current CPU number: ARMv7 Processor rev 4 (v7l) Machine model: Cubietruck - Baseline (empty loop): 8.4 ns - Read CPU from rseq cpu_id: 16.7 ns - Read CPU from rseq cpu_id (lazy register): 19.8 ns - glibc 2.19-0ubuntu6.6 getcpu: 301.8 ns - getcpu system call: 234.9 ns x86-64 Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz: - Baseline (empty loop): 0.8 ns - Read CPU from rseq cpu_id: 0.8 ns - Read CPU from rseq cpu_id (lazy register): 0.8 ns - Read using gs segment selector: 0.8 ns - "lsl" inline assembly: 13.0 ns - glibc 2.19-0ubuntu6 getcpu: 16.6 ns - getcpu system call: 53.9 ns - Speed Running 10 runs of hackbench -l 100000 seems to indicate, contrary to expectations, that enabling CONFIG_RSEQ slightly accelerates the scheduler: Configuration: 2 sockets * 8-core Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz (directly on hardware, hyperthreading disabled in BIOS, energy saving disabled in BIOS, turboboost disabled in BIOS, cpuidle.off=1 kernel parameter), with a Linux v4.6 defconfig+localyesconfig, restartable sequences series applied. * CONFIG_RSEQ=n avg.: 41.37 s std.dev.: 0.36 s * CONFIG_RSEQ=y avg.: 40.46 s std.dev.: 0.33 s - Size On x86-64, between CONFIG_RSEQ=n/y, the text size increase of vmlinux is 2855 bytes, and the data size increase of vmlinux is 1024 bytes. * CONFIG_RSEQ=n text data bss dec hex filename 9964559 4256280 962560 15183399 e7ae27 vmlinux.norseq * CONFIG_RSEQ=y text data bss dec hex filename 9967414 4257304 962560 15187278 e7bd4e vmlinux.rseq [1] https://lwn.net/Articles/650333/ [2] http://www.linuxplumbersconf.org/2013/ocw/system/presentations/1695/original/LPC%20-%20PerCpu%20Atomics.pdf Link: http://lkml.kernel.org/r/20151027235635.16059.11630.stgit@pjt-glaptop.roam.corp.google.com Link: http://lkml.kernel.org/r/20150624222609.6116.86035.stgit@kitami.mtv.corp.google.com Signed-off-by: Mathieu Desnoyers CC: Thomas Gleixner CC: Paul Turner CC: Andrew Hunter CC: Peter Zijlstra CC: Andy Lutomirski CC: Andi Kleen CC: Dave Watson CC: Chris Lameter CC: Ingo Molnar CC: "H. Peter Anvin" CC: Ben Maurer CC: Steven Rostedt CC: "Paul E. McKenney" CC: Josh Triplett CC: Linus Torvalds CC: Andrew Morton CC: Russell King CC: Catalin Marinas CC: Will Deacon CC: Michael Kerrisk CC: Boqun Feng CC: linux-api@vger.kernel.org --- Changes since v1: - Return -1, errno=EINVAL if cpu_cache pointer is not aligned on sizeof(int32_t). - Update man page to describe the pointer alignement requirements and update atomicity guarantees. - Add MAINTAINERS file GETCPU_CACHE entry. - Remove dynamic memory allocation: go back to having a single getcpu_cache entry per thread. Update documentation accordingly. - Rebased on Linux 4.4. Changes since v2: - Introduce a "cmd" argument, along with an enum with GETCPU_CACHE_GET and GETCPU_CACHE_SET. Introduce a uapi header linux/getcpu_cache.h defining this enumeration. - Split resume notifier architecture implementation from the system call wire up in the following arch-specific patches. - Man pages updates. - Handle 32-bit compat pointers. - Simplify handling of getcpu_cache GETCPU_CACHE_SET compiler barrier: set the current cpu cache pointer before doing the cache update, and set it back to NULL if the update fails. Setting it back to NULL on error ensures that no resume notifier will trigger a SIGSEGV if a migration happened concurrently. Changes since v3: - Fix __user annotations in compat code, - Update memory ordering comments. - Rebased on kernel v4.5-rc5. 
Changes since v4: - Inline getcpu_cache_fork, getcpu_cache_execve, and getcpu_cache_exit. - Add new line between if() and switch() to improve readability. - Added sched switch benchmarks (hackbench) and size overhead comparison to change log. Changes since v5: - Rename "getcpu_cache" to "thread_local_abi", allowing to extend this system call to cover future features such as restartable critical sections. Generalizing this system call ensures that we can add features similar to the cpu_id field within the same cache-line without having to track one pointer per feature within the task struct. - Add a tlabi_nr parameter to the system call, thus allowing to extend the ABI beyond the initial 64-byte structure by registering structures with tlabi_nr greater than 0. The initial ABI structure is associated with tlabi_nr 0. - Rebased on kernel v4.5. Changes since v6: - Integrate "restartable sequences" v2 patchset from Paul Turner. - Add handling of single-stepping purely in user-space, with a fallback to locking after 2 rseq failures to ensure progress, and by exposing a __rseq_table section to debuggers so they know where to put breakpoints when dealing with rseq assembly blocks which can be aborted at any point. - make the code and ABI generic: porting the kernel implementation simply requires to wire up the signal handler and return to user-space hooks, and allocate the syscall number. - extend testing with a fully configurable test program. See param_spinlock_test -h for details. - handling of rseq ENOSYS in user-space, also with a fallback to locking. - modify Paul Turner's rseq ABI to only require a single TLS store on the user-space fast-path, removing the need to populate two additional registers. This is made possible by introducing struct rseq_cs into the ABI to describe a critical section start_ip, post_commit_ip, and abort_ip. - Rebased on kernel v4.7-rc7. Man page associated: RSEQ(2) Linux Programmer's Manual RSEQ(2) NAME rseq - Restartable sequences and cpu number cache SYNOPSIS #include int rseq(struct rseq * rseq, int flags); DESCRIPTION The rseq() ABI accelerates user-space operations on per-cpu data by defining a shared data structure ABI between each user- space thread and the kernel. The rseq argument is a pointer to the thread-local rseq struc‐ ture to be shared between kernel and user-space. A NULL rseq value can be used to check whether rseq is registered for the current thread. The layout of struct rseq is as follows: Structure alignment This structure needs to be aligned on multiples of 64 bytes. Structure size This structure has a fixed size of 128 bytes. Fields cpu_id Cache of the CPU number on which the calling thread is running. event_counter Restartable sequences event_counter field. rseq_cs Restartable sequences rseq_cs field. Points to a struct rseq_cs. The layout of struct rseq_cs is as follows: Structure alignment This structure needs to be aligned on multiples of 64 bytes. Structure size This structure has a fixed size of 192 bytes. Fields start_ip Instruction pointer address of the first instruction of the sequence of consecutive assembly instructions. post_commit_ip Instruction pointer address after the last instruction of the sequence of consecutive assembly instructions. abort_ip Instruction pointer address where to move the execution flow in case of abort of the sequence of consecutive assembly instructions. The flags argument is currently unused and must be specified as 0. 
Typically, a library or application will keep the rseq struc‐ ture in a thread-local storage variable, or other memory areas belonging to each thread. It is recommended to perform volatile reads of the thread-local cache to prevent the compiler from doing load tearing. An alternative approach is to read each field from inline assembly. Each thread is responsible for registering its rseq structure. Only one rseq structure address can be registered per thread. Once set, the rseq address is idempotent for a given thread. In a typical usage scenario, the thread registering the rseq structure will be performing loads and stores from/to that structure. It is however also allowed to read that structure from other threads. The rseq field updates performed by the kernel provide single-copy atomicity semantics, which guarantee that other threads performing single-copy atomic reads of the cpu number cache will always observe a consistent value. Memory registered as rseq structure should never be deallocated before the thread which registered it exits: specifically, it should not be freed, and the library containing the registered thread-local storage should not be dlclose'd. Violating this constraint may cause a SIGSEGV signal to be delivered to the thread. Unregistration of associated rseq structure is implicitly per‐ formed when a thread or process exit. RETURN VALUE A return value of 0 indicates success. On error, -1 is returned, and errno is set appropriately. ERRORS EINVAL Either flags is non-zero, or rseq contains an address which is not appropriately aligned. ENOSYS The rseq() system call is not implemented by this ker‐ nel. EFAULT rseq is an invalid address. EBUSY The rseq argument contains a non-NULL address which dif‐ fers from the memory location already registered for this thread. ENOENT The rseq argument is NULL, but no memory location is currently registered for this thread. VERSIONS The rseq() system call was added in Linux 4.X (TODO). CONFORMING TO rseq() is Linux-specific. EXAMPLE The following code uses the rseq() system call to keep a thread-local storage variable up to date with the current CPU number, with a fallback on sched_getcpu(3) if the cache is not available. For example simplicity, it is done in main(), but multithreaded programs would need to invoke rseq() from each program thread. #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include static __thread volatile struct rseq rseq_state = { .u.e.cpu_id = -1, }; static int sys_rseq(volatile struct rseq *rseq_abi, int flags) { return syscall(__NR_rseq, rseq_abi, flags); } static int32_t rseq_current_cpu_raw(void) { return rseq_state.u.e.cpu_id; } static int32_t rseq_current_cpu(void) { int32_t cpu; cpu = rseq_current_cpu_raw(); if (cpu < 0) cpu = sched_getcpu(); return cpu; } static int rseq_init_current_thread(void) { int rc; rc = sys_rseq(&rseq_state, 0); if (rc) { fprintf(stderr, "Error: sys_rseq(...) 
failed(%d): %s\n", errno, strerror(errno)); return -1; } return 0; } int main(int argc, char **argv) { if (rseq_init_current_thread()) { fprintf(stderr, "Unable to initialize restartable sequences.\n"); fprintf(stderr, "Using sched_getcpu() as fallback.\n"); } printf("Current CPU number: %d\n", rseq_current_cpu()); exit(EXIT_SUCCESS); } SEE ALSO sched_getcpu(3) Linux 2016-07-19 RSEQ(2) --- MAINTAINERS | 7 ++ arch/Kconfig | 7 ++ fs/exec.c | 1 + include/linux/sched.h | 68 +++++++++++ include/uapi/linux/Kbuild | 1 + include/uapi/linux/rseq.h | 85 ++++++++++++++ init/Kconfig | 13 +++ kernel/Makefile | 1 + kernel/fork.c | 2 + kernel/rseq.c | 231 ++++++++++++++++++++++++++++++++++++++ kernel/sched/core.c | 1 + kernel/sys_ni.c | 3 + 12 files changed, 420 insertions(+) create mode 100644 include/uapi/linux/rseq.h create mode 100644 kernel/rseq.c diff --git a/MAINTAINERS b/MAINTAINERS index 1209323b7e43..daef027e4e13 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5085,6 +5085,13 @@ M: Joe Perches S: Maintained F: scripts/get_maintainer.pl +RESTARTABLE SEQUENCES SUPPORT +M: Mathieu Desnoyers +L: linux-kernel@vger.kernel.org +S: Supported +F: kernel/rseq.c +F: include/uapi/linux/rseq.h + GFS2 FILE SYSTEM M: Steven Whitehouse M: Bob Peterson diff --git a/arch/Kconfig b/arch/Kconfig index 15996290fed4..2c23e26c5f4f 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -242,6 +242,13 @@ config HAVE_REGS_AND_STACK_ACCESS_API declared in asm/ptrace.h For example the kprobes-based event tracer needs this API. +config HAVE_RSEQ + bool + depends on HAVE_REGS_AND_STACK_ACCESS_API + help + This symbol should be selected by an architecture if it + supports an implementation of restartable sequences. + config HAVE_CLK bool help diff --git a/fs/exec.c b/fs/exec.c index 887c1c955df8..e912d8713b23 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1707,6 +1707,7 @@ static int do_execveat_common(int fd, struct filename *filename, /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; + rseq_execve(current); acct_update_integrals(current); task_numa_free(current); free_bprm(bprm); diff --git a/include/linux/sched.h b/include/linux/sched.h index 253538f29ade..5c4b90076715 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -59,6 +59,7 @@ struct sched_param { #include #include #include +#include #include @@ -1918,6 +1919,10 @@ struct task_struct { #ifdef CONFIG_MMU struct task_struct *oom_reaper_list; #endif +#ifdef CONFIG_RSEQ + struct rseq __user *rseq; + uint32_t rseq_event_counter; +#endif /* CPU-specific state of this task */ struct thread_struct thread; /* @@ -3387,4 +3392,67 @@ void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data, void cpufreq_remove_update_util_hook(int cpu); #endif /* CONFIG_CPU_FREQ */ +#ifdef CONFIG_RSEQ +static inline void rseq_set_notify_resume(struct task_struct *t) +{ + if (t->rseq) + set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); +} +void __rseq_handle_notify_resume(struct pt_regs *regs); +static inline void rseq_handle_notify_resume(struct pt_regs *regs) +{ + if (current->rseq) + __rseq_handle_notify_resume(regs); +} +/* + * If parent process has a registered restartable sequences area, the + * child inherits. Only applies when forking a process, not a thread. In + * case a parent fork() in the middle of a restartable sequence, set the + * resume notifier to force the child to retry. 
+ */ +static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) +{ + if (clone_flags & CLONE_THREAD) { + t->rseq = NULL; + t->rseq_event_counter = 0; + } else { + t->rseq = current->rseq; + t->rseq_event_counter = current->rseq_event_counter; + rseq_set_notify_resume(t); + } +} +static inline void rseq_execve(struct task_struct *t) +{ + t->rseq = NULL; + t->rseq_event_counter = 0; +} +static inline void rseq_sched_out(struct task_struct *t) +{ + rseq_set_notify_resume(t); +} +static inline void rseq_signal_deliver(struct pt_regs *regs) +{ + rseq_handle_notify_resume(regs); +} +#else +static inline void rseq_set_notify_resume(struct task_struct *t) +{ +} +static inline void rseq_handle_notify_resume(struct pt_regs *regs) +{ +} +static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) +{ +} +static inline void rseq_execve(struct task_struct *t) +{ +} +static inline void rseq_sched_out(struct task_struct *t) +{ +} +static inline void rseq_signal_deliver(struct pt_regs *regs) +{ +} +#endif + #endif diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 8bdae34d1f9a..2e64fb826193 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -403,6 +403,7 @@ header-y += tcp_metrics.h header-y += telephony.h header-y += termios.h header-y += thermal.h +header-y += rseq.h header-y += time.h header-y += times.h header-y += timex.h diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h new file mode 100644 index 000000000000..3e79fa9482b2 --- /dev/null +++ b/include/uapi/linux/rseq.h @@ -0,0 +1,85 @@ +#ifndef _UAPI_LINUX_RSEQ_H +#define _UAPI_LINUX_RSEQ_H + +/* + * linux/rseq.h + * + * Restartable sequences system call API + * + * Copyright (c) 2015-2016 Mathieu Desnoyers + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifdef __KERNEL__ +# include +#else /* #ifdef __KERNEL__ */ +# include +#endif /* #else #ifdef __KERNEL__ */ + +#include + +#ifdef __LP64__ +# define RSEQ_FIELD_u32_u64(field) uint64_t field +#elif defined(__BYTE_ORDER) ? 
\ + __BYTE_ORDER == __BIG_ENDIAN : defined(__BIG_ENDIAN) +# define RSEQ_FIELD_u32_u64(field) uint32_t _padding ## field, field +#else +# define RSEQ_FIELD_u32_u64(field) uint32_t field, _padding ## field +#endif + +struct rseq_cs { + RSEQ_FIELD_u32_u64(start_ip); + RSEQ_FIELD_u32_u64(post_commit_ip); + RSEQ_FIELD_u32_u64(abort_ip); +} __attribute__((aligned(sizeof(uint64_t)))); + +struct rseq { + union { + struct { + /* + * Restartable sequences cpu_id field. + * Updated by the kernel, and read by user-space with + * single-copy atomicity semantics. Aligned on 32-bit. + * Negative values are reserved for user-space. + */ + int32_t cpu_id; + /* + * Restartable sequences event_counter field. + * Updated by the kernel, and read by user-space with + * single-copy atomicity semantics. Aligned on 32-bit. + */ + uint32_t event_counter; + } e; + /* + * On architectures with 64-bit aligned reads, both cpu_id and + * event_counter can be read with single-copy atomicity + * semantics. + */ + uint64_t v; + } u; + /* + * Restartable sequences rseq_cs field. + * Updated by user-space, read by the kernel with + * single-copy atomicity semantics. Aligned on 64-bit. + */ + RSEQ_FIELD_u32_u64(rseq_cs); +} __attribute__((aligned(sizeof(uint64_t)))); + +#endif /* _UAPI_LINUX_RSEQ_H */ diff --git a/init/Kconfig b/init/Kconfig index c02d89777713..545b7eda1b33 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1653,6 +1653,19 @@ config MEMBARRIER If unsure, say Y. +config RSEQ + bool "Enable rseq() system call" if EXPERT + default y + depends on HAVE_RSEQ + help + Enable the restartable sequences system call. It provides a + user-space cache for the current CPU number value, which + speeds up getting the current CPU number from user-space, + as well as an ABI to speed up user-space operations on + per-CPU data. + + If unsure, say Y. + config EMBEDDED bool "Embedded system" option allnoconfig_y diff --git a/kernel/Makefile b/kernel/Makefile index e2ec54e2b952..4c6d8b5ad2bf 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -112,6 +112,7 @@ obj-$(CONFIG_TORTURE_TEST) += torture.o obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_HAS_IOMEM) += memremap.o +obj-$(CONFIG_RSEQ) += rseq.o $(obj)/configs.o: $(obj)/config_data.h diff --git a/kernel/fork.c b/kernel/fork.c index 4a7ec0c6c88c..cc7756b25b0a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1591,6 +1591,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, */ copy_seccomp(p); + rseq_fork(p, clone_flags); + /* * Process group and session signals need to be delivered to just the * parent before the fork or both the parent and the child after the diff --git a/kernel/rseq.c b/kernel/rseq.c new file mode 100644 index 000000000000..e1c847bf9910 --- /dev/null +++ b/kernel/rseq.c @@ -0,0 +1,231 @@ +/* + * Restartable sequences system call + * + * Restartable sequences are a lightweight interface that allows + * user-level code to be executed atomically relative to scheduler + * preemption and signal delivery. Typically used for implementing + * per-cpu operations. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * Copyright (C) 2015, Google, Inc., + * Paul Turner and Andrew Hunter + * Copyright (C) 2015-2016, EfficiOS Inc., + * Mathieu Desnoyers + */ + +#include +#include +#include +#include +#include +#include + +/* + * Each restartable sequence assembly block defines a "struct rseq_cs" + * structure which describes the post_commit_ip address, and the + * abort_ip address where the kernel should move the thread instruction + * pointer if a rseq critical section assembly block is preempted or if + * a signal is delivered on top of a rseq critical section assembly + * block. It also contains a start_ip, which is the address of the start + * of the rseq assembly block, which is useful to debuggers. + * + * The algorithm for a restartable sequence assembly block is as + * follows: + * + * rseq_start() + * + * 0. Userspace loads the current event counter value from the + * event_counter field of the registered struct rseq TLS area, + * + * rseq_finish() + * + * Steps [1]-[3] (inclusive) need to be a sequence of instructions in + * userspace that can handle being moved to the abort_ip between any + * of those instructions. + * + * The abort_ip address needs to be equal or above the post_commit_ip. + * Step [4] and the failure code step [F1] need to be at addresses + * equal or above the post_commit_ip. + * + * 1. Userspace stores the address of the struct rseq cs rseq + * assembly block descriptor into the rseq_cs field of the + * registered struct rseq TLS area. + * + * 2. Userspace tests to see whether the current event counter values + * match those loaded at [0]. Manually jumping to [F1] in case of + * a mismatch. + * + * Note that if we are preempted or interrupted by a signal + * after [1] and before post_commit_ip, then the kernel also + * performs the comparison performed in [2], and conditionally + * clears rseq_cs, then jumps us to abort_ip. + * + * 3. Userspace critical section final instruction before + * post_commit_ip is the commit. The critical section is + * self-terminating. + * [post_commit_ip] + * + * 4. Userspace clears the rseq_cs field of the struct rseq + * TLS area. + * + * 5. Return true. + * + * On failure at [2]: + * + * F1. Userspace clears the rseq_cs field of the struct rseq + * TLS area. Followed by step [F2]. + * + * [abort_ip] + * F2. Return false. 
+ */ + +static int rseq_increment_event_counter(struct task_struct *t) +{ + if (__put_user(++t->rseq_event_counter, + &t->rseq->u.e.event_counter)) + return -1; + return 0; +} + +static int rseq_get_rseq_cs(struct task_struct *t, + void __user **post_commit_ip, + void __user **abort_ip) +{ + unsigned long ptr; + struct rseq_cs __user *rseq_cs; + + if (__get_user(ptr, &t->rseq->rseq_cs)) + return -1; + if (!ptr) + return 0; +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) { + rseq_cs = compat_ptr((compat_uptr_t)ptr); + if (get_user(ptr, &rseq_cs->post_commit_ip)) + return -1; + *post_commit_ip = compat_ptr((compat_uptr_t)ptr); + if (get_user(ptr, &rseq_cs->abort_ip)) + return -1; + *abort_ip = compat_ptr((compat_uptr_t)ptr); + return 0; + } +#endif + rseq_cs = (struct rseq_cs __user *)ptr; + if (get_user(ptr, &rseq_cs->post_commit_ip)) + return -1; + *post_commit_ip = (void __user *)ptr; + if (get_user(ptr, &rseq_cs->abort_ip)) + return -1; + *abort_ip = (void __user *)ptr; + return 0; +} + +static int rseq_ip_fixup(struct pt_regs *regs) +{ + struct task_struct *t = current; + void __user *post_commit_ip = NULL; + void __user *abort_ip = NULL; + + if (rseq_get_rseq_cs(t, &post_commit_ip, &abort_ip)) + return -1; + + /* Handle potentially being within a critical section. */ + if ((void __user *)instruction_pointer(regs) < post_commit_ip) { + /* + * We need to clear rseq_cs upon entry into a signal + * handler nested on top of a rseq assembly block, so + * the signal handler will not be fixed up if itself + * interrupted by a nested signal handler or preempted. + */ + if (clear_user(&t->rseq->rseq_cs, + sizeof(t->rseq->rseq_cs))) + return -1; + + /* + * We set this after potentially failing in + * clear_user so that the signal arrives at the + * faulting rip. + */ + instruction_pointer_set(regs, (unsigned long)abort_ip); + } + return 0; +} + +/* + * This resume handler should always be executed between any of: + * - preemption, + * - signal delivery, + * and return to user-space. + */ +void __rseq_handle_notify_resume(struct pt_regs *regs) +{ + struct task_struct *t = current; + + if (unlikely(t->flags & PF_EXITING)) + return; + if (!access_ok(VERIFY_WRITE, t->rseq, sizeof(*t->rseq))) + goto error; + if (__put_user(raw_smp_processor_id(), &t->rseq->u.e.cpu_id)) + goto error; + if (rseq_increment_event_counter(t)) + goto error; + if (rseq_ip_fixup(regs)) + goto error; + return; + +error: + force_sig(SIGSEGV, t); +} + +/* + * sys_rseq - setup restartable sequences for caller thread. + */ +SYSCALL_DEFINE2(rseq, struct rseq __user *, rseq, int, flags) +{ + if (unlikely(flags)) + return -EINVAL; + if (!rseq) { + if (!current->rseq) + return -ENOENT; + return 0; + } + + if (current->rseq) { + /* + * If rseq is already registered, check whether + * the provided address differs from the prior + * one. + */ + if (current->rseq != rseq) + return -EBUSY; + } else { + /* + * If there was no rseq previously registered, + * we need to ensure the provided rseq is + * properly aligned and valid. + */ + if (!IS_ALIGNED((unsigned long)rseq, sizeof(uint64_t))) + return -EINVAL; + if (!access_ok(VERIFY_WRITE, rseq, sizeof(*rseq))) + return -EFAULT; + current->rseq = rseq; + /* + * If rseq was previously inactive, and has just + * been registered, ensure the cpu_id and + * event_counter fields are updated before + * returning to user-space. 
+ */ + rseq_set_notify_resume(current); + } + + return 0; +} diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 51d7105f529a..fbef0c3ab04a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2664,6 +2664,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, { sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); + rseq_sched_out(prev); fire_sched_out_preempt_notifiers(prev, next); prepare_lock_switch(rq, next); prepare_arch_switch(next); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 2c5e3a8e00d7..c653f78df402 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -250,3 +250,6 @@ cond_syscall(sys_execveat); /* membarrier */ cond_syscall(sys_membarrier); + +/* restartable sequence */ +cond_syscall(sys_rseq); -- 2.34.1 From a881b254fb838d6f7dea4ace01ee12c941c6a4f2 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 6 Jun 2016 09:37:36 -0400 Subject: [PATCH 11/16] tracing: instrument restartable sequences Signed-off-by: Mathieu Desnoyers CC: Thomas Gleixner CC: Paul Turner CC: Andrew Hunter CC: Peter Zijlstra CC: Andy Lutomirski CC: Andi Kleen CC: Dave Watson CC: Chris Lameter CC: Ingo Molnar CC: "H. Peter Anvin" CC: Ben Maurer CC: Steven Rostedt CC: "Paul E. McKenney" CC: Josh Triplett CC: Linus Torvalds CC: Andrew Morton CC: Russell King CC: Catalin Marinas CC: Will Deacon CC: Michael Kerrisk CC: Boqun Feng CC: linux-api@vger.kernel.org --- include/trace/events/rseq.h | 60 +++++++++++++++++++++++++++++++++++++ kernel/rseq.c | 18 +++++++++-- 2 files changed, 75 insertions(+), 3 deletions(-) create mode 100644 include/trace/events/rseq.h diff --git a/include/trace/events/rseq.h b/include/trace/events/rseq.h new file mode 100644 index 000000000000..83fd31e09b2f --- /dev/null +++ b/include/trace/events/rseq.h @@ -0,0 +1,60 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rseq + +#if !defined(_TRACE_RSEQ_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_RSEQ_H + +#include + +TRACE_EVENT(rseq_inc, + + TP_PROTO(uint32_t event_counter, int ret), + + TP_ARGS(event_counter, ret), + + TP_STRUCT__entry( + __field(uint32_t, event_counter) + __field(int, ret) + ), + + TP_fast_assign( + __entry->event_counter = event_counter; + __entry->ret = ret; + ), + + TP_printk("event_counter=%u ret=%d", + __entry->event_counter, __entry->ret) +); + +TRACE_EVENT(rseq_ip_fixup, + + TP_PROTO(void __user *regs_ip, void __user *post_commit_ip, + void __user *abort_ip, uint32_t kevcount, int ret), + + TP_ARGS(regs_ip, post_commit_ip, abort_ip, kevcount, ret), + + TP_STRUCT__entry( + __field(void __user *, regs_ip) + __field(void __user *, post_commit_ip) + __field(void __user *, abort_ip) + __field(uint32_t, kevcount) + __field(int, ret) + ), + + TP_fast_assign( + __entry->regs_ip = regs_ip; + __entry->post_commit_ip = post_commit_ip; + __entry->abort_ip = abort_ip; + __entry->kevcount = kevcount; + __entry->ret = ret; + ), + + TP_printk("regs_ip=%p post_commit_ip=%p abort_ip=%p kevcount=%u ret=%d", + __entry->regs_ip, __entry->post_commit_ip, __entry->abort_ip, + __entry->kevcount, __entry->ret) +); + +#endif /* _TRACE_SOCK_H */ + +/* This part must be outside protection */ +#include diff --git a/kernel/rseq.c b/kernel/rseq.c index e1c847bf9910..cab326a85846 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -29,6 +29,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include + /* * Each restartable sequence assembly block defines a "struct rseq_cs" * structure which describes the post_commit_ip address, and the @@ -90,8 +93,12 @@ static int 
rseq_increment_event_counter(struct task_struct *t) { - if (__put_user(++t->rseq_event_counter, - &t->rseq->u.e.event_counter)) + int ret; + + ret = __put_user(++t->rseq_event_counter, + &t->rseq->u.e.event_counter); + trace_rseq_inc(t->rseq_event_counter, ret); + if (ret) return -1; return 0; } @@ -134,8 +141,13 @@ static int rseq_ip_fixup(struct pt_regs *regs) struct task_struct *t = current; void __user *post_commit_ip = NULL; void __user *abort_ip = NULL; + int ret; - if (rseq_get_rseq_cs(t, &post_commit_ip, &abort_ip)) + ret = rseq_get_rseq_cs(t, &post_commit_ip, &abort_ip); + trace_rseq_ip_fixup((void __user *)instruction_pointer(regs), + post_commit_ip, abort_ip, t->rseq_event_counter, + ret); + if (ret) return -1; /* Handle potentially being within a critical section. */ -- 2.34.1 From b1b1d89ff6ff8c9dceabb755d054c7b70f1157da Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 27 Jan 2016 17:09:24 -0500 Subject: [PATCH 12/16] Restartable sequences: ARM 32 architecture support Call the rseq_handle_notify_resume() function on return to userspace if TIF_NOTIFY_RESUME thread flag is set. Increment the event counter and perform fixup on the pre-signal frame when a signal is delivered on top of a restartable sequence critical section. Signed-off-by: Mathieu Desnoyers CC: Russell King CC: Catalin Marinas CC: Will Deacon CC: Thomas Gleixner CC: Paul Turner CC: Andrew Hunter CC: Peter Zijlstra CC: Andy Lutomirski CC: Andi Kleen CC: Dave Watson CC: Chris Lameter CC: Ingo Molnar CC: Ben Maurer CC: Steven Rostedt CC: "Paul E. McKenney" CC: Josh Triplett CC: Linus Torvalds CC: Andrew Morton CC: Boqun Feng CC: linux-api@vger.kernel.org --- arch/arm/Kconfig | 1 + arch/arm/kernel/signal.c | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 90542db1220d..636e14b513bf 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -75,6 +75,7 @@ config ARM select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE if (SMP && ARM_LPAE) select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_RSEQ select HAVE_SYSCALL_TRACEPOINTS select HAVE_UID16 select HAVE_VIRT_CPU_ACCOUNTING_GEN diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 7b8f2141427b..907da02a6333 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -474,6 +474,12 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) sigset_t *oldset = sigmask_to_save(); int ret; + /* + * Increment event counter and perform fixup for the pre-signal + * frame. + */ + rseq_signal_deliver(regs); + /* * Set up the stack frame */ @@ -594,6 +600,7 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall) } else { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); + rseq_handle_notify_resume(regs); } } local_irq_disable(); -- 2.34.1 From 043def5bd3a942710b2a29bd143edb201cb79b35 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 27 Jan 2016 17:10:44 -0500 Subject: [PATCH 13/16] Restartable sequences: wire up ARM 32 system call Wire up the rseq system call on 32-bit ARM. This provides an ABI improving the speed of a user-space getcpu operation on ARM by skipping the getcpu system call on the fast path, as well as improving the speed of user-space operations on per-cpu data compared to using load-linked/store-conditional. 
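With the syscall table entry in place, registration from user space needs nothing beyond syscall(2) (an illustrative sketch following the man page example above; __NR_rseq comes from the uapi header updated by this patch and struct rseq from the exported linux/rseq.h):

        #define _GNU_SOURCE
        #include <unistd.h>
        #include <sys/syscall.h>
        #include <linux/rseq.h>

        static __thread volatile struct rseq rseq_area;

        static int register_rseq(void)
        {
                /* flags must currently be 0; returns 0 on success, -1 with errno set. */
                return syscall(__NR_rseq, &rseq_area, 0);
        }
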
Signed-off-by: Mathieu Desnoyers CC: Russell King CC: Catalin Marinas CC: Will Deacon CC: Thomas Gleixner CC: Paul Turner CC: Andrew Hunter CC: Peter Zijlstra CC: Andy Lutomirski CC: Andi Kleen CC: Dave Watson CC: Chris Lameter CC: Ingo Molnar CC: Ben Maurer CC: Steven Rostedt CC: "Paul E. McKenney" CC: Josh Triplett CC: Linus Torvalds CC: Andrew Morton CC: Boqun Feng CC: linux-api@vger.kernel.org --- arch/arm/include/uapi/asm/unistd.h | 1 + arch/arm/kernel/calls.S | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/arm/include/uapi/asm/unistd.h b/arch/arm/include/uapi/asm/unistd.h index 2cb9dc770e1d..8f61c79fc7ce 100644 --- a/arch/arm/include/uapi/asm/unistd.h +++ b/arch/arm/include/uapi/asm/unistd.h @@ -420,6 +420,7 @@ #define __NR_copy_file_range (__NR_SYSCALL_BASE+391) #define __NR_preadv2 (__NR_SYSCALL_BASE+392) #define __NR_pwritev2 (__NR_SYSCALL_BASE+393) +#define __NR_rseq (__NR_SYSCALL_BASE+394) /* * The following SWIs are ARM private. diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S index 703fa0f3cd8f..0865c04376df 100644 --- a/arch/arm/kernel/calls.S +++ b/arch/arm/kernel/calls.S @@ -403,6 +403,7 @@ CALL(sys_copy_file_range) CALL(sys_preadv2) CALL(sys_pwritev2) + CALL(sys_rseq) #ifndef syscalls_counted .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls #define syscalls_counted -- 2.34.1 From b29fe5414af361e6017d20b695f30464ad021146 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 27 Jan 2016 17:12:07 -0500 Subject: [PATCH 14/16] Restartable sequences: x86 32/64 architecture support Call the rseq_handle_notify_resume() function on return to userspace if TIF_NOTIFY_RESUME thread flag is set. Increment the event counter and perform fixup on the pre-signal frame when a signal is delivered on top of a restartable sequence critical section. Signed-off-by: Mathieu Desnoyers CC: Russell King CC: Catalin Marinas CC: Will Deacon CC: Thomas Gleixner CC: Paul Turner CC: Andrew Hunter CC: Peter Zijlstra CC: Andy Lutomirski CC: Andi Kleen CC: Dave Watson CC: Chris Lameter CC: Ingo Molnar CC: "H. Peter Anvin" CC: Ben Maurer CC: Steven Rostedt CC: "Paul E. 
McKenney" CC: Josh Triplett CC: Linus Torvalds CC: Andrew Morton CC: Boqun Feng CC: linux-api@vger.kernel.org --- arch/x86/Kconfig | 1 + arch/x86/entry/common.c | 1 + arch/x86/kernel/signal.c | 6 ++++++ 3 files changed, 8 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d9a94da0c29f..1db7b06fc010 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -140,6 +140,7 @@ config X86 select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_RSEQ select HAVE_SYSCALL_TRACEPOINTS select HAVE_UID16 if X86_32 || IA32_EMULATION select HAVE_UNSTABLE_SCHED_CLOCK diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index ec138e538c44..3877dbafba08 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -231,6 +231,7 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags) if (cached_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); + rseq_handle_notify_resume(regs); } if (cached_flags & _TIF_USER_RETURN_NOTIFY) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 22cc2f9f8aec..0f4da5a4bdf2 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -683,6 +683,12 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) sigset_t *set = sigmask_to_save(); compat_sigset_t *cset = (compat_sigset_t *) set; + /* + * Increment event counter and perform fixup for the pre-signal + * frame. + */ + rseq_signal_deliver(regs); + /* Set up the stack frame */ if (is_ia32_frame()) { if (ksig->ka.sa.sa_flags & SA_SIGINFO) -- 2.34.1 From c2b758348b125e7e5be2035febe7fe58b25e06f7 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 27 Jan 2016 17:13:45 -0500 Subject: [PATCH 15/16] Restartable sequences: wire up x86 32/64 system call Wire up the rseq system call on x86 32/64. This provides an ABI improving the speed of a user-space getcpu operation on x86 by removing the need to perform a function call, "lsl" instruction, or system call on the fast path, as well as improving the speed of user-space operations on per-cpu data. Signed-off-by: Mathieu Desnoyers CC: Russell King CC: Catalin Marinas CC: Will Deacon CC: Thomas Gleixner CC: Paul Turner CC: Andrew Hunter CC: Peter Zijlstra CC: Andy Lutomirski CC: Andi Kleen CC: Dave Watson CC: Chris Lameter CC: Ingo Molnar CC: "H. Peter Anvin" CC: Ben Maurer CC: Steven Rostedt CC: "Paul E. 
McKenney" CC: Josh Triplett CC: Linus Torvalds CC: Andrew Morton CC: Boqun Feng CC: linux-api@vger.kernel.org --- arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 4cddd17153fb..15fb98c4b4b0 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -386,3 +386,4 @@ 377 i386 copy_file_range sys_copy_file_range 378 i386 preadv2 sys_preadv2 compat_sys_preadv2 379 i386 pwritev2 sys_pwritev2 compat_sys_pwritev2 +380 i386 rseq sys_rseq diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 555263e385c9..c7f3c7e98ad8 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -335,6 +335,7 @@ 326 common copy_file_range sys_copy_file_range 327 64 preadv2 sys_preadv2 328 64 pwritev2 sys_pwritev2 +329 common rseq sys_rseq # # x32-specific system call numbers start at 512 to avoid cache impact -- 2.34.1 From be4663d494bce0b60b26203695dc7bcc3dc1bc36 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 30 May 2016 09:46:53 +0200 Subject: [PATCH 16/16] Restartable sequences: self-tests Implements two basic tests of RSEQ functionality, and one more exhaustive parameterizable test. The first, "basic_test" only asserts that RSEQ works moderately correctly. E.g. that: - The CPUID pointer works - Code infinitely looping within a critical section will eventually be interrupted. - Critical sections are interrupted by signals. "basic_percpu_ops_test" is a slightly more "realistic" variant, implementing a few simple per-cpu operations and testing their correctness. "param_test" is a parametrizable restartable sequences test. See the "--help" output for usage. As part of those tests, a helper library "rseq" implements a user-space API around restartable sequences. It takes care of ensuring progress in case of debugger single-stepping with a fall-back to locking, and exposes the instruction pointer addresses where the rseq assembly blocks begin and end, as well as the associated abort instruction pointer, in the __rseq_table section. This section allows debuggers may know where to place breakpoints when single-stepping through assembly blocks which may be aborted at any point by the kernel. Signed-off-by: Mathieu Desnoyers CC: Russell King CC: Catalin Marinas CC: Will Deacon CC: Thomas Gleixner CC: Paul Turner CC: Andrew Hunter CC: Peter Zijlstra CC: Andy Lutomirski CC: Andi Kleen CC: Dave Watson CC: Chris Lameter CC: Ingo Molnar CC: "H. Peter Anvin" CC: Ben Maurer CC: Steven Rostedt CC: "Paul E. 
McKenney" CC: Josh Triplett CC: Linus Torvalds CC: Andrew Morton CC: Boqun Feng CC: linux-api@vger.kernel.org --- tools/testing/selftests/rseq/.gitignore | 3 + tools/testing/selftests/rseq/Makefile | 13 + .../selftests/rseq/basic_percpu_ops_test.c | 279 +++++++ tools/testing/selftests/rseq/basic_test.c | 106 +++ tools/testing/selftests/rseq/param_test.c | 707 ++++++++++++++++++ tools/testing/selftests/rseq/rseq.c | 200 +++++ tools/testing/selftests/rseq/rseq.h | 449 +++++++++++ 7 files changed, 1757 insertions(+) create mode 100644 tools/testing/selftests/rseq/.gitignore create mode 100644 tools/testing/selftests/rseq/Makefile create mode 100644 tools/testing/selftests/rseq/basic_percpu_ops_test.c create mode 100644 tools/testing/selftests/rseq/basic_test.c create mode 100644 tools/testing/selftests/rseq/param_test.c create mode 100644 tools/testing/selftests/rseq/rseq.c create mode 100644 tools/testing/selftests/rseq/rseq.h diff --git a/tools/testing/selftests/rseq/.gitignore b/tools/testing/selftests/rseq/.gitignore new file mode 100644 index 000000000000..2596e26bcf0a --- /dev/null +++ b/tools/testing/selftests/rseq/.gitignore @@ -0,0 +1,3 @@ +basic_percpu_ops_test +basic_test +param_test diff --git a/tools/testing/selftests/rseq/Makefile b/tools/testing/selftests/rseq/Makefile new file mode 100644 index 000000000000..3d1ad8eceb73 --- /dev/null +++ b/tools/testing/selftests/rseq/Makefile @@ -0,0 +1,13 @@ +CFLAGS += -O2 -Wall -g -I../../../../usr/include/ +LDFLAGS += -lpthread + +TESTS = basic_test basic_percpu_ops_test param_test + +all: $(TESTS) +%: %.c rseq.h rseq.c + $(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS) + +include ../lib.mk + +clean: + $(RM) $(TESTS) diff --git a/tools/testing/selftests/rseq/basic_percpu_ops_test.c b/tools/testing/selftests/rseq/basic_percpu_ops_test.c new file mode 100644 index 000000000000..4667dc50fc4c --- /dev/null +++ b/tools/testing/selftests/rseq/basic_percpu_ops_test.c @@ -0,0 +1,279 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include + +#include "rseq.h" + +static struct rseq_lock rseq_lock; + +struct percpu_lock_entry { + intptr_t v; +} __attribute__((aligned(128))); + +struct percpu_lock { + struct percpu_lock_entry c[CPU_SETSIZE]; +}; + +struct test_data_entry { + int count; +} __attribute__((aligned(128))); + +struct spinlock_test_data { + struct percpu_lock lock; + struct test_data_entry c[CPU_SETSIZE]; + int reps; +}; + +struct percpu_list_node { + intptr_t data; + struct percpu_list_node *next; +}; + +struct percpu_list_entry { + struct percpu_list_node *head; +} __attribute__((aligned(128))); + +struct percpu_list { + struct percpu_list_entry c[CPU_SETSIZE]; +}; + +/* A simple percpu spinlock. Returns the cpu lock was acquired on. */ +int rseq_percpu_lock(struct percpu_lock *lock) +{ + struct rseq_state rseq_state; + intptr_t *targetptr, newval; + int cpu; + bool result; + + for (;;) { + do_rseq(&rseq_lock, rseq_state, cpu, result, targetptr, newval, + { + if (unlikely(lock->c[cpu].v)) { + result = false; + } else { + newval = 1; + targetptr = (intptr_t *)&lock->c[cpu].v; + } + }); + if (likely(result)) + break; + } + /* + * Acquire semantic when taking lock after control dependency. + * Matches smp_store_release(). + */ + smp_acquire__after_ctrl_dep(); + return cpu; +} + +void rseq_percpu_unlock(struct percpu_lock *lock, int cpu) +{ + assert(lock->c[cpu].v == 1); + /* + * Release lock, with release semantic. Matches + * smp_acquire__after_ctrl_dep(). 
+ */ + smp_store_release(&lock->c[cpu].v, 0); +} + +void *test_percpu_spinlock_thread(void *arg) +{ + struct spinlock_test_data *data = arg; + int i, cpu; + + if (rseq_init_current_thread()) + abort(); + for (i = 0; i < data->reps; i++) { + cpu = rseq_percpu_lock(&data->lock); + data->c[cpu].count++; + rseq_percpu_unlock(&data->lock, cpu); + } + + return NULL; +} + +/* + * A simple test which implements a sharded counter using a per-cpu + * lock. Obviously real applications might prefer to simply use a + * per-cpu increment; however, this is reasonable for a test and the + * lock can be extended to synchronize more complicated operations. + */ +void test_percpu_spinlock(void) +{ + const int num_threads = 200; + int i, sum; + pthread_t test_threads[num_threads]; + struct spinlock_test_data data; + + memset(&data, 0, sizeof(data)); + data.reps = 5000; + + for (i = 0; i < num_threads; i++) + pthread_create(&test_threads[i], NULL, + test_percpu_spinlock_thread, &data); + + for (i = 0; i < num_threads; i++) + pthread_join(test_threads[i], NULL); + + sum = 0; + for (i = 0; i < CPU_SETSIZE; i++) + sum += data.c[i].count; + + assert(sum == data.reps * num_threads); +} + +int percpu_list_push(struct percpu_list *list, struct percpu_list_node *node) +{ + struct rseq_state rseq_state; + intptr_t *targetptr, newval; + int cpu; + bool result; + + do_rseq(&rseq_lock, rseq_state, cpu, result, targetptr, newval, + { + newval = (intptr_t)node; + targetptr = (intptr_t *)&list->c[cpu].head; + node->next = list->c[cpu].head; + }); + + return cpu; +} + +/* + * Unlike a traditional lock-less linked list; the availability of a + * rseq primitive allows us to implement pop without concerns over + * ABA-type races. + */ +struct percpu_list_node *percpu_list_pop(struct percpu_list *list) +{ + struct percpu_list_node *head, *next; + struct rseq_state rseq_state; + intptr_t *targetptr, newval; + int cpu; + bool result; + + do_rseq(&rseq_lock, rseq_state, cpu, result, targetptr, newval, + { + head = list->c[cpu].head; + if (!head) { + result = false; + } else { + next = head->next; + newval = (intptr_t) next; + targetptr = (intptr_t *)&list->c[cpu].head; + } + }); + + return head; +} + +void *test_percpu_list_thread(void *arg) +{ + int i; + struct percpu_list *list = (struct percpu_list *)arg; + + if (rseq_init_current_thread()) + abort(); + + for (i = 0; i < 100000; i++) { + struct percpu_list_node *node = percpu_list_pop(list); + + sched_yield(); /* encourage shuffling */ + if (node) + percpu_list_push(list, node); + } + + return NULL; +} + +/* Simultaneous modification to a per-cpu linked list from many threads. */ +void test_percpu_list(void) +{ + int i, j; + long sum = 0, expected_sum = 0; + struct percpu_list list; + pthread_t test_threads[200]; + cpu_set_t allowed_cpus; + + memset(&list, 0, sizeof(list)); + + /* Generate list entries for every usable cpu. 
*/ + sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus); + for (i = 0; i < CPU_SETSIZE; i++) { + if (!CPU_ISSET(i, &allowed_cpus)) + continue; + for (j = 1; j <= 100; j++) { + struct percpu_list_node *node; + + expected_sum += j; + + node = malloc(sizeof(*node)); + assert(node); + node->data = j; + node->next = list.c[i].head; + list.c[i].head = node; + } + } + + for (i = 0; i < 200; i++) + assert(pthread_create(&test_threads[i], NULL, + test_percpu_list_thread, &list) == 0); + + for (i = 0; i < 200; i++) + pthread_join(test_threads[i], NULL); + + for (i = 0; i < CPU_SETSIZE; i++) { + cpu_set_t pin_mask; + struct percpu_list_node *node; + + if (!CPU_ISSET(i, &allowed_cpus)) + continue; + + CPU_ZERO(&pin_mask); + CPU_SET(i, &pin_mask); + sched_setaffinity(0, sizeof(pin_mask), &pin_mask); + + while ((node = percpu_list_pop(&list))) { + sum += node->data; + free(node); + } + } + + /* + * All entries should now be accounted for (unless some external + * actor is interfering with our allowed affinity while this + * test is running). + */ + assert(sum == expected_sum); +} + +int main(int argc, char **argv) +{ + if (rseq_init_lock(&rseq_lock)) { + perror("rseq_init_lock"); + return -1; + } + if (rseq_init_current_thread()) + goto error; + printf("spinlock\n"); + test_percpu_spinlock(); + printf("percpu_list\n"); + test_percpu_list(); + + if (rseq_destroy_lock(&rseq_lock)) { + perror("rseq_destroy_lock"); + return -1; + } + return 0; + +error: + if (rseq_destroy_lock(&rseq_lock)) + perror("rseq_destroy_lock"); + return -1; +} + diff --git a/tools/testing/selftests/rseq/basic_test.c b/tools/testing/selftests/rseq/basic_test.c new file mode 100644 index 000000000000..e8fdcd6ed51c --- /dev/null +++ b/tools/testing/selftests/rseq/basic_test.c @@ -0,0 +1,106 @@ +/* + * Basic test coverage for critical regions and rseq_current_cpu(). + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include + +#include "rseq.h" + +volatile int signals_delivered; +volatile __thread struct rseq_state sigtest_start; +static struct rseq_lock rseq_lock; + +void test_cpu_pointer(void) +{ + cpu_set_t affinity, test_affinity; + int i; + + sched_getaffinity(0, sizeof(affinity), &affinity); + CPU_ZERO(&test_affinity); + for (i = 0; i < CPU_SETSIZE; i++) { + if (CPU_ISSET(i, &affinity)) { + CPU_SET(i, &test_affinity); + sched_setaffinity(0, sizeof(test_affinity), + &test_affinity); + assert(rseq_current_cpu() == sched_getcpu()); + assert(rseq_current_cpu() == i); + CPU_CLR(i, &test_affinity); + } + } + sched_setaffinity(0, sizeof(affinity), &affinity); +} + +/* + * This depends solely on some environmental event triggering a counter + * increase. + */ +void test_critical_section(void) +{ + struct rseq_state start; + uint32_t event_counter; + + start = rseq_start(&rseq_lock); + event_counter = start.event_counter; + do { + start = rseq_start(&rseq_lock); + } while (start.event_counter == event_counter); +} + +void test_signal_interrupt_handler(int signo) +{ + struct rseq_state current; + + current = rseq_start(&rseq_lock); + /* + * The potential critical section bordered by 'start' must be + * invalid. 
+ */ + assert(current.event_counter != sigtest_start.event_counter); + signals_delivered++; +} + +void test_signal_interrupts(void) +{ + struct itimerval it = { { 0, 1 }, { 0, 1 } }; + + setitimer(ITIMER_PROF, &it, NULL); + signal(SIGPROF, test_signal_interrupt_handler); + + do { + sigtest_start = rseq_start(&rseq_lock); + } while (signals_delivered < 10); + setitimer(ITIMER_PROF, NULL, NULL); +} + +int main(int argc, char **argv) +{ + if (rseq_init_lock(&rseq_lock)) { + perror("rseq_init_lock"); + return -1; + } + if (rseq_init_current_thread()) + goto init_thread_error; + printf("testing current cpu\n"); + test_cpu_pointer(); + printf("testing critical section\n"); + test_critical_section(); + printf("testing critical section is interrupted by signal\n"); + test_signal_interrupts(); + + if (rseq_destroy_lock(&rseq_lock)) { + perror("rseq_destroy_lock"); + return -1; + } + return 0; + +init_thread_error: + if (rseq_destroy_lock(&rseq_lock)) + perror("rseq_destroy_lock"); + return -1; +} diff --git a/tools/testing/selftests/rseq/param_test.c b/tools/testing/selftests/rseq/param_test.c new file mode 100644 index 000000000000..f95fba5a1b2a --- /dev/null +++ b/tools/testing/selftests/rseq/param_test.c @@ -0,0 +1,707 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static inline pid_t gettid(void) +{ + return syscall(__NR_gettid); +} + +#define NR_INJECT 9 +static int loop_cnt[NR_INJECT + 1]; + +static int opt_modulo; + +static int opt_yield, opt_signal, opt_sleep, opt_fallback_cnt = 3, + opt_disable_rseq, opt_threads = 200, + opt_reps = 5000, opt_disable_mod = 0, opt_test = 's'; + +static __thread unsigned int signals_delivered; + +static struct rseq_lock rseq_lock; + +#ifndef BENCHMARK + +static __thread unsigned int yield_mod_cnt, nr_retry; + +#define printf_nobench(fmt, ...) printf(fmt, ## __VA_ARGS__) + +#define RSEQ_INJECT_INPUT \ + , [loop_cnt_1]"m"(loop_cnt[1]) \ + , [loop_cnt_2]"m"(loop_cnt[2]) \ + , [loop_cnt_3]"m"(loop_cnt[3]) \ + , [loop_cnt_4]"m"(loop_cnt[4]) + +#if defined(__x86_64__) || defined(__i386__) + +#define INJECT_ASM_REG "eax" + +#define RSEQ_INJECT_CLOBBER \ + , INJECT_ASM_REG + +#define RSEQ_INJECT_ASM(n) \ + "mov %[loop_cnt_" #n "], %%" INJECT_ASM_REG "\n\t" \ + "test %%" INJECT_ASM_REG ",%%" INJECT_ASM_REG "\n\t" \ + "jz 333f\n\t" \ + "222:\n\t" \ + "dec %%" INJECT_ASM_REG "\n\t" \ + "jnz 222b\n\t" \ + "333:\n\t" + +#elif defined(__ARMEL__) + +#define INJECT_ASM_REG "r4" + +#define RSEQ_INJECT_CLOBBER \ + , INJECT_ASM_REG + +#define RSEQ_INJECT_ASM(n) \ + "ldr " INJECT_ASM_REG ", %[loop_cnt_" #n "]\n\t" \ + "cmp " INJECT_ASM_REG ", #0\n\t" \ + "beq 333f\n\t" \ + "222:\n\t" \ + "subs " INJECT_ASM_REG ", #1\n\t" \ + "bne 222b\n\t" \ + "333:\n\t" + +#else +#error unsupported target +#endif + +#define RSEQ_INJECT_FAILED \ + nr_retry++; + +#define RSEQ_INJECT_C(n) \ +{ \ + int loc_i, loc_nr_loops = loop_cnt[n]; \ + \ + for (loc_i = 0; loc_i < loc_nr_loops; loc_i++) { \ + barrier(); \ + } \ + if (loc_nr_loops == -1 && opt_modulo) { \ + if (yield_mod_cnt == opt_modulo - 1) { \ + if (opt_sleep > 0) \ + poll(NULL, 0, opt_sleep); \ + if (opt_yield) \ + sched_yield(); \ + if (opt_signal) \ + raise(SIGUSR1); \ + yield_mod_cnt = 0; \ + } else { \ + yield_mod_cnt++; \ + } \ + } \ +} + +#define RSEQ_FALLBACK_CNT \ + opt_fallback_cnt + +#else + +#define printf_nobench(fmt, ...) 
+ +#endif /* BENCHMARK */ + +#include "rseq.h" + +struct percpu_lock_entry { + intptr_t v; +} __attribute__((aligned(128))); + +struct percpu_lock { + struct percpu_lock_entry c[CPU_SETSIZE]; +}; + +struct test_data_entry { + int count; +} __attribute__((aligned(128))); + +struct spinlock_test_data { + struct percpu_lock lock; + struct test_data_entry c[CPU_SETSIZE]; +}; + +struct spinlock_thread_test_data { + struct spinlock_test_data *data; + int reps; + int reg; +}; + +struct inc_test_data { + struct test_data_entry c[CPU_SETSIZE]; +}; + +struct inc_thread_test_data { + struct inc_test_data *data; + int reps; + int reg; +}; + +struct percpu_list_node { + intptr_t data; + struct percpu_list_node *next; +}; + +struct percpu_list_entry { + struct percpu_list_node *head; +} __attribute__((aligned(128))); + +struct percpu_list { + struct percpu_list_entry c[CPU_SETSIZE]; +}; + +/* A simple percpu spinlock. Returns the cpu lock was acquired on. */ +static int rseq_percpu_lock(struct percpu_lock *lock) +{ + struct rseq_state rseq_state; + intptr_t *targetptr, newval; + int cpu; + bool result; + + for (;;) { + do_rseq(&rseq_lock, rseq_state, cpu, result, targetptr, newval, + { + if (unlikely(lock->c[cpu].v)) { + result = false; + } else { + newval = 1; + targetptr = (intptr_t *)&lock->c[cpu].v; + } + }); + if (likely(result)) + break; + } + /* + * Acquire semantic when taking lock after control dependency. + * Matches smp_store_release(). + */ + smp_acquire__after_ctrl_dep(); + return cpu; +} + +static void rseq_percpu_unlock(struct percpu_lock *lock, int cpu) +{ + assert(lock->c[cpu].v == 1); + /* + * Release lock, with release semantic. Matches + * smp_acquire__after_ctrl_dep(). + */ + smp_store_release(&lock->c[cpu].v, 0); +} + +void *test_percpu_spinlock_thread(void *arg) +{ + struct spinlock_thread_test_data *thread_data = arg; + struct spinlock_test_data *data = thread_data->data; + int i, cpu; + + if (!opt_disable_rseq && thread_data->reg + && rseq_init_current_thread()) + abort(); + for (i = 0; i < thread_data->reps; i++) { + cpu = rseq_percpu_lock(&data->lock); + data->c[cpu].count++; + rseq_percpu_unlock(&data->lock, cpu); +#ifndef BENCHMARK + if (i != 0 && !(i % (thread_data->reps / 10))) + printf("tid %d: count %d\n", (int) gettid(), i); +#endif + } + printf_nobench("tid %d: number of retry: %d, signals delivered: %u, nr_fallback %u, nr_fallback_wait %u\n", + (int) gettid(), nr_retry, signals_delivered, + __rseq_thread_state.fallback_cnt, + __rseq_thread_state.fallback_wait_cnt); + return NULL; +} + +/* + * A simple test which implements a sharded counter using a per-cpu + * lock. Obviously real applications might prefer to simply use a + * per-cpu increment; however, this is reasonable for a test and the + * lock can be extended to synchronize more complicated operations. 
+ */ +void test_percpu_spinlock(void) +{ + const int num_threads = opt_threads; + int i, sum, ret; + pthread_t test_threads[num_threads]; + struct spinlock_test_data data; + struct spinlock_thread_test_data thread_data[num_threads]; + + memset(&data, 0, sizeof(data)); + for (i = 0; i < num_threads; i++) { + thread_data[i].reps = opt_reps; + if (opt_disable_mod <= 0 || (i % opt_disable_mod)) + thread_data[i].reg = 1; + else + thread_data[i].reg = 0; + thread_data[i].data = &data; + ret = pthread_create(&test_threads[i], NULL, + test_percpu_spinlock_thread, &thread_data[i]); + if (ret) { + errno = ret; + perror("pthread_create"); + abort(); + } + } + + for (i = 0; i < num_threads; i++) { + pthread_join(test_threads[i], NULL); + if (ret) { + errno = ret; + perror("pthread_join"); + abort(); + } + } + + sum = 0; + for (i = 0; i < CPU_SETSIZE; i++) + sum += data.c[i].count; + + assert(sum == opt_reps * num_threads); +} + +void *test_percpu_inc_thread(void *arg) +{ + struct inc_thread_test_data *thread_data = arg; + struct inc_test_data *data = thread_data->data; + int i; + + if (!opt_disable_rseq && thread_data->reg + && rseq_init_current_thread()) + abort(); + for (i = 0; i < thread_data->reps; i++) { + struct rseq_state rseq_state; + intptr_t *targetptr, newval; + int cpu; + bool result; + + do_rseq(&rseq_lock, rseq_state, cpu, result, targetptr, newval, + { + newval = (intptr_t)data->c[cpu].count + 1; + targetptr = (intptr_t *)&data->c[cpu].count; + }); + +#ifndef BENCHMARK + if (i != 0 && !(i % (thread_data->reps / 10))) + printf("tid %d: count %d\n", (int) gettid(), i); +#endif + } + printf_nobench("tid %d: number of retry: %d, signals delivered: %u, nr_fallback %u, nr_fallback_wait %u\n", + (int) gettid(), nr_retry, signals_delivered, + __rseq_thread_state.fallback_cnt, + __rseq_thread_state.fallback_wait_cnt); + return NULL; +} + +void test_percpu_inc(void) +{ + const int num_threads = opt_threads; + int i, sum, ret; + pthread_t test_threads[num_threads]; + struct inc_test_data data; + struct inc_thread_test_data thread_data[num_threads]; + + memset(&data, 0, sizeof(data)); + for (i = 0; i < num_threads; i++) { + thread_data[i].reps = opt_reps; + if (opt_disable_mod <= 0 || (i % opt_disable_mod)) + thread_data[i].reg = 1; + else + thread_data[i].reg = 0; + thread_data[i].data = &data; + ret = pthread_create(&test_threads[i], NULL, + test_percpu_inc_thread, &thread_data[i]); + if (ret) { + errno = ret; + perror("pthread_create"); + abort(); + } + } + + for (i = 0; i < num_threads; i++) { + pthread_join(test_threads[i], NULL); + if (ret) { + errno = ret; + perror("pthread_join"); + abort(); + } + } + + sum = 0; + for (i = 0; i < CPU_SETSIZE; i++) + sum += data.c[i].count; + + assert(sum == opt_reps * num_threads); +} + +int percpu_list_push(struct percpu_list *list, struct percpu_list_node *node) +{ + struct rseq_state rseq_state; + intptr_t *targetptr, newval; + int cpu; + bool result; + + do_rseq(&rseq_lock, rseq_state, cpu, result, targetptr, newval, + { + newval = (intptr_t)node; + targetptr = (intptr_t *)&list->c[cpu].head; + node->next = list->c[cpu].head; + }); + + return cpu; +} + +/* + * Unlike a traditional lock-less linked list; the availability of a + * rseq primitive allows us to implement pop without concerns over + * ABA-type races. 
+ */ +struct percpu_list_node *percpu_list_pop(struct percpu_list *list) +{ + struct percpu_list_node *head, *next; + struct rseq_state rseq_state; + intptr_t *targetptr, newval; + int cpu; + bool result; + + do_rseq(&rseq_lock, rseq_state, cpu, result, targetptr, newval, + { + head = list->c[cpu].head; + if (!head) { + result = false; + } else { + next = head->next; + newval = (intptr_t) next; + targetptr = (intptr_t *) &list->c[cpu].head; + } + }); + + return head; +} + +void *test_percpu_list_thread(void *arg) +{ + int i; + struct percpu_list *list = (struct percpu_list *)arg; + + if (rseq_init_current_thread()) + abort(); + + for (i = 0; i < opt_reps; i++) { + struct percpu_list_node *node = percpu_list_pop(list); + + if (opt_yield) + sched_yield(); /* encourage shuffling */ + if (node) + percpu_list_push(list, node); + } + + return NULL; +} + +/* Simultaneous modification to a per-cpu linked list from many threads. */ +void test_percpu_list(void) +{ + const int num_threads = opt_threads; + int i, j, ret; + long sum = 0, expected_sum = 0; + struct percpu_list list; + pthread_t test_threads[num_threads]; + cpu_set_t allowed_cpus; + + memset(&list, 0, sizeof(list)); + + /* Generate list entries for every usable cpu. */ + sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus); + for (i = 0; i < CPU_SETSIZE; i++) { + if (!CPU_ISSET(i, &allowed_cpus)) + continue; + for (j = 1; j <= 100; j++) { + struct percpu_list_node *node; + + expected_sum += j; + + node = malloc(sizeof(*node)); + assert(node); + node->data = j; + node->next = list.c[i].head; + list.c[i].head = node; + } + } + + for (i = 0; i < num_threads; i++) { + ret = pthread_create(&test_threads[i], NULL, + test_percpu_list_thread, &list); + if (ret) { + errno = ret; + perror("pthread_create"); + abort(); + } + } + + for (i = 0; i < num_threads; i++) { + pthread_join(test_threads[i], NULL); + if (ret) { + errno = ret; + perror("pthread_join"); + abort(); + } + } + + for (i = 0; i < CPU_SETSIZE; i++) { + cpu_set_t pin_mask; + struct percpu_list_node *node; + + if (!CPU_ISSET(i, &allowed_cpus)) + continue; + + CPU_ZERO(&pin_mask); + CPU_SET(i, &pin_mask); + sched_setaffinity(0, sizeof(pin_mask), &pin_mask); + + while ((node = percpu_list_pop(&list))) { + sum += node->data; + free(node); + } + } + + /* + * All entries should now be accounted for (unless some external + * actor is interfering with our allowed affinity while this + * test is running). 
+ */ + assert(sum == expected_sum); +} + +static void test_signal_interrupt_handler(int signo) +{ + signals_delivered++; +} + +static int set_signal_handler(void) +{ + int ret = 0; + struct sigaction sa; + sigset_t sigset; + + ret = sigemptyset(&sigset); + if (ret < 0) { + perror("sigemptyset"); + return ret; + } + + sa.sa_handler = test_signal_interrupt_handler; + sa.sa_mask = sigset; + sa.sa_flags = 0; + ret = sigaction(SIGUSR1, &sa, NULL); + if (ret < 0) { + perror("sigaction"); + return ret; + } + + printf_nobench("Signal handler set for SIGUSR1\n"); + + return ret; +} + +static void show_usage(int argc, char **argv) +{ + printf("Usage : %s \n", + argv[0]); + printf("OPTIONS:\n"); + printf(" [-1 loops] Number of loops for delay injection 1\n"); + printf(" [-2 loops] Number of loops for delay injection 2\n"); + printf(" [-3 loops] Number of loops for delay injection 3\n"); + printf(" [-4 loops] Number of loops for delay injection 4\n"); + printf(" [-5 loops] Number of loops for delay injection 5 (-1 to enable -m)\n"); + printf(" [-6 loops] Number of loops for delay injection 6 (-1 to enable -m)\n"); + printf(" [-7 loops] Number of loops for delay injection 7 (-1 to enable -m)\n"); + printf(" [-8 loops] Number of loops for delay injection 8 (-1 to enable -m)\n"); + printf(" [-9 loops] Number of loops for delay injection 9 (-1 to enable -m)\n"); + printf(" [-m N] Yield/sleep/kill every modulo N (default 0: disabled) (>= 0)\n"); + printf(" [-y] Yield\n"); + printf(" [-k] Kill thread with signal\n"); + printf(" [-s S] S: =0: disabled (default), >0: sleep time (ms)\n"); + printf(" [-f N] Use fallback every N failure (>= 1)\n"); + printf(" [-t N] Number of threads (default 200)\n"); + printf(" [-r N] Number of repetitions per thread (default 5000)\n"); + printf(" [-d] Disable rseq system call (no initialization)\n"); + printf(" [-D M] Disable rseq for each M threads\n"); + printf(" [-T test] Choose test: (s)pinlock, (l)ist, (i)ncrement\n"); + printf(" [-h] Show this help.\n"); + printf("\n"); +} + +int main(int argc, char **argv) +{ + int i; + + if (rseq_init_lock(&rseq_lock)) { + perror("rseq_init_lock"); + return -1; + } + if (set_signal_handler()) + goto error; + for (i = 1; i < argc; i++) { + if (argv[i][0] != '-') + continue; + switch (argv[i][1]) { + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + if (argc < i + 2) { + show_usage(argc, argv); + goto error; + } + loop_cnt[argv[i][1] - '0'] = atol(argv[i + 1]); + i++; + break; + case 'm': + if (argc < i + 2) { + show_usage(argc, argv); + goto error; + } + opt_modulo = atol(argv[i + 1]); + if (opt_modulo < 0) { + show_usage(argc, argv); + goto error; + } + i++; + break; + case 's': + if (argc < i + 2) { + show_usage(argc, argv); + goto error; + } + opt_sleep = atol(argv[i + 1]); + if (opt_sleep < 0) { + show_usage(argc, argv); + goto error; + } + i++; + break; + case 'y': + opt_yield = 1; + break; + case 'k': + opt_signal = 1; + break; + case 'd': + opt_disable_rseq = 1; + break; + case 'D': + if (argc < i + 2) { + show_usage(argc, argv); + goto error; + } + opt_disable_mod = atol(argv[i + 1]); + if (opt_disable_mod < 0) { + show_usage(argc, argv); + goto error; + } + i++; + break; + case 'f': + if (argc < i + 2) { + show_usage(argc, argv); + goto error; + } + opt_fallback_cnt = atol(argv[i + 1]); + if (opt_fallback_cnt < 1) { + show_usage(argc, argv); + goto error; + } + i++; + break; + case 't': + if (argc < i + 2) { + show_usage(argc, argv); + goto error; + } + opt_threads = 
atol(argv[i + 1]); + if (opt_threads < 0) { + show_usage(argc, argv); + goto error; + } + i++; + break; + case 'r': + if (argc < i + 2) { + show_usage(argc, argv); + goto error; + } + opt_reps = atol(argv[i + 1]); + if (opt_reps < 0) { + show_usage(argc, argv); + goto error; + } + i++; + break; + case 'h': + show_usage(argc, argv); + goto end; + case 'T': + if (argc < i + 2) { + show_usage(argc, argv); + goto error; + } + opt_test = *argv[i + 1]; + switch (opt_test) { + case 's': + case 'l': + case 'i': + break; + default: + show_usage(argc, argv); + goto error; + } + i++; + break; + default: + show_usage(argc, argv); + goto error; + } + } + + if (!opt_disable_rseq && rseq_init_current_thread()) + goto error; + switch (opt_test) { + case 's': + printf_nobench("spinlock\n"); + test_percpu_spinlock(); + break; + case 'l': + printf_nobench("linked list\n"); + test_percpu_list(); + break; + case 'i': + printf_nobench("counter increment\n"); + test_percpu_inc(); + break; + } +end: + return 0; + +error: + if (rseq_destroy_lock(&rseq_lock)) + perror("rseq_destroy_lock"); + return -1; +} diff --git a/tools/testing/selftests/rseq/rseq.c b/tools/testing/selftests/rseq/rseq.c new file mode 100644 index 000000000000..f411be2c77bc --- /dev/null +++ b/tools/testing/selftests/rseq/rseq.c @@ -0,0 +1,200 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rseq.h" + +#ifdef __NR_membarrier +# define membarrier(...) syscall(__NR_membarrier, __VA_ARGS__) +#else +# define membarrier(...) -ENOSYS +#endif + +__thread volatile struct rseq_thread_state __rseq_thread_state = { + .abi.u.e.cpu_id = -1, +}; + +int rseq_has_sys_membarrier; + +static int sys_rseq(volatile struct rseq *rseq_abi, int flags) +{ + return syscall(__NR_rseq, rseq_abi, flags); +} + +int rseq_init_current_thread(void) +{ + int rc; + + rc = sys_rseq(&__rseq_thread_state.abi, 0); + if (rc) { + fprintf(stderr, "Error: sys_rseq(...) failed(%d): %s\n", + errno, strerror(errno)); + return -1; + } + assert(rseq_current_cpu() >= 0); + return 0; +} + +int rseq_init_lock(struct rseq_lock *rlock) +{ + int ret; + + ret = pthread_mutex_init(&rlock->lock, NULL); + if (ret) { + errno = ret; + return -1; + } + rlock->state = RSEQ_LOCK_STATE_RESTART; + return 0; +} + +int rseq_destroy_lock(struct rseq_lock *rlock) +{ + int ret; + + ret = pthread_mutex_destroy(&rlock->lock); + if (ret) { + errno = ret; + return -1; + } + return 0; +} + +static void signal_off_save(sigset_t *oldset) +{ + sigset_t set; + int ret; + + sigfillset(&set); + ret = pthread_sigmask(SIG_BLOCK, &set, oldset); + if (ret) + abort(); +} + +static void signal_restore(sigset_t oldset) +{ + int ret; + + ret = pthread_sigmask(SIG_SETMASK, &oldset, NULL); + if (ret) + abort(); +} + +static void rseq_fallback_lock(struct rseq_lock *rlock) +{ + signal_off_save((sigset_t *)&__rseq_thread_state.sigmask_saved); + pthread_mutex_lock(&rlock->lock); + __rseq_thread_state.fallback_cnt++; + /* + * For concurrent threads arriving before we set LOCK: + * reading cpu_id after setting the state to LOCK + * ensures they restart. + */ + ACCESS_ONCE(rlock->state) = RSEQ_LOCK_STATE_LOCK; + /* + * For concurrent threads arriving after we set LOCK: + * those will grab the lock, so we are protected by + * mutual exclusion. 
+ */ +} + +void rseq_fallback_wait(struct rseq_lock *rlock) +{ + signal_off_save((sigset_t *)&__rseq_thread_state.sigmask_saved); + pthread_mutex_lock(&rlock->lock); + __rseq_thread_state.fallback_wait_cnt++; + pthread_mutex_unlock(&rlock->lock); + signal_restore(__rseq_thread_state.sigmask_saved); +} + +static void rseq_fallback_unlock(struct rseq_lock *rlock, int cpu_at_start) +{ + /* + * Concurrent rseq arriving before we set state back to RESTART + * grab the lock. Those arriving after we set state back to + * RESTART will perform restartable critical sections. The next + * owner of the lock will take care of making sure it prevents + * concurrent restartable sequences from completing. We may be + * writing from another CPU, so update the state with a store + * release semantic to ensure restartable sections will see our + * side effect (writing to *p) before they enter their + * restartable critical section. + * + * In cases where we observe that we are on the right CPU after the + * critical section, program order ensures that following restartable + * critical sections will see our stores, so we don't have to use + * store-release or membarrier. + * + * Use sys_membarrier when available to remove the memory barrier + * implied by smp_load_acquire(). + */ + barrier(); + if (likely(rseq_current_cpu() == cpu_at_start)) { + ACCESS_ONCE(rlock->state) = RSEQ_LOCK_STATE_RESTART; + } else { + if (!has_fast_acquire_release() && rseq_has_sys_membarrier) { + if (membarrier(MEMBARRIER_CMD_SHARED, 0)) + abort(); + ACCESS_ONCE(rlock->state) = RSEQ_LOCK_STATE_RESTART; + } else { + /* + * Store with release semantic to ensure + * restartable sections will see our side effect + * (writing to *p) before they enter their + * restartable critical section. Matches + * smp_load_acquire() in rseq_start(). + */ + smp_store_release(&rlock->state, + RSEQ_LOCK_STATE_RESTART); + } + } + pthread_mutex_unlock(&rlock->lock); + signal_restore(__rseq_thread_state.sigmask_saved); +} + +int rseq_fallback_current_cpu(void) +{ + int cpu; + + cpu = sched_getcpu(); + if (cpu < 0) { + perror("sched_getcpu()"); + abort(); + } + return cpu; +} + +int rseq_fallback_begin(struct rseq_lock *rlock) +{ + rseq_fallback_lock(rlock); + return rseq_fallback_current_cpu(); +} + +void rseq_fallback_end(struct rseq_lock *rlock, int cpu) +{ + rseq_fallback_unlock(rlock, cpu); +} + +/* Handle non-initialized rseq for this thread. */ +void rseq_fallback_noinit(struct rseq_state *rseq_state) +{ + rseq_state->lock_state = RSEQ_LOCK_STATE_FAIL; + rseq_state->cpu_id = 0; +} + +void __attribute__((constructor)) rseq_init(void) +{ + int ret; + + ret = membarrier(MEMBARRIER_CMD_QUERY, 0); + if (ret >= 0 && (ret & MEMBARRIER_CMD_SHARED)) + rseq_has_sys_membarrier = 1; +} diff --git a/tools/testing/selftests/rseq/rseq.h b/tools/testing/selftests/rseq/rseq.h new file mode 100644 index 000000000000..791e14cf42ae --- /dev/null +++ b/tools/testing/selftests/rseq/rseq.h @@ -0,0 +1,449 @@ +#ifndef RSEQ_H +#define RSEQ_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Empty code injection macros, override when testing. + * It is important to consider that the ASM injection macros need to be + * fully reentrant (e.g. do not modify the stack). 
+ */ +#ifndef RSEQ_INJECT_ASM +#define RSEQ_INJECT_ASM(n) +#endif + +#ifndef RSEQ_INJECT_C +#define RSEQ_INJECT_C(n) +#endif + +#ifndef RSEQ_INJECT_INPUT +#define RSEQ_INJECT_INPUT +#endif + +#ifndef RSEQ_INJECT_CLOBBER +#define RSEQ_INJECT_CLOBBER +#endif + +#ifndef RSEQ_INJECT_FAILED +#define RSEQ_INJECT_FAILED +#endif + +#ifndef RSEQ_FALLBACK_CNT +#define RSEQ_FALLBACK_CNT 3 +#endif + +struct rseq_thread_state { + struct rseq abi; /* Kernel ABI. */ + uint32_t fallback_wait_cnt; + uint32_t fallback_cnt; + sigset_t sigmask_saved; +}; + +extern __thread volatile struct rseq_thread_state __rseq_thread_state; +extern int rseq_has_sys_membarrier; + +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) +#define barrier() __asm__ __volatile__("" : : : "memory") + +#define ACCESS_ONCE(x) (*(__volatile__ __typeof__(x) *)&(x)) +#define WRITE_ONCE(x, v) __extension__ ({ ACCESS_ONCE(x) = (v); }) +#define READ_ONCE(x) ACCESS_ONCE(x) + +#ifdef __x86_64__ + +#define smp_mb() __asm__ __volatile__ ("mfence" : : : "memory") +#define smp_rmb() barrier() +#define smp_wmb() barrier() + +#define smp_load_acquire(p) \ +__extension__ ({ \ + __typeof(*p) ____p1 = READ_ONCE(*p); \ + barrier(); \ + ____p1; \ +}) + +#define smp_acquire__after_ctrl_dep() smp_rmb() + +#define smp_store_release(p, v) \ +do { \ + barrier(); \ + WRITE_ONCE(*p, v); \ +} while (0) + +#define has_fast_acquire_release() 1 +#define has_single_copy_load_64() 1 + +#elif __i386__ + +/* + * Support older 32-bit architectures that do not implement fence + * instructions. + */ +#define smp_mb() \ + __asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory") +#define smp_rmb() \ + __asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory") +#define smp_wmb() \ + __asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory") + +#define smp_load_acquire(p) \ +__extension__ ({ \ + __typeof(*p) ____p1 = READ_ONCE(*p); \ + smp_mb(); \ + ____p1; \ +}) + +#define smp_acquire__after_ctrl_dep() smp_rmb() + +#define smp_store_release(p, v) \ +do { \ + smp_mb(); \ + WRITE_ONCE(*p, v); \ +} while (0) + +#define has_fast_acquire_release() 0 +#define has_single_copy_load_64() 0 + +#elif defined(__ARMEL__) + +#define smp_mb() __asm__ __volatile__ ("dmb" : : : "memory") +#define smp_rmb() __asm__ __volatile__ ("dmb" : : : "memory") +#define smp_wmb() __asm__ __volatile__ ("dmb" : : : "memory") + +#define smp_load_acquire(p) \ +__extension__ ({ \ + __typeof(*p) ____p1 = READ_ONCE(*p); \ + smp_mb(); \ + ____p1; \ +}) + +#define smp_acquire__after_ctrl_dep() smp_rmb() + +#define smp_store_release(p, v) \ +do { \ + smp_mb(); \ + WRITE_ONCE(*p, v); \ +} while (0) + +#define has_fast_acquire_release() 0 +#define has_single_copy_load_64() 1 + +#else +#error unsupported target +#endif + +enum rseq_lock_state { + RSEQ_LOCK_STATE_RESTART = 0, + RSEQ_LOCK_STATE_LOCK = 1, + RSEQ_LOCK_STATE_FAIL = 2, +}; + +struct rseq_lock { + pthread_mutex_t lock; + int32_t state; /* enum rseq_lock_state */ +}; + +/* State returned by rseq_start, passed as argument to rseq_finish. */ +struct rseq_state { + volatile struct rseq_thread_state *rseqp; + int32_t cpu_id; /* cpu_id at start. */ + uint32_t event_counter; /* event_counter at start. */ + int32_t lock_state; /* Lock state at start. */ +}; + +/* + * Initialize rseq for the current thread. Must be called once by any + * thread which uses restartable sequences, before they start using + * restartable sequences. 
If initialization is not invoked, or if it + * fails, the restartable critical sections will fall-back on locking + * (rseq_lock). + */ +int rseq_init_current_thread(void); + +/* + * The fallback lock should be initialized before being used by any + * thread, and destroyed after all threads are done using it. This lock + * should be used by all rseq calls associated with shared data, either + * between threads, or between processes in a shared memory. + * + * There may be many rseq_lock per process, e.g. one per protected data + * structure. + */ +int rseq_init_lock(struct rseq_lock *rlock); +int rseq_destroy_lock(struct rseq_lock *rlock); + +/* + * Restartable sequence fallback prototypes. Fallback on locking when + * rseq is not initialized, not available on the system, or during + * single-stepping to ensure forward progress. + */ +int rseq_fallback_begin(struct rseq_lock *rlock); +void rseq_fallback_end(struct rseq_lock *rlock, int cpu); +void rseq_fallback_wait(struct rseq_lock *rlock); +void rseq_fallback_noinit(struct rseq_state *rseq_state); + +/* + * Restartable sequence fallback for reading the current CPU number. + */ +int rseq_fallback_current_cpu(void); + +static inline int32_t rseq_cpu_at_start(struct rseq_state start_value) +{ + return start_value.cpu_id; +} + +static inline int32_t rseq_current_cpu_raw(void) +{ + return ACCESS_ONCE(__rseq_thread_state.abi.u.e.cpu_id); +} + +static inline int32_t rseq_current_cpu(void) +{ + int32_t cpu; + + cpu = rseq_current_cpu_raw(); + if (unlikely(cpu < 0)) + cpu = rseq_fallback_current_cpu(); + return cpu; +} + +static inline __attribute__((always_inline)) +struct rseq_state rseq_start(struct rseq_lock *rlock) +{ + struct rseq_state result; + + result.rseqp = &__rseq_thread_state; + if (has_single_copy_load_64()) { + union { + struct { + uint32_t cpu_id; + uint32_t event_counter; + } e; + uint64_t v; + } u; + + u.v = ACCESS_ONCE(result.rseqp->abi.u.v); + result.event_counter = u.e.event_counter; + result.cpu_id = u.e.cpu_id; + } else { + result.event_counter = + ACCESS_ONCE(result.rseqp->abi.u.e.event_counter); + /* load event_counter before cpu_id. */ + RSEQ_INJECT_C(5) + result.cpu_id = ACCESS_ONCE(result.rseqp->abi.u.e.cpu_id); + } + /* + * Read event counter before lock state and cpu_id. This ensures + * that when the state changes from RESTART to LOCK, if we have + * some threads that have already seen the RESTART still in + * flight, they will necessarily be preempted/signalled before a + * thread can see the LOCK state for that same CPU. That + * preemption/signalling will cause them to restart, so they + * don't interfere with the lock. + */ + RSEQ_INJECT_C(6) + + if (!has_fast_acquire_release() && likely(rseq_has_sys_membarrier)) { + result.lock_state = ACCESS_ONCE(rlock->state); + barrier(); + } else { + /* + * Load lock state with acquire semantic. Matches + * smp_store_release() in rseq_fallback_end(). + */ + result.lock_state = smp_load_acquire(&rlock->state); + } + if (unlikely(result.cpu_id < 0)) + rseq_fallback_noinit(&result); + /* + * We need to ensure that the compiler does not re-order the + * loads of any protected values before we read the current + * state. 
+ */ + barrier(); + return result; +} + +static inline __attribute__((always_inline)) +bool rseq_finish(struct rseq_lock *rlock, + intptr_t *p, intptr_t to_write, + struct rseq_state start_value) +{ + RSEQ_INJECT_C(9) + + if (unlikely(start_value.lock_state != RSEQ_LOCK_STATE_RESTART)) { + if (start_value.lock_state == RSEQ_LOCK_STATE_LOCK) + rseq_fallback_wait(rlock); + return false; + } + +#ifdef __x86_64__ + /* + * The __rseq_table section can be used by debuggers to better + * handle single-stepping through the restartable critical + * sections. + */ + __asm__ __volatile__ goto ( + ".pushsection __rseq_table, \"aw\"\n\t" + ".balign 8\n\t" + "4:\n\t" + ".quad 1f, 2f, 3f\n\t" + ".popsection\n\t" + "1:\n\t" + RSEQ_INJECT_ASM(1) + "movq $4b, (%[rseq_cs])\n\t" + RSEQ_INJECT_ASM(2) + "cmpl %[start_event_counter], %[current_event_counter]\n\t" + "jnz 3f\n\t" + RSEQ_INJECT_ASM(3) + "movq %[to_write], (%[target])\n\t" + "2:\n\t" + RSEQ_INJECT_ASM(4) + "movq $0, (%[rseq_cs])\n\t" + "jmp %l[succeed]\n\t" + "3: movq $0, (%[rseq_cs])\n\t" + : /* no outputs */ + : [start_event_counter]"r"(start_value.event_counter), + [current_event_counter]"m"(start_value.rseqp->abi.u.e.event_counter), + [to_write]"r"(to_write), + [target]"r"(p), + [rseq_cs]"r"(&start_value.rseqp->abi.rseq_cs) + RSEQ_INJECT_INPUT + : "memory", "cc" + RSEQ_INJECT_CLOBBER + : succeed + ); +#elif defined(__i386__) + /* + * The __rseq_table section can be used by debuggers to better + * handle single-stepping through the restartable critical + * sections. + */ + __asm__ __volatile__ goto ( + ".pushsection __rseq_table, \"aw\"\n\t" + ".balign 8\n\t" + "4:\n\t" + ".long 1f, 0x0, 2f, 0x0, 3f, 0x0\n\t" + ".popsection\n\t" + "1:\n\t" + RSEQ_INJECT_ASM(1) + "movl $4b, (%[rseq_cs])\n\t" + RSEQ_INJECT_ASM(2) + "cmpl %[start_event_counter], %[current_event_counter]\n\t" + "jnz 3f\n\t" + RSEQ_INJECT_ASM(3) + "movl %[to_write], (%[target])\n\t" + "2:\n\t" + RSEQ_INJECT_ASM(4) + "movl $0, (%[rseq_cs])\n\t" + "jmp %l[succeed]\n\t" + "3: movl $0, (%[rseq_cs])\n\t" + : /* no outputs */ + : [start_event_counter]"r"(start_value.event_counter), + [current_event_counter]"m"(start_value.rseqp->abi.u.e.event_counter), + [to_write]"r"(to_write), + [target]"r"(p), + [rseq_cs]"r"(&start_value.rseqp->abi.rseq_cs) + RSEQ_INJECT_INPUT + : "memory", "cc" + RSEQ_INJECT_CLOBBER + : succeed + ); +#elif defined(__ARMEL__) + { + /* + * The __rseq_table section can be used by debuggers to better + * handle single-stepping through the restartable critical + * sections. 
+ */ + __asm__ __volatile__ goto ( + ".pushsection __rseq_table, \"aw\"\n\t" + ".balign 8\n\t" + ".word 1f, 0x0, 2f, 0x0, 3f, 0x0\n\t" + ".popsection\n\t" + "1:\n\t" + RSEQ_INJECT_ASM(1) + "adr r0, 4f\n\t" + "str r0, [%[rseq_cs]]\n\t" + RSEQ_INJECT_ASM(2) + "ldr r0, %[current_event_counter]\n\t" + "mov r1, #0\n\t" + "cmp %[start_event_counter], r0\n\t" + "bne 3f\n\t" + RSEQ_INJECT_ASM(3) + "str %[to_write], [%[target]]\n\t" + "2:\n\t" + RSEQ_INJECT_ASM(4) + "str r1, [%[rseq_cs]]\n\t" + "b %l[succeed]\n\t" + ".balign 8\n\t" + "4:\n\t" + ".word 1b, 0x0, 2b, 0x0, 3f, 0x0\n\t" + "3:\n\t" + "mov r1, #0\n\t" + "str r1, [%[rseq_cs]]\n\t" + : /* no outputs */ + : [start_event_counter]"r"(start_value.event_counter), + [current_event_counter]"m"(start_value.rseqp->abi.u.e.event_counter), + [to_write]"r"(to_write), + [rseq_cs]"r"(&start_value.rseqp->abi.rseq_cs), + [target]"r"(p) + RSEQ_INJECT_INPUT + : "r0", "r1", "memory", "cc" + RSEQ_INJECT_CLOBBER + : succeed + ); + } +#else +#error unsupported target +#endif + RSEQ_INJECT_FAILED + return false; +succeed: + return true; +} + +/* + * Helper macro doing two restartable critical section attempts, and if + * they fail, fallback on locking. + */ +#define do_rseq(_lock, _rseq_state, _cpu, _result, _targetptr, _newval, \ + _code) \ + do { \ + _rseq_state = rseq_start(_lock); \ + _cpu = rseq_cpu_at_start(_rseq_state); \ + _result = true; \ + _code \ + if (unlikely(!_result)) \ + break; \ + if (likely(rseq_finish(_lock, _targetptr, _newval, \ + _rseq_state))) \ + break; \ + _rseq_state = rseq_start(_lock); \ + _cpu = rseq_cpu_at_start(_rseq_state); \ + _result = true; \ + _code \ + if (unlikely(!_result)) \ + break; \ + if (likely(rseq_finish(_lock, _targetptr, _newval, \ + _rseq_state))) \ + break; \ + _cpu = rseq_fallback_begin(_lock); \ + _result = true; \ + _code \ + if (likely(_result)) \ + *(_targetptr) = (_newval); \ + rseq_fallback_end(_lock, _cpu); \ + } while (0) + +#endif /* RSEQ_H_ */ -- 2.34.1
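The helper library shipped with these self-tests is easiest to see in use through a small, self-contained sketch. The program below (hypothetical file percpu_count.c, not part of the patch set) builds a per-cpu counter on top of the do_rseq() helper from rseq.h; it assumes a kernel with the rseq system call wired up as in the patches above, and that it is compiled together with the helper library, e.g. cc -O2 percpu_count.c rseq.c -lpthread. The names counter_lock, counters, percpu_counter_inc, worker, NR_THREADS and NR_INCREMENTS are illustrative only. As in the self-tests, each per-cpu slot is aligned to 128 bytes to avoid false sharing, and do_rseq() falls back on counter_lock after two failed attempts or when rseq registration is unavailable, so the update always completes.

#define _GNU_SOURCE
#include <assert.h>
#include <pthread.h>
#include <sched.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include "rseq.h"

#define NR_THREADS	4
#define NR_INCREMENTS	100000

/* Fallback lock used by do_rseq() when rseq cannot make progress. */
static struct rseq_lock counter_lock;

/* One counter per possible cpu, aligned to avoid false sharing. */
struct counter_entry {
	intptr_t count;
} __attribute__((aligned(128)));

static struct counter_entry counters[CPU_SETSIZE];

/* Increment the counter of the cpu this thread currently runs on. */
static int percpu_counter_inc(void)
{
	struct rseq_state rseq_state;
	intptr_t *targetptr, newval;
	int cpu;
	bool result;

	/*
	 * do_rseq() retries the restartable sequence twice, then falls
	 * back on counter_lock, so the increment always completes.
	 */
	do_rseq(&counter_lock, rseq_state, cpu, result, targetptr, newval,
		{
			newval = counters[cpu].count + 1;
			targetptr = &counters[cpu].count;
		});
	return cpu;
}

static void *worker(void *arg)
{
	int i;

	(void)arg;
	/*
	 * Register this thread with the rseq system call. If this fails
	 * (e.g. the kernel lacks sys_rseq), do_rseq() transparently
	 * falls back on counter_lock.
	 */
	(void)rseq_init_current_thread();
	for (i = 0; i < NR_INCREMENTS; i++)
		percpu_counter_inc();
	return NULL;
}

int main(void)
{
	pthread_t tids[NR_THREADS];
	intptr_t sum = 0;
	int i;

	if (rseq_init_lock(&counter_lock))
		abort();
	for (i = 0; i < NR_THREADS; i++)
		assert(!pthread_create(&tids[i], NULL, worker, NULL));
	for (i = 0; i < NR_THREADS; i++)
		assert(!pthread_join(tids[i], NULL));
	for (i = 0; i < CPU_SETSIZE; i++)
		sum += counters[i].count;
	printf("sum: %ld (expected %d)\n", (long)sum,
	       NR_THREADS * NR_INCREMENTS);
	if (rseq_destroy_lock(&counter_lock))
		abort();
	return 0;
}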