From 47c459beabe969c6751e2ea8d1f85c5fa1652d6c Mon Sep 17 00:00:00 2001 From: Ganapatrao Kulkarni Date: Thu, 7 Jul 2016 10:18:17 +0530 Subject: [PATCH 01/16] arm64: Enable workaround for Cavium erratum 27456 on thunderx-81xx Cavium erratum 27456 commit 104a0c02e8b1 ("arm64: Add workaround for Cavium erratum 27456") is applicable for thunderx-81xx pass1.0 SoC as well. Adding code to enable to 81xx. Signed-off-by: Ganapatrao Kulkarni Reviewed-by: Andrew Pinski Signed-off-by: Will Deacon --- arch/arm64/include/asm/cputype.h | 2 ++ arch/arm64/kernel/cpu_errata.c | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 87e1985f3be8..9d9fd4b9a72e 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -80,12 +80,14 @@ #define APM_CPU_PART_POTENZA 0x000 #define CAVIUM_CPU_PART_THUNDERX 0x0A1 +#define CAVIUM_CPU_PART_THUNDERX_81XX 0x0A2 #define BRCM_CPU_PART_VULCAN 0x516 #define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53) #define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57) #define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) +#define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX) #ifndef __ASSEMBLY__ diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index d42789499f17..af716b65110d 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -98,6 +98,12 @@ const struct arm64_cpu_capabilities arm64_errata[] = { MIDR_RANGE(MIDR_THUNDERX, 0x00, (1 << MIDR_VARIANT_SHIFT) | 1), }, + { + /* Cavium ThunderX, T81 pass 1.0 */ + .desc = "Cavium erratum 27456", + .capability = ARM64_WORKAROUND_CAVIUM_27456, + MIDR_RANGE(MIDR_THUNDERX_81XX, 0x00, 0x00), + }, #endif { } -- 2.34.1 From 78c4e172412de5d0456dc00d2b34050aa0b683b5 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 5 Jul 2016 17:32:29 -0400 Subject: [PATCH 02/16] Revert "ecryptfs: forbid opening files without mmap handler" This reverts commit 2f36db71009304b3f0b95afacd8eba1f9f046b87. It fixed a local root exploit but also introduced a dependency on the lower file system implementing an mmap operation just to open a file, which is a bit of a heavy hammer. The right fix is to have mmap depend on the existence of the mmap handler instead. Signed-off-by: Jeff Mahoney Cc: stable@vger.kernel.org Signed-off-by: Tyler Hicks --- fs/ecryptfs/kthread.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c index e818f5ac7a26..866bb18efefe 100644 --- a/fs/ecryptfs/kthread.c +++ b/fs/ecryptfs/kthread.c @@ -25,7 +25,6 @@ #include #include #include -#include #include "ecryptfs_kernel.h" struct ecryptfs_open_req { @@ -148,7 +147,7 @@ int ecryptfs_privileged_open(struct file **lower_file, flags |= IS_RDONLY(d_inode(lower_dentry)) ? 
O_RDONLY : O_RDWR; (*lower_file) = dentry_open(&req.path, flags, cred); if (!IS_ERR(*lower_file)) - goto have_file; + goto out; if ((flags & O_ACCMODE) == O_RDONLY) { rc = PTR_ERR((*lower_file)); goto out; @@ -166,16 +165,8 @@ int ecryptfs_privileged_open(struct file **lower_file, mutex_unlock(&ecryptfs_kthread_ctl.mux); wake_up(&ecryptfs_kthread_ctl.wait); wait_for_completion(&req.done); - if (IS_ERR(*lower_file)) { + if (IS_ERR(*lower_file)) rc = PTR_ERR(*lower_file); - goto out; - } -have_file: - if ((*lower_file)->f_op->mmap == NULL) { - fput(*lower_file); - *lower_file = NULL; - rc = -EMEDIUMTYPE; - } out: return rc; } -- 2.34.1 From 30a46a4647fd1df9cf52e43bf467f0d9265096ca Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Thu, 7 Jul 2016 13:41:11 -0700 Subject: [PATCH 03/16] apparmor: fix oops, validate buffer size in apparmor_setprocattr() When proc_pid_attr_write() was changed to use memdup_user apparmor's (interface violating) assumption that the setprocattr buffer was always a single page was violated. The size test is not strictly speaking needed as proc_pid_attr_write() will reject anything larger, but for the sake of robustness we can keep it in. SMACK and SELinux look safe to me, but somebody else should probably have a look just in case. Based on original patch from Vegard Nossum modified for the case that apparmor provides null termination. Fixes: bb646cdb12e75d82258c2f2e7746d5952d3e321a Reported-by: Vegard Nossum Cc: Al Viro Cc: John Johansen Cc: Paul Moore Cc: Stephen Smalley Cc: Eric Paris Cc: Casey Schaufler Cc: stable@kernel.org Signed-off-by: John Johansen Reviewed-by: Tyler Hicks Signed-off-by: James Morris --- security/apparmor/lsm.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 2660fbcf94d1..7798e1608f4f 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -500,34 +500,34 @@ static int apparmor_setprocattr(struct task_struct *task, char *name, { struct common_audit_data sa; struct apparmor_audit_data aad = {0,}; - char *command, *args = value; + char *command, *largs = NULL, *args = value; size_t arg_size; int error; if (size == 0) return -EINVAL; - /* args points to a PAGE_SIZE buffer, AppArmor requires that - * the buffer must be null terminated or have size <= PAGE_SIZE -1 - * so that AppArmor can null terminate them - */ - if (args[size - 1] != '\0') { - if (size == PAGE_SIZE) - return -EINVAL; - args[size] = '\0'; - } - /* task can only write its own attributes */ if (current != task) return -EACCES; - args = value; + /* AppArmor requires that the buffer must be null terminated atm */ + if (args[size - 1] != '\0') { + /* null terminate */ + largs = args = kmalloc(size + 1, GFP_KERNEL); + if (!args) + return -ENOMEM; + memcpy(args, value, size); + args[size] = '\0'; + } + + error = -EINVAL; args = strim(args); command = strsep(&args, " "); if (!args) - return -EINVAL; + goto out; args = skip_spaces(args); if (!*args) - return -EINVAL; + goto out; arg_size = size - (args - (char *) value); if (strcmp(name, "current") == 0) { @@ -553,10 +553,12 @@ static int apparmor_setprocattr(struct task_struct *task, char *name, goto fail; } else /* only support the "current" and "exec" process attributes */ - return -EINVAL; + goto fail; if (!error) error = size; +out: + kfree(largs); return error; fail: @@ -565,9 +567,9 @@ fail: aad.profile = aa_current_profile(); aad.op = OP_SETPROCATTR; aad.info = name; - aad.error = -EINVAL; + aad.error = 
error = -EINVAL; aa_audit_msg(AUDIT_APPARMOR_DENIED, &sa, NULL); - return -EINVAL; + goto out; } static int apparmor_task_setrlimit(struct task_struct *task, -- 2.34.1 From 7469be95a487319514adce2304ad2af3553d2fc9 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 7 Jul 2016 01:32:04 -0600 Subject: [PATCH 04/16] xenbus: don't bail early from xenbus_dev_request_and_reply() xenbus_dev_request_and_reply() needs to track whether a transaction is open. For XS_TRANSACTION_START messages it calls transaction_start() and for XS_TRANSACTION_END messages it calls transaction_end(). If sending an XS_TRANSACTION_START message fails or responds with an an error, the transaction is not open and transaction_end() must be called. If sending an XS_TRANSACTION_END message fails, the transaction is still open, but if an error response is returned the transaction is closed. Commit 027bd7e89906 ("xen/xenbus: Avoid synchronous wait on XenBus stalling shutdown/restart") introduced a regression where failed XS_TRANSACTION_START messages were leaving the transaction open. This can cause problems with suspend (and migration) as all transactions must be closed before suspending. It appears that the problematic change was added accidentally, so just remove it. Signed-off-by: Jan Beulich Cc: Konrad Rzeszutek Wilk Cc: Signed-off-by: David Vrabel --- drivers/xen/xenbus/xenbus_xs.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index 374b12af8812..0bd3d47ad24d 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -249,9 +249,6 @@ void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg) mutex_unlock(&xs_state.request_mutex); - if (IS_ERR(ret)) - return ret; - if ((msg->type == XS_TRANSACTION_END) || ((req_msg.type == XS_TRANSACTION_START) && (msg->type == XS_ERROR))) -- 2.34.1 From e5a79475a7ae171fef82608c6e11f51bb85a6745 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 7 Jul 2016 01:32:35 -0600 Subject: [PATCH 05/16] xenbus: simplify xenbus_dev_request_and_reply() No need to retain a local copy of the full request message, only the type is really needed. 
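Taken together, the two xenbus changes reduce the bookkeeping to the single check kept in xenbus_dev_request_and_reply(); expressed as a standalone predicate it would look roughly like this (a sketch with a hypothetical helper name, not code from the driver):

        /*
         * Hypothetical helper mirroring the condition retained in
         * xenbus_dev_request_and_reply(): "type" is the request type saved
         * before sending, "msg_type" is msg->type as checked afterwards.
         * Assumes the xsd_sockmsg_type enum from xen/interface/io/xs_wire.h.
         */
        static bool needs_transaction_end(enum xsd_sockmsg_type type,
                                          enum xsd_sockmsg_type msg_type)
        {
                return msg_type == XS_TRANSACTION_END ||
                       (type == XS_TRANSACTION_START && msg_type == XS_ERROR);
        }
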
Signed-off-by: Jan Beulich Signed-off-by: David Vrabel --- drivers/xen/xenbus/xenbus_xs.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index 0bd3d47ad24d..22f7cd711c57 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -232,10 +232,10 @@ static void transaction_resume(void) void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg) { void *ret; - struct xsd_sockmsg req_msg = *msg; + enum xsd_sockmsg_type type = msg->type; int err; - if (req_msg.type == XS_TRANSACTION_START) + if (type == XS_TRANSACTION_START) transaction_start(); mutex_lock(&xs_state.request_mutex); @@ -250,8 +250,7 @@ void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg) mutex_unlock(&xs_state.request_mutex); if ((msg->type == XS_TRANSACTION_END) || - ((req_msg.type == XS_TRANSACTION_START) && - (msg->type == XS_ERROR))) + ((type == XS_TRANSACTION_START) && (msg->type == XS_ERROR))) transaction_end(); return ret; -- 2.34.1 From 6f2d9d99213514360034c6d52d2c3919290b3504 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 8 Jul 2016 06:15:07 -0600 Subject: [PATCH 06/16] xen/acpi: allow xen-acpi-processor driver to load on Xen 4.7 As of Xen 4.7 PV CPUID doesn't expose either of CPUID[1].ECX[7] and CPUID[0x80000007].EDX[7] anymore, causing the driver to fail to load on both Intel and AMD systems. Doing any kind of hardware capability checks in the driver as a prerequisite was wrong anyway: With the hypervisor being in charge, all such checking should be done by it. If ACPI data gets uploaded despite some missing capability, the hypervisor is free to ignore part or all of that data. Ditch the entire check_prereq() function, and do the only valid check (xen_initial_domain()) in the caller in its place. Signed-off-by: Jan Beulich Cc: Signed-off-by: David Vrabel --- drivers/xen/xen-acpi-processor.c | 35 +++----------------------------- 1 file changed, 3 insertions(+), 32 deletions(-) diff --git a/drivers/xen/xen-acpi-processor.c b/drivers/xen/xen-acpi-processor.c index 076970a54f89..4ce10bcca18b 100644 --- a/drivers/xen/xen-acpi-processor.c +++ b/drivers/xen/xen-acpi-processor.c @@ -423,36 +423,7 @@ upload: return 0; } -static int __init check_prereq(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - if (!xen_initial_domain()) - return -ENODEV; - - if (!acpi_gbl_FADT.smi_command) - return -ENODEV; - - if (c->x86_vendor == X86_VENDOR_INTEL) { - if (!cpu_has(c, X86_FEATURE_EST)) - return -ENODEV; - return 0; - } - if (c->x86_vendor == X86_VENDOR_AMD) { - /* Copied from powernow-k8.h, can't include ../cpufreq/powernow - * as we get compile warnings for the static functions. - */ -#define CPUID_FREQ_VOLT_CAPABILITIES 0x80000007 -#define USE_HW_PSTATE 0x00000080 - u32 eax, ebx, ecx, edx; - cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); - if ((edx & USE_HW_PSTATE) != USE_HW_PSTATE) - return -ENODEV; - return 0; - } - return -ENODEV; -} /* acpi_perf_data is a pointer to percpu data. 
*/ static struct acpi_processor_performance __percpu *acpi_perf_data; @@ -509,10 +480,10 @@ struct notifier_block xen_acpi_processor_resume_nb = { static int __init xen_acpi_processor_init(void) { unsigned int i; - int rc = check_prereq(); + int rc; - if (rc) - return rc; + if (!xen_initial_domain()) + return -ENODEV; nr_acpi_bits = get_max_acpi_id() + 1; acpi_ids_done = kcalloc(BITS_TO_LONGS(nr_acpi_bits), sizeof(unsigned long), GFP_KERNEL); -- 2.34.1 From f0fe970df3838c202ef6c07a4c2b36838ef0a88b Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 5 Jul 2016 17:32:30 -0400 Subject: [PATCH 07/16] ecryptfs: don't allow mmap when the lower fs doesn't support it There are legitimate reasons to disallow mmap on certain files, notably in sysfs or procfs. We shouldn't emulate mmap support on file systems that don't offer support natively. CVE-2016-1583 Signed-off-by: Jeff Mahoney Cc: stable@vger.kernel.org [tyhicks: clean up f_op check by using ecryptfs_file_to_lower()] Signed-off-by: Tyler Hicks --- fs/ecryptfs/file.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 53d0141b9c20..ca4e83750214 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -169,6 +169,19 @@ out: return rc; } +static int ecryptfs_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct file *lower_file = ecryptfs_file_to_lower(file); + /* + * Don't allow mmap on top of file systems that don't support it + * natively. If FILESYSTEM_MAX_STACK_DEPTH > 2 or ecryptfs + * allows recursive mounting, this will need to be extended. + */ + if (!lower_file->f_op->mmap) + return -ENODEV; + return generic_file_mmap(file, vma); +} + /** * ecryptfs_open * @inode: inode specifying file to open @@ -403,7 +416,7 @@ const struct file_operations ecryptfs_main_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, #endif - .mmap = generic_file_mmap, + .mmap = ecryptfs_mmap, .open = ecryptfs_open, .flush = ecryptfs_flush, .release = ecryptfs_release, -- 2.34.1 From 7f556567036cb7f89aabe2f0954b08566b4efb53 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 10 Jul 2016 16:46:32 -0700 Subject: [PATCH 08/16] tmpfs: fix regression hang in fallocate undo The well-spotted fallocate undo fix is good in most cases, but not when fallocate failed on the very first page. index 0 then passes lend -1 to shmem_undo_range(), and that has two bad effects: (a) that it will undo every fallocation throughout the file, unrestricted by the current range; but more importantly (b) it can cause the undo to hang, because lend -1 is treated as truncation, which makes it keep on retrying until every page has gone, but those already fully instantiated will never go away. Big thank you to xfstests generic/269 which demonstrates this. 
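The failing case is easy to reproduce with the range arithmetic alone (a user-space sketch of the shifted offsets, not the shmem code itself; 4K pages are assumed for the illustration):

        #include <stdio.h>

        #define PAGE_SHIFT 12   /* assumed 4K pages, for illustration only */

        int main(void)
        {
                unsigned long start = 0, index = 0; /* very first page failed */
                long long lstart = (long long)start << PAGE_SHIFT;
                long long lend = ((long long)index << PAGE_SHIFT) - 1;

                /*
                 * lend comes out as -1, which shmem_undo_range() treats as
                 * "truncate to end of file" rather than an empty range; the
                 * fix below only performs the undo when index > start.
                 */
                printf("undo range: %lld .. %lld\n", lstart, lend);
                return 0;
        }
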
Fixes: b9b4bb26af01 ("tmpfs: don't undo fallocate past its last page") Cc: stable@vger.kernel.org Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- mm/shmem.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 24463b67b6ef..171dee7a131f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2225,9 +2225,11 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, error = shmem_getpage(inode, index, &page, SGP_FALLOC); if (error) { /* Remove the !PageUptodate pages we added */ - shmem_undo_range(inode, - (loff_t)start << PAGE_SHIFT, - ((loff_t)index << PAGE_SHIFT) - 1, true); + if (index > start) { + shmem_undo_range(inode, + (loff_t)start << PAGE_SHIFT, + ((loff_t)index << PAGE_SHIFT) - 1, true); + } goto undone; } -- 2.34.1 From 92d21ac74a9e3c09b0b01c764e530657e4c85c49 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 10 Jul 2016 20:24:59 -0700 Subject: [PATCH 09/16] Linux 4.7-rc7 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 0d504893df6e..81b22628025a 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 4 PATCHLEVEL = 7 SUBLEVEL = 0 -EXTRAVERSION = -rc6 +EXTRAVERSION = -rc7 NAME = Psychotic Stoned Sheep # *DOCUMENTATION* -- 2.34.1 From a4949d83eb08908ec1d40bf1237ae68a3ab65a6e Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 22 Dec 2015 08:29:56 -0500 Subject: [PATCH 10/16] Restartable sequences system call (v7) MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Expose a new system call allowing each thread to register one userspace memory area to be used as an ABI between kernel and user-space for two purposes: user-space restartable sequences and quick access to read the current CPU number value from user-space. * Restartable sequences (per-cpu atomics) The restartable critical sections (percpu atomics) work has been started by Paul Turner and Andrew Hunter. It lets the kernel handle restart of critical sections. [1] [2] The re-implementation proposed here brings a few simplifications to the ABI which facilitates porting to other architectures and speeds up the user-space fast path. A locking-based fall-back, purely implemented in user-space, is proposed here to deal with debugger single-stepping. This fallback interacts with rseq_start() and rseq_finish(), which force retries in response to concurrent lock-based activity. 
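As a concrete example of the fast path, a per-cpu counter increment built on this ABI looks roughly like the following (a sketch using the do_rseq() helper from the selftests added later in this series; rseq_lock, the counters array, and the assumption that the calling thread already ran rseq_init_current_thread() are taken from that test code, not from the kernel ABI itself):

        #define _GNU_SOURCE
        #include <sched.h>      /* CPU_SETSIZE */
        #include <stdint.h>
        #include <stdbool.h>
        #include "rseq.h"       /* selftest helper library from this series */

        static struct rseq_lock rseq_lock;
        static intptr_t counters[CPU_SETSIZE];

        /* Increment the counter of the CPU the thread runs on; returns that CPU. */
        static int percpu_counter_inc(void)
        {
                struct rseq_state rseq_state;
                intptr_t *targetptr, newval;
                int cpu;
                bool result;

                /* Assumes rseq_init_current_thread() was called by this thread. */
                do_rseq(&rseq_lock, rseq_state, cpu, result, targetptr, newval,
                        {
                                /* Prepare the new value; the final store is the commit. */
                                newval = counters[cpu] + 1;
                                targetptr = &counters[cpu];
                        });
                return cpu;
        }
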
Here are benchmarks of counter increment in various scenarios compared to restartable sequences: ARMv7 Processor rev 4 (v7l) Machine model: Cubietruck Counter increment speed (ns/increment) 1 thread 2 threads global increment (baseline) 6 N/A percpu rseq increment 50 52 percpu rseq spinlock 94 94 global atomic increment 48 74 (__sync_add_and_fetch_4) global atomic CAS 50 172 (__sync_val_compare_and_swap_4) global pthread mutex 148 862 ARMv7 Processor rev 10 (v7l) Machine model: Wandboard Counter increment speed (ns/increment) 1 thread 4 threads global increment (baseline) 7 N/A percpu rseq increment 50 50 percpu rseq spinlock 82 84 global atomic increment 44 262 (__sync_add_and_fetch_4) global atomic CAS 46 316 (__sync_val_compare_and_swap_4) global pthread mutex 146 1400 x86-64 Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz: Counter increment speed (ns/increment) 1 thread 8 threads global increment (baseline) 3.0 N/A percpu rseq increment 3.6 3.8 percpu rseq spinlock 5.6 6.2 global LOCK; inc 8.0 166.4 global LOCK; cmpxchg 13.4 435.2 global pthread mutex 25.2 1363.6 * Reading the current CPU number Speeding up reading the current CPU number on which the caller thread is running is done by keeping the current CPU number up do date within the cpu_id field of the memory area registered by the thread. This is done by making scheduler migration set the TIF_NOTIFY_RESUME flag on the current thread. Upon return to user-space, a notify-resume handler updates the current CPU value within the registered user-space memory area. User-space can then read the current CPU number directly from memory. Keeping the current cpu id in a memory area shared between kernel and user-space is an improvement over current mechanisms available to read the current CPU number, which has the following benefits over alternative approaches: - 35x speedup on ARM vs system call through glibc - 20x speedup on x86 compared to calling glibc, which calls vdso executing a "lsl" instruction, - 14x speedup on x86 compared to inlined "lsl" instruction, - Unlike vdso approaches, this cpu_id value can be read from an inline assembly, which makes it a useful building block for restartable sequences. - The approach of reading the cpu id through memory mapping shared between kernel and user-space is portable (e.g. ARM), which is not the case for the lsl-based x86 vdso. On x86, yet another possible approach would be to use the gs segment selector to point to user-space per-cpu data. This approach performs similarly to the cpu id cache, but it has two disadvantages: it is not portable, and it is incompatible with existing applications already using the gs segment selector for other purposes. 
Benchmarking various approaches for reading the current CPU number: ARMv7 Processor rev 4 (v7l) Machine model: Cubietruck - Baseline (empty loop): 8.4 ns - Read CPU from rseq cpu_id: 16.7 ns - Read CPU from rseq cpu_id (lazy register): 19.8 ns - glibc 2.19-0ubuntu6.6 getcpu: 301.8 ns - getcpu system call: 234.9 ns x86-64 Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz: - Baseline (empty loop): 0.8 ns - Read CPU from rseq cpu_id: 0.8 ns - Read CPU from rseq cpu_id (lazy register): 0.8 ns - Read using gs segment selector: 0.8 ns - "lsl" inline assembly: 13.0 ns - glibc 2.19-0ubuntu6 getcpu: 16.6 ns - getcpu system call: 53.9 ns - Speed Running 10 runs of hackbench -l 100000 seems to indicate, contrary to expectations, that enabling CONFIG_RSEQ slightly accelerates the scheduler: Configuration: 2 sockets * 8-core Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz (directly on hardware, hyperthreading disabled in BIOS, energy saving disabled in BIOS, turboboost disabled in BIOS, cpuidle.off=1 kernel parameter), with a Linux v4.6 defconfig+localyesconfig, restartable sequences series applied. * CONFIG_RSEQ=n avg.: 41.37 s std.dev.: 0.36 s * CONFIG_RSEQ=y avg.: 40.46 s std.dev.: 0.33 s - Size On x86-64, between CONFIG_RSEQ=n/y, the text size increase of vmlinux is 2855 bytes, and the data size increase of vmlinux is 1024 bytes. * CONFIG_RSEQ=n text data bss dec hex filename 9964559 4256280 962560 15183399 e7ae27 vmlinux.norseq * CONFIG_RSEQ=y text data bss dec hex filename 9967414 4257304 962560 15187278 e7bd4e vmlinux.rseq [1] https://lwn.net/Articles/650333/ [2] http://www.linuxplumbersconf.org/2013/ocw/system/presentations/1695/original/LPC%20-%20PerCpu%20Atomics.pdf Link: http://lkml.kernel.org/r/20151027235635.16059.11630.stgit@pjt-glaptop.roam.corp.google.com Link: http://lkml.kernel.org/r/20150624222609.6116.86035.stgit@kitami.mtv.corp.google.com Signed-off-by: Mathieu Desnoyers CC: Thomas Gleixner CC: Paul Turner CC: Andrew Hunter CC: Peter Zijlstra CC: Andy Lutomirski CC: Andi Kleen CC: Dave Watson CC: Chris Lameter CC: Ingo Molnar CC: "H. Peter Anvin" CC: Ben Maurer CC: Steven Rostedt CC: "Paul E. McKenney" CC: Josh Triplett CC: Linus Torvalds CC: Andrew Morton CC: Russell King CC: Catalin Marinas CC: Will Deacon CC: Michael Kerrisk CC: Boqun Feng CC: linux-api@vger.kernel.org --- Changes since v1: - Return -1, errno=EINVAL if cpu_cache pointer is not aligned on sizeof(int32_t). - Update man page to describe the pointer alignement requirements and update atomicity guarantees. - Add MAINTAINERS file GETCPU_CACHE entry. - Remove dynamic memory allocation: go back to having a single getcpu_cache entry per thread. Update documentation accordingly. - Rebased on Linux 4.4. Changes since v2: - Introduce a "cmd" argument, along with an enum with GETCPU_CACHE_GET and GETCPU_CACHE_SET. Introduce a uapi header linux/getcpu_cache.h defining this enumeration. - Split resume notifier architecture implementation from the system call wire up in the following arch-specific patches. - Man pages updates. - Handle 32-bit compat pointers. - Simplify handling of getcpu_cache GETCPU_CACHE_SET compiler barrier: set the current cpu cache pointer before doing the cache update, and set it back to NULL if the update fails. Setting it back to NULL on error ensures that no resume notifier will trigger a SIGSEGV if a migration happened concurrently. Changes since v3: - Fix __user annotations in compat code, - Update memory ordering comments. - Rebased on kernel v4.5-rc5. 
Changes since v4: - Inline getcpu_cache_fork, getcpu_cache_execve, and getcpu_cache_exit. - Add new line between if() and switch() to improve readability. - Added sched switch benchmarks (hackbench) and size overhead comparison to change log. Changes since v5: - Rename "getcpu_cache" to "thread_local_abi", allowing to extend this system call to cover future features such as restartable critical sections. Generalizing this system call ensures that we can add features similar to the cpu_id field within the same cache-line without having to track one pointer per feature within the task struct. - Add a tlabi_nr parameter to the system call, thus allowing to extend the ABI beyond the initial 64-byte structure by registering structures with tlabi_nr greater than 0. The initial ABI structure is associated with tlabi_nr 0. - Rebased on kernel v4.5. Changes since v6: - Integrate "restartable sequences" v2 patchset from Paul Turner. - Add handling of single-stepping purely in user-space, with a fallback to locking after 2 rseq failures to ensure progress, and by exposing a __rseq_table section to debuggers so they know where to put breakpoints when dealing with rseq assembly blocks which can be aborted at any point. - make the code and ABI generic: porting the kernel implementation simply requires to wire up the signal handler and return to user-space hooks, and allocate the syscall number. - extend testing with a fully configurable test program. See param_spinlock_test -h for details. - handling of rseq ENOSYS in user-space, also with a fallback to locking. - modify Paul Turner's rseq ABI to only require a single TLS store on the user-space fast-path, removing the need to populate two additional registers. This is made possible by introducing struct rseq_cs into the ABI to describe a critical section start_ip, post_commit_ip, and abort_ip. - Rebased on kernel v4.7-rc7. Man page associated: RSEQ(2) Linux Programmer's Manual RSEQ(2) NAME rseq - Restartable sequences and cpu number cache SYNOPSIS #include int rseq(struct rseq * rseq, int flags); DESCRIPTION The rseq() ABI accelerates user-space operations on per-cpu data by defining a shared data structure ABI between each user- space thread and the kernel. The rseq argument is a pointer to the thread-local rseq struc‐ ture to be shared between kernel and user-space. A NULL rseq value can be used to check whether rseq is registered for the current thread. The layout of struct rseq is as follows: Structure alignment This structure needs to be aligned on multiples of 64 bytes. Structure size This structure has a fixed size of 128 bytes. Fields cpu_id Cache of the CPU number on which the calling thread is running. event_counter Restartable sequences event_counter field. rseq_cs Restartable sequences rseq_cs field. Points to a struct rseq_cs. The layout of struct rseq_cs is as follows: Structure alignment This structure needs to be aligned on multiples of 64 bytes. Structure size This structure has a fixed size of 192 bytes. Fields start_ip Instruction pointer address of the first instruction of the sequence of consecutive assembly instructions. post_commit_ip Instruction pointer address after the last instruction of the sequence of consecutive assembly instructions. abort_ip Instruction pointer address where to move the execution flow in case of abort of the sequence of consecutive assembly instructions. The flags argument is currently unused and must be specified as 0. 
Typically, a library or application will keep the rseq struc‐ ture in a thread-local storage variable, or other memory areas belonging to each thread. It is recommended to perform volatile reads of the thread-local cache to prevent the compiler from doing load tearing. An alternative approach is to read each field from inline assembly. Each thread is responsible for registering its rseq structure. Only one rseq structure address can be registered per thread. Once set, the rseq address is idempotent for a given thread. In a typical usage scenario, the thread registering the rseq structure will be performing loads and stores from/to that structure. It is however also allowed to read that structure from other threads. The rseq field updates performed by the kernel provide single-copy atomicity semantics, which guarantee that other threads performing single-copy atomic reads of the cpu number cache will always observe a consistent value. Memory registered as rseq structure should never be deallocated before the thread which registered it exits: specifically, it should not be freed, and the library containing the registered thread-local storage should not be dlclose'd. Violating this constraint may cause a SIGSEGV signal to be delivered to the thread. Unregistration of associated rseq structure is implicitly per‐ formed when a thread or process exit. RETURN VALUE A return value of 0 indicates success. On error, -1 is returned, and errno is set appropriately. ERRORS EINVAL Either flags is non-zero, or rseq contains an address which is not appropriately aligned. ENOSYS The rseq() system call is not implemented by this ker‐ nel. EFAULT rseq is an invalid address. EBUSY The rseq argument contains a non-NULL address which dif‐ fers from the memory location already registered for this thread. ENOENT The rseq argument is NULL, but no memory location is currently registered for this thread. VERSIONS The rseq() system call was added in Linux 4.X (TODO). CONFORMING TO rseq() is Linux-specific. EXAMPLE The following code uses the rseq() system call to keep a thread-local storage variable up to date with the current CPU number, with a fallback on sched_getcpu(3) if the cache is not available. For example simplicity, it is done in main(), but multithreaded programs would need to invoke rseq() from each program thread. #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include static __thread volatile struct rseq rseq_state = { .u.e.cpu_id = -1, }; static int sys_rseq(volatile struct rseq *rseq_abi, int flags) { return syscall(__NR_rseq, rseq_abi, flags); } static int32_t rseq_current_cpu_raw(void) { return rseq_state.u.e.cpu_id; } static int32_t rseq_current_cpu(void) { int32_t cpu; cpu = rseq_current_cpu_raw(); if (cpu < 0) cpu = sched_getcpu(); return cpu; } static int rseq_init_current_thread(void) { int rc; rc = sys_rseq(&rseq_state, 0); if (rc) { fprintf(stderr, "Error: sys_rseq(...) 
failed(%d): %s\n", errno, strerror(errno)); return -1; } return 0; } int main(int argc, char **argv) { if (rseq_init_current_thread()) { fprintf(stderr, "Unable to initialize restartable sequences.\n"); fprintf(stderr, "Using sched_getcpu() as fallback.\n"); } printf("Current CPU number: %d\n", rseq_current_cpu()); exit(EXIT_SUCCESS); } SEE ALSO sched_getcpu(3) Linux 2016-07-19 RSEQ(2) --- MAINTAINERS | 7 ++ arch/Kconfig | 7 ++ fs/exec.c | 1 + include/linux/sched.h | 68 +++++++++++ include/uapi/linux/Kbuild | 1 + include/uapi/linux/rseq.h | 85 ++++++++++++++ init/Kconfig | 13 +++ kernel/Makefile | 1 + kernel/fork.c | 2 + kernel/rseq.c | 231 ++++++++++++++++++++++++++++++++++++++ kernel/sched/core.c | 1 + kernel/sys_ni.c | 3 + 12 files changed, 420 insertions(+) create mode 100644 include/uapi/linux/rseq.h create mode 100644 kernel/rseq.c diff --git a/MAINTAINERS b/MAINTAINERS index 1209323b7e43..daef027e4e13 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5085,6 +5085,13 @@ M: Joe Perches S: Maintained F: scripts/get_maintainer.pl +RESTARTABLE SEQUENCES SUPPORT +M: Mathieu Desnoyers +L: linux-kernel@vger.kernel.org +S: Supported +F: kernel/rseq.c +F: include/uapi/linux/rseq.h + GFS2 FILE SYSTEM M: Steven Whitehouse M: Bob Peterson diff --git a/arch/Kconfig b/arch/Kconfig index 15996290fed4..2c23e26c5f4f 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -242,6 +242,13 @@ config HAVE_REGS_AND_STACK_ACCESS_API declared in asm/ptrace.h For example the kprobes-based event tracer needs this API. +config HAVE_RSEQ + bool + depends on HAVE_REGS_AND_STACK_ACCESS_API + help + This symbol should be selected by an architecture if it + supports an implementation of restartable sequences. + config HAVE_CLK bool help diff --git a/fs/exec.c b/fs/exec.c index 887c1c955df8..e912d8713b23 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1707,6 +1707,7 @@ static int do_execveat_common(int fd, struct filename *filename, /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; + rseq_execve(current); acct_update_integrals(current); task_numa_free(current); free_bprm(bprm); diff --git a/include/linux/sched.h b/include/linux/sched.h index 253538f29ade..5c4b90076715 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -59,6 +59,7 @@ struct sched_param { #include #include #include +#include #include @@ -1918,6 +1919,10 @@ struct task_struct { #ifdef CONFIG_MMU struct task_struct *oom_reaper_list; #endif +#ifdef CONFIG_RSEQ + struct rseq __user *rseq; + uint32_t rseq_event_counter; +#endif /* CPU-specific state of this task */ struct thread_struct thread; /* @@ -3387,4 +3392,67 @@ void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data, void cpufreq_remove_update_util_hook(int cpu); #endif /* CONFIG_CPU_FREQ */ +#ifdef CONFIG_RSEQ +static inline void rseq_set_notify_resume(struct task_struct *t) +{ + if (t->rseq) + set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); +} +void __rseq_handle_notify_resume(struct pt_regs *regs); +static inline void rseq_handle_notify_resume(struct pt_regs *regs) +{ + if (current->rseq) + __rseq_handle_notify_resume(regs); +} +/* + * If parent process has a registered restartable sequences area, the + * child inherits. Only applies when forking a process, not a thread. In + * case a parent fork() in the middle of a restartable sequence, set the + * resume notifier to force the child to retry. 
+ */ +static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) +{ + if (clone_flags & CLONE_THREAD) { + t->rseq = NULL; + t->rseq_event_counter = 0; + } else { + t->rseq = current->rseq; + t->rseq_event_counter = current->rseq_event_counter; + rseq_set_notify_resume(t); + } +} +static inline void rseq_execve(struct task_struct *t) +{ + t->rseq = NULL; + t->rseq_event_counter = 0; +} +static inline void rseq_sched_out(struct task_struct *t) +{ + rseq_set_notify_resume(t); +} +static inline void rseq_signal_deliver(struct pt_regs *regs) +{ + rseq_handle_notify_resume(regs); +} +#else +static inline void rseq_set_notify_resume(struct task_struct *t) +{ +} +static inline void rseq_handle_notify_resume(struct pt_regs *regs) +{ +} +static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) +{ +} +static inline void rseq_execve(struct task_struct *t) +{ +} +static inline void rseq_sched_out(struct task_struct *t) +{ +} +static inline void rseq_signal_deliver(struct pt_regs *regs) +{ +} +#endif + #endif diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 8bdae34d1f9a..2e64fb826193 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -403,6 +403,7 @@ header-y += tcp_metrics.h header-y += telephony.h header-y += termios.h header-y += thermal.h +header-y += rseq.h header-y += time.h header-y += times.h header-y += timex.h diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h new file mode 100644 index 000000000000..3e79fa9482b2 --- /dev/null +++ b/include/uapi/linux/rseq.h @@ -0,0 +1,85 @@ +#ifndef _UAPI_LINUX_RSEQ_H +#define _UAPI_LINUX_RSEQ_H + +/* + * linux/rseq.h + * + * Restartable sequences system call API + * + * Copyright (c) 2015-2016 Mathieu Desnoyers + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifdef __KERNEL__ +# include +#else /* #ifdef __KERNEL__ */ +# include +#endif /* #else #ifdef __KERNEL__ */ + +#include + +#ifdef __LP64__ +# define RSEQ_FIELD_u32_u64(field) uint64_t field +#elif defined(__BYTE_ORDER) ? 
\ + __BYTE_ORDER == __BIG_ENDIAN : defined(__BIG_ENDIAN) +# define RSEQ_FIELD_u32_u64(field) uint32_t _padding ## field, field +#else +# define RSEQ_FIELD_u32_u64(field) uint32_t field, _padding ## field +#endif + +struct rseq_cs { + RSEQ_FIELD_u32_u64(start_ip); + RSEQ_FIELD_u32_u64(post_commit_ip); + RSEQ_FIELD_u32_u64(abort_ip); +} __attribute__((aligned(sizeof(uint64_t)))); + +struct rseq { + union { + struct { + /* + * Restartable sequences cpu_id field. + * Updated by the kernel, and read by user-space with + * single-copy atomicity semantics. Aligned on 32-bit. + * Negative values are reserved for user-space. + */ + int32_t cpu_id; + /* + * Restartable sequences event_counter field. + * Updated by the kernel, and read by user-space with + * single-copy atomicity semantics. Aligned on 32-bit. + */ + uint32_t event_counter; + } e; + /* + * On architectures with 64-bit aligned reads, both cpu_id and + * event_counter can be read with single-copy atomicity + * semantics. + */ + uint64_t v; + } u; + /* + * Restartable sequences rseq_cs field. + * Updated by user-space, read by the kernel with + * single-copy atomicity semantics. Aligned on 64-bit. + */ + RSEQ_FIELD_u32_u64(rseq_cs); +} __attribute__((aligned(sizeof(uint64_t)))); + +#endif /* _UAPI_LINUX_RSEQ_H */ diff --git a/init/Kconfig b/init/Kconfig index c02d89777713..545b7eda1b33 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1653,6 +1653,19 @@ config MEMBARRIER If unsure, say Y. +config RSEQ + bool "Enable rseq() system call" if EXPERT + default y + depends on HAVE_RSEQ + help + Enable the restartable sequences system call. It provides a + user-space cache for the current CPU number value, which + speeds up getting the current CPU number from user-space, + as well as an ABI to speed up user-space operations on + per-CPU data. + + If unsure, say Y. + config EMBEDDED bool "Embedded system" option allnoconfig_y diff --git a/kernel/Makefile b/kernel/Makefile index e2ec54e2b952..4c6d8b5ad2bf 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -112,6 +112,7 @@ obj-$(CONFIG_TORTURE_TEST) += torture.o obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_HAS_IOMEM) += memremap.o +obj-$(CONFIG_RSEQ) += rseq.o $(obj)/configs.o: $(obj)/config_data.h diff --git a/kernel/fork.c b/kernel/fork.c index 4a7ec0c6c88c..cc7756b25b0a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1591,6 +1591,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, */ copy_seccomp(p); + rseq_fork(p, clone_flags); + /* * Process group and session signals need to be delivered to just the * parent before the fork or both the parent and the child after the diff --git a/kernel/rseq.c b/kernel/rseq.c new file mode 100644 index 000000000000..e1c847bf9910 --- /dev/null +++ b/kernel/rseq.c @@ -0,0 +1,231 @@ +/* + * Restartable sequences system call + * + * Restartable sequences are a lightweight interface that allows + * user-level code to be executed atomically relative to scheduler + * preemption and signal delivery. Typically used for implementing + * per-cpu operations. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * Copyright (C) 2015, Google, Inc., + * Paul Turner and Andrew Hunter + * Copyright (C) 2015-2016, EfficiOS Inc., + * Mathieu Desnoyers + */ + +#include +#include +#include +#include +#include +#include + +/* + * Each restartable sequence assembly block defines a "struct rseq_cs" + * structure which describes the post_commit_ip address, and the + * abort_ip address where the kernel should move the thread instruction + * pointer if a rseq critical section assembly block is preempted or if + * a signal is delivered on top of a rseq critical section assembly + * block. It also contains a start_ip, which is the address of the start + * of the rseq assembly block, which is useful to debuggers. + * + * The algorithm for a restartable sequence assembly block is as + * follows: + * + * rseq_start() + * + * 0. Userspace loads the current event counter value from the + * event_counter field of the registered struct rseq TLS area, + * + * rseq_finish() + * + * Steps [1]-[3] (inclusive) need to be a sequence of instructions in + * userspace that can handle being moved to the abort_ip between any + * of those instructions. + * + * The abort_ip address needs to be equal or above the post_commit_ip. + * Step [4] and the failure code step [F1] need to be at addresses + * equal or above the post_commit_ip. + * + * 1. Userspace stores the address of the struct rseq cs rseq + * assembly block descriptor into the rseq_cs field of the + * registered struct rseq TLS area. + * + * 2. Userspace tests to see whether the current event counter values + * match those loaded at [0]. Manually jumping to [F1] in case of + * a mismatch. + * + * Note that if we are preempted or interrupted by a signal + * after [1] and before post_commit_ip, then the kernel also + * performs the comparison performed in [2], and conditionally + * clears rseq_cs, then jumps us to abort_ip. + * + * 3. Userspace critical section final instruction before + * post_commit_ip is the commit. The critical section is + * self-terminating. + * [post_commit_ip] + * + * 4. Userspace clears the rseq_cs field of the struct rseq + * TLS area. + * + * 5. Return true. + * + * On failure at [2]: + * + * F1. Userspace clears the rseq_cs field of the struct rseq + * TLS area. Followed by step [F2]. + * + * [abort_ip] + * F2. Return false. 
+ */ + +static int rseq_increment_event_counter(struct task_struct *t) +{ + if (__put_user(++t->rseq_event_counter, + &t->rseq->u.e.event_counter)) + return -1; + return 0; +} + +static int rseq_get_rseq_cs(struct task_struct *t, + void __user **post_commit_ip, + void __user **abort_ip) +{ + unsigned long ptr; + struct rseq_cs __user *rseq_cs; + + if (__get_user(ptr, &t->rseq->rseq_cs)) + return -1; + if (!ptr) + return 0; +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) { + rseq_cs = compat_ptr((compat_uptr_t)ptr); + if (get_user(ptr, &rseq_cs->post_commit_ip)) + return -1; + *post_commit_ip = compat_ptr((compat_uptr_t)ptr); + if (get_user(ptr, &rseq_cs->abort_ip)) + return -1; + *abort_ip = compat_ptr((compat_uptr_t)ptr); + return 0; + } +#endif + rseq_cs = (struct rseq_cs __user *)ptr; + if (get_user(ptr, &rseq_cs->post_commit_ip)) + return -1; + *post_commit_ip = (void __user *)ptr; + if (get_user(ptr, &rseq_cs->abort_ip)) + return -1; + *abort_ip = (void __user *)ptr; + return 0; +} + +static int rseq_ip_fixup(struct pt_regs *regs) +{ + struct task_struct *t = current; + void __user *post_commit_ip = NULL; + void __user *abort_ip = NULL; + + if (rseq_get_rseq_cs(t, &post_commit_ip, &abort_ip)) + return -1; + + /* Handle potentially being within a critical section. */ + if ((void __user *)instruction_pointer(regs) < post_commit_ip) { + /* + * We need to clear rseq_cs upon entry into a signal + * handler nested on top of a rseq assembly block, so + * the signal handler will not be fixed up if itself + * interrupted by a nested signal handler or preempted. + */ + if (clear_user(&t->rseq->rseq_cs, + sizeof(t->rseq->rseq_cs))) + return -1; + + /* + * We set this after potentially failing in + * clear_user so that the signal arrives at the + * faulting rip. + */ + instruction_pointer_set(regs, (unsigned long)abort_ip); + } + return 0; +} + +/* + * This resume handler should always be executed between any of: + * - preemption, + * - signal delivery, + * and return to user-space. + */ +void __rseq_handle_notify_resume(struct pt_regs *regs) +{ + struct task_struct *t = current; + + if (unlikely(t->flags & PF_EXITING)) + return; + if (!access_ok(VERIFY_WRITE, t->rseq, sizeof(*t->rseq))) + goto error; + if (__put_user(raw_smp_processor_id(), &t->rseq->u.e.cpu_id)) + goto error; + if (rseq_increment_event_counter(t)) + goto error; + if (rseq_ip_fixup(regs)) + goto error; + return; + +error: + force_sig(SIGSEGV, t); +} + +/* + * sys_rseq - setup restartable sequences for caller thread. + */ +SYSCALL_DEFINE2(rseq, struct rseq __user *, rseq, int, flags) +{ + if (unlikely(flags)) + return -EINVAL; + if (!rseq) { + if (!current->rseq) + return -ENOENT; + return 0; + } + + if (current->rseq) { + /* + * If rseq is already registered, check whether + * the provided address differs from the prior + * one. + */ + if (current->rseq != rseq) + return -EBUSY; + } else { + /* + * If there was no rseq previously registered, + * we need to ensure the provided rseq is + * properly aligned and valid. + */ + if (!IS_ALIGNED((unsigned long)rseq, sizeof(uint64_t))) + return -EINVAL; + if (!access_ok(VERIFY_WRITE, rseq, sizeof(*rseq))) + return -EFAULT; + current->rseq = rseq; + /* + * If rseq was previously inactive, and has just + * been registered, ensure the cpu_id and + * event_counter fields are updated before + * returning to user-space. 
+ */ + rseq_set_notify_resume(current); + } + + return 0; +} diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 51d7105f529a..fbef0c3ab04a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2664,6 +2664,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, { sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); + rseq_sched_out(prev); fire_sched_out_preempt_notifiers(prev, next); prepare_lock_switch(rq, next); prepare_arch_switch(next); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 2c5e3a8e00d7..c653f78df402 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -250,3 +250,6 @@ cond_syscall(sys_execveat); /* membarrier */ cond_syscall(sys_membarrier); + +/* restartable sequence */ +cond_syscall(sys_rseq); -- 2.34.1 From a881b254fb838d6f7dea4ace01ee12c941c6a4f2 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 6 Jun 2016 09:37:36 -0400 Subject: [PATCH 11/16] tracing: instrument restartable sequences Signed-off-by: Mathieu Desnoyers CC: Thomas Gleixner CC: Paul Turner CC: Andrew Hunter CC: Peter Zijlstra CC: Andy Lutomirski CC: Andi Kleen CC: Dave Watson CC: Chris Lameter CC: Ingo Molnar CC: "H. Peter Anvin" CC: Ben Maurer CC: Steven Rostedt CC: "Paul E. McKenney" CC: Josh Triplett CC: Linus Torvalds CC: Andrew Morton CC: Russell King CC: Catalin Marinas CC: Will Deacon CC: Michael Kerrisk CC: Boqun Feng CC: linux-api@vger.kernel.org --- include/trace/events/rseq.h | 60 +++++++++++++++++++++++++++++++++++++ kernel/rseq.c | 18 +++++++++-- 2 files changed, 75 insertions(+), 3 deletions(-) create mode 100644 include/trace/events/rseq.h diff --git a/include/trace/events/rseq.h b/include/trace/events/rseq.h new file mode 100644 index 000000000000..83fd31e09b2f --- /dev/null +++ b/include/trace/events/rseq.h @@ -0,0 +1,60 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rseq + +#if !defined(_TRACE_RSEQ_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_RSEQ_H + +#include + +TRACE_EVENT(rseq_inc, + + TP_PROTO(uint32_t event_counter, int ret), + + TP_ARGS(event_counter, ret), + + TP_STRUCT__entry( + __field(uint32_t, event_counter) + __field(int, ret) + ), + + TP_fast_assign( + __entry->event_counter = event_counter; + __entry->ret = ret; + ), + + TP_printk("event_counter=%u ret=%d", + __entry->event_counter, __entry->ret) +); + +TRACE_EVENT(rseq_ip_fixup, + + TP_PROTO(void __user *regs_ip, void __user *post_commit_ip, + void __user *abort_ip, uint32_t kevcount, int ret), + + TP_ARGS(regs_ip, post_commit_ip, abort_ip, kevcount, ret), + + TP_STRUCT__entry( + __field(void __user *, regs_ip) + __field(void __user *, post_commit_ip) + __field(void __user *, abort_ip) + __field(uint32_t, kevcount) + __field(int, ret) + ), + + TP_fast_assign( + __entry->regs_ip = regs_ip; + __entry->post_commit_ip = post_commit_ip; + __entry->abort_ip = abort_ip; + __entry->kevcount = kevcount; + __entry->ret = ret; + ), + + TP_printk("regs_ip=%p post_commit_ip=%p abort_ip=%p kevcount=%u ret=%d", + __entry->regs_ip, __entry->post_commit_ip, __entry->abort_ip, + __entry->kevcount, __entry->ret) +); + +#endif /* _TRACE_SOCK_H */ + +/* This part must be outside protection */ +#include diff --git a/kernel/rseq.c b/kernel/rseq.c index e1c847bf9910..cab326a85846 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -29,6 +29,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include + /* * Each restartable sequence assembly block defines a "struct rseq_cs" * structure which describes the post_commit_ip address, and the @@ -90,8 +93,12 @@ static int 
rseq_increment_event_counter(struct task_struct *t) { - if (__put_user(++t->rseq_event_counter, - &t->rseq->u.e.event_counter)) + int ret; + + ret = __put_user(++t->rseq_event_counter, + &t->rseq->u.e.event_counter); + trace_rseq_inc(t->rseq_event_counter, ret); + if (ret) return -1; return 0; } @@ -134,8 +141,13 @@ static int rseq_ip_fixup(struct pt_regs *regs) struct task_struct *t = current; void __user *post_commit_ip = NULL; void __user *abort_ip = NULL; + int ret; - if (rseq_get_rseq_cs(t, &post_commit_ip, &abort_ip)) + ret = rseq_get_rseq_cs(t, &post_commit_ip, &abort_ip); + trace_rseq_ip_fixup((void __user *)instruction_pointer(regs), + post_commit_ip, abort_ip, t->rseq_event_counter, + ret); + if (ret) return -1; /* Handle potentially being within a critical section. */ -- 2.34.1 From b1b1d89ff6ff8c9dceabb755d054c7b70f1157da Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 27 Jan 2016 17:09:24 -0500 Subject: [PATCH 12/16] Restartable sequences: ARM 32 architecture support Call the rseq_handle_notify_resume() function on return to userspace if TIF_NOTIFY_RESUME thread flag is set. Increment the event counter and perform fixup on the pre-signal frame when a signal is delivered on top of a restartable sequence critical section. Signed-off-by: Mathieu Desnoyers CC: Russell King CC: Catalin Marinas CC: Will Deacon CC: Thomas Gleixner CC: Paul Turner CC: Andrew Hunter CC: Peter Zijlstra CC: Andy Lutomirski CC: Andi Kleen CC: Dave Watson CC: Chris Lameter CC: Ingo Molnar CC: Ben Maurer CC: Steven Rostedt CC: "Paul E. McKenney" CC: Josh Triplett CC: Linus Torvalds CC: Andrew Morton CC: Boqun Feng CC: linux-api@vger.kernel.org --- arch/arm/Kconfig | 1 + arch/arm/kernel/signal.c | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 90542db1220d..636e14b513bf 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -75,6 +75,7 @@ config ARM select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE if (SMP && ARM_LPAE) select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_RSEQ select HAVE_SYSCALL_TRACEPOINTS select HAVE_UID16 select HAVE_VIRT_CPU_ACCOUNTING_GEN diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 7b8f2141427b..907da02a6333 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -474,6 +474,12 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) sigset_t *oldset = sigmask_to_save(); int ret; + /* + * Increment event counter and perform fixup for the pre-signal + * frame. + */ + rseq_signal_deliver(regs); + /* * Set up the stack frame */ @@ -594,6 +600,7 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall) } else { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); + rseq_handle_notify_resume(regs); } } local_irq_disable(); -- 2.34.1 From 043def5bd3a942710b2a29bd143edb201cb79b35 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 27 Jan 2016 17:10:44 -0500 Subject: [PATCH 13/16] Restartable sequences: wire up ARM 32 system call Wire up the rseq system call on 32-bit ARM. This provides an ABI improving the speed of a user-space getcpu operation on ARM by skipping the getcpu system call on the fast path, as well as improving the speed of user-space operations on per-cpu data compared to using load-linked/store-conditional. 
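With the syscall table entry in place, registration from user space needs nothing beyond syscall(2) (an illustrative sketch following the man page example above; __NR_rseq comes from the uapi header updated by this patch and struct rseq from the exported linux/rseq.h):

        #define _GNU_SOURCE
        #include <unistd.h>
        #include <sys/syscall.h>
        #include <linux/rseq.h>

        static __thread volatile struct rseq rseq_area;

        static int register_rseq(void)
        {
                /* flags must currently be 0; returns 0 on success, -1 with errno set. */
                return syscall(__NR_rseq, &rseq_area, 0);
        }
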
Signed-off-by: Mathieu Desnoyers CC: Russell King CC: Catalin Marinas CC: Will Deacon CC: Thomas Gleixner CC: Paul Turner CC: Andrew Hunter CC: Peter Zijlstra CC: Andy Lutomirski CC: Andi Kleen CC: Dave Watson CC: Chris Lameter CC: Ingo Molnar CC: Ben Maurer CC: Steven Rostedt CC: "Paul E. McKenney" CC: Josh Triplett CC: Linus Torvalds CC: Andrew Morton CC: Boqun Feng CC: linux-api@vger.kernel.org --- arch/arm/include/uapi/asm/unistd.h | 1 + arch/arm/kernel/calls.S | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/arm/include/uapi/asm/unistd.h b/arch/arm/include/uapi/asm/unistd.h index 2cb9dc770e1d..8f61c79fc7ce 100644 --- a/arch/arm/include/uapi/asm/unistd.h +++ b/arch/arm/include/uapi/asm/unistd.h @@ -420,6 +420,7 @@ #define __NR_copy_file_range (__NR_SYSCALL_BASE+391) #define __NR_preadv2 (__NR_SYSCALL_BASE+392) #define __NR_pwritev2 (__NR_SYSCALL_BASE+393) +#define __NR_rseq (__NR_SYSCALL_BASE+394) /* * The following SWIs are ARM private. diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S index 703fa0f3cd8f..0865c04376df 100644 --- a/arch/arm/kernel/calls.S +++ b/arch/arm/kernel/calls.S @@ -403,6 +403,7 @@ CALL(sys_copy_file_range) CALL(sys_preadv2) CALL(sys_pwritev2) + CALL(sys_rseq) #ifndef syscalls_counted .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls #define syscalls_counted -- 2.34.1 From b29fe5414af361e6017d20b695f30464ad021146 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 27 Jan 2016 17:12:07 -0500 Subject: [PATCH 14/16] Restartable sequences: x86 32/64 architecture support Call the rseq_handle_notify_resume() function on return to userspace if TIF_NOTIFY_RESUME thread flag is set. Increment the event counter and perform fixup on the pre-signal frame when a signal is delivered on top of a restartable sequence critical section. Signed-off-by: Mathieu Desnoyers CC: Russell King CC: Catalin Marinas CC: Will Deacon CC: Thomas Gleixner CC: Paul Turner CC: Andrew Hunter CC: Peter Zijlstra CC: Andy Lutomirski CC: Andi Kleen CC: Dave Watson CC: Chris Lameter CC: Ingo Molnar CC: "H. Peter Anvin" CC: Ben Maurer CC: Steven Rostedt CC: "Paul E. 
McKenney" CC: Josh Triplett CC: Linus Torvalds CC: Andrew Morton CC: Boqun Feng CC: linux-api@vger.kernel.org --- arch/x86/Kconfig | 1 + arch/x86/entry/common.c | 1 + arch/x86/kernel/signal.c | 6 ++++++ 3 files changed, 8 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d9a94da0c29f..1db7b06fc010 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -140,6 +140,7 @@ config X86 select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_RSEQ select HAVE_SYSCALL_TRACEPOINTS select HAVE_UID16 if X86_32 || IA32_EMULATION select HAVE_UNSTABLE_SCHED_CLOCK diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index ec138e538c44..3877dbafba08 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -231,6 +231,7 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags) if (cached_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); + rseq_handle_notify_resume(regs); } if (cached_flags & _TIF_USER_RETURN_NOTIFY) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 22cc2f9f8aec..0f4da5a4bdf2 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -683,6 +683,12 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) sigset_t *set = sigmask_to_save(); compat_sigset_t *cset = (compat_sigset_t *) set; + /* + * Increment event counter and perform fixup for the pre-signal + * frame. + */ + rseq_signal_deliver(regs); + /* Set up the stack frame */ if (is_ia32_frame()) { if (ksig->ka.sa.sa_flags & SA_SIGINFO) -- 2.34.1 From c2b758348b125e7e5be2035febe7fe58b25e06f7 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 27 Jan 2016 17:13:45 -0500 Subject: [PATCH 15/16] Restartable sequences: wire up x86 32/64 system call Wire up the rseq system call on x86 32/64. This provides an ABI improving the speed of a user-space getcpu operation on x86 by removing the need to perform a function call, "lsl" instruction, or system call on the fast path, as well as improving the speed of user-space operations on per-cpu data. Signed-off-by: Mathieu Desnoyers CC: Russell King CC: Catalin Marinas CC: Will Deacon CC: Thomas Gleixner CC: Paul Turner CC: Andrew Hunter CC: Peter Zijlstra CC: Andy Lutomirski CC: Andi Kleen CC: Dave Watson CC: Chris Lameter CC: Ingo Molnar CC: "H. Peter Anvin" CC: Ben Maurer CC: Steven Rostedt CC: "Paul E. 
McKenney" CC: Josh Triplett CC: Linus Torvalds CC: Andrew Morton CC: Boqun Feng CC: linux-api@vger.kernel.org --- arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 4cddd17153fb..15fb98c4b4b0 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -386,3 +386,4 @@ 377 i386 copy_file_range sys_copy_file_range 378 i386 preadv2 sys_preadv2 compat_sys_preadv2 379 i386 pwritev2 sys_pwritev2 compat_sys_pwritev2 +380 i386 rseq sys_rseq diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 555263e385c9..c7f3c7e98ad8 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -335,6 +335,7 @@ 326 common copy_file_range sys_copy_file_range 327 64 preadv2 sys_preadv2 328 64 pwritev2 sys_pwritev2 +329 common rseq sys_rseq # # x32-specific system call numbers start at 512 to avoid cache impact -- 2.34.1 From be4663d494bce0b60b26203695dc7bcc3dc1bc36 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 30 May 2016 09:46:53 +0200 Subject: [PATCH 16/16] Restartable sequences: self-tests Implements two basic tests of RSEQ functionality, and one more exhaustive parameterizable test. The first, "basic_test" only asserts that RSEQ works moderately correctly. E.g. that: - The CPUID pointer works - Code infinitely looping within a critical section will eventually be interrupted. - Critical sections are interrupted by signals. "basic_percpu_ops_test" is a slightly more "realistic" variant, implementing a few simple per-cpu operations and testing their correctness. "param_test" is a parametrizable restartable sequences test. See the "--help" output for usage. As part of those tests, a helper library "rseq" implements a user-space API around restartable sequences. It takes care of ensuring progress in case of debugger single-stepping with a fall-back to locking, and exposes the instruction pointer addresses where the rseq assembly blocks begin and end, as well as the associated abort instruction pointer, in the __rseq_table section. This section allows debuggers may know where to place breakpoints when single-stepping through assembly blocks which may be aborted at any point by the kernel. Signed-off-by: Mathieu Desnoyers CC: Russell King CC: Catalin Marinas CC: Will Deacon CC: Thomas Gleixner CC: Paul Turner CC: Andrew Hunter CC: Peter Zijlstra CC: Andy Lutomirski CC: Andi Kleen CC: Dave Watson CC: Chris Lameter CC: Ingo Molnar CC: "H. Peter Anvin" CC: Ben Maurer CC: Steven Rostedt CC: "Paul E. 
McKenney" CC: Josh Triplett CC: Linus Torvalds CC: Andrew Morton CC: Boqun Feng CC: linux-api@vger.kernel.org --- tools/testing/selftests/rseq/.gitignore | 3 + tools/testing/selftests/rseq/Makefile | 13 + .../selftests/rseq/basic_percpu_ops_test.c | 279 +++++++ tools/testing/selftests/rseq/basic_test.c | 106 +++ tools/testing/selftests/rseq/param_test.c | 707 ++++++++++++++++++ tools/testing/selftests/rseq/rseq.c | 200 +++++ tools/testing/selftests/rseq/rseq.h | 449 +++++++++++ 7 files changed, 1757 insertions(+) create mode 100644 tools/testing/selftests/rseq/.gitignore create mode 100644 tools/testing/selftests/rseq/Makefile create mode 100644 tools/testing/selftests/rseq/basic_percpu_ops_test.c create mode 100644 tools/testing/selftests/rseq/basic_test.c create mode 100644 tools/testing/selftests/rseq/param_test.c create mode 100644 tools/testing/selftests/rseq/rseq.c create mode 100644 tools/testing/selftests/rseq/rseq.h diff --git a/tools/testing/selftests/rseq/.gitignore b/tools/testing/selftests/rseq/.gitignore new file mode 100644 index 000000000000..2596e26bcf0a --- /dev/null +++ b/tools/testing/selftests/rseq/.gitignore @@ -0,0 +1,3 @@ +basic_percpu_ops_test +basic_test +param_test diff --git a/tools/testing/selftests/rseq/Makefile b/tools/testing/selftests/rseq/Makefile new file mode 100644 index 000000000000..3d1ad8eceb73 --- /dev/null +++ b/tools/testing/selftests/rseq/Makefile @@ -0,0 +1,13 @@ +CFLAGS += -O2 -Wall -g -I../../../../usr/include/ +LDFLAGS += -lpthread + +TESTS = basic_test basic_percpu_ops_test param_test + +all: $(TESTS) +%: %.c rseq.h rseq.c + $(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS) + +include ../lib.mk + +clean: + $(RM) $(TESTS) diff --git a/tools/testing/selftests/rseq/basic_percpu_ops_test.c b/tools/testing/selftests/rseq/basic_percpu_ops_test.c new file mode 100644 index 000000000000..4667dc50fc4c --- /dev/null +++ b/tools/testing/selftests/rseq/basic_percpu_ops_test.c @@ -0,0 +1,279 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include + +#include "rseq.h" + +static struct rseq_lock rseq_lock; + +struct percpu_lock_entry { + intptr_t v; +} __attribute__((aligned(128))); + +struct percpu_lock { + struct percpu_lock_entry c[CPU_SETSIZE]; +}; + +struct test_data_entry { + int count; +} __attribute__((aligned(128))); + +struct spinlock_test_data { + struct percpu_lock lock; + struct test_data_entry c[CPU_SETSIZE]; + int reps; +}; + +struct percpu_list_node { + intptr_t data; + struct percpu_list_node *next; +}; + +struct percpu_list_entry { + struct percpu_list_node *head; +} __attribute__((aligned(128))); + +struct percpu_list { + struct percpu_list_entry c[CPU_SETSIZE]; +}; + +/* A simple percpu spinlock. Returns the cpu lock was acquired on. */ +int rseq_percpu_lock(struct percpu_lock *lock) +{ + struct rseq_state rseq_state; + intptr_t *targetptr, newval; + int cpu; + bool result; + + for (;;) { + do_rseq(&rseq_lock, rseq_state, cpu, result, targetptr, newval, + { + if (unlikely(lock->c[cpu].v)) { + result = false; + } else { + newval = 1; + targetptr = (intptr_t *)&lock->c[cpu].v; + } + }); + if (likely(result)) + break; + } + /* + * Acquire semantic when taking lock after control dependency. + * Matches smp_store_release(). + */ + smp_acquire__after_ctrl_dep(); + return cpu; +} + +void rseq_percpu_unlock(struct percpu_lock *lock, int cpu) +{ + assert(lock->c[cpu].v == 1); + /* + * Release lock, with release semantic. Matches + * smp_acquire__after_ctrl_dep(). 
+ */ + smp_store_release(&lock->c[cpu].v, 0); +} + +void *test_percpu_spinlock_thread(void *arg) +{ + struct spinlock_test_data *data = arg; + int i, cpu; + + if (rseq_init_current_thread()) + abort(); + for (i = 0; i < data->reps; i++) { + cpu = rseq_percpu_lock(&data->lock); + data->c[cpu].count++; + rseq_percpu_unlock(&data->lock, cpu); + } + + return NULL; +} + +/* + * A simple test which implements a sharded counter using a per-cpu + * lock. Obviously real applications might prefer to simply use a + * per-cpu increment; however, this is reasonable for a test and the + * lock can be extended to synchronize more complicated operations. + */ +void test_percpu_spinlock(void) +{ + const int num_threads = 200; + int i, sum; + pthread_t test_threads[num_threads]; + struct spinlock_test_data data; + + memset(&data, 0, sizeof(data)); + data.reps = 5000; + + for (i = 0; i < num_threads; i++) + pthread_create(&test_threads[i], NULL, + test_percpu_spinlock_thread, &data); + + for (i = 0; i < num_threads; i++) + pthread_join(test_threads[i], NULL); + + sum = 0; + for (i = 0; i < CPU_SETSIZE; i++) + sum += data.c[i].count; + + assert(sum == data.reps * num_threads); +} + +int percpu_list_push(struct percpu_list *list, struct percpu_list_node *node) +{ + struct rseq_state rseq_state; + intptr_t *targetptr, newval; + int cpu; + bool result; + + do_rseq(&rseq_lock, rseq_state, cpu, result, targetptr, newval, + { + newval = (intptr_t)node; + targetptr = (intptr_t *)&list->c[cpu].head; + node->next = list->c[cpu].head; + }); + + return cpu; +} + +/* + * Unlike a traditional lock-less linked list; the availability of a + * rseq primitive allows us to implement pop without concerns over + * ABA-type races. + */ +struct percpu_list_node *percpu_list_pop(struct percpu_list *list) +{ + struct percpu_list_node *head, *next; + struct rseq_state rseq_state; + intptr_t *targetptr, newval; + int cpu; + bool result; + + do_rseq(&rseq_lock, rseq_state, cpu, result, targetptr, newval, + { + head = list->c[cpu].head; + if (!head) { + result = false; + } else { + next = head->next; + newval = (intptr_t) next; + targetptr = (intptr_t *)&list->c[cpu].head; + } + }); + + return head; +} + +void *test_percpu_list_thread(void *arg) +{ + int i; + struct percpu_list *list = (struct percpu_list *)arg; + + if (rseq_init_current_thread()) + abort(); + + for (i = 0; i < 100000; i++) { + struct percpu_list_node *node = percpu_list_pop(list); + + sched_yield(); /* encourage shuffling */ + if (node) + percpu_list_push(list, node); + } + + return NULL; +} + +/* Simultaneous modification to a per-cpu linked list from many threads. */ +void test_percpu_list(void) +{ + int i, j; + long sum = 0, expected_sum = 0; + struct percpu_list list; + pthread_t test_threads[200]; + cpu_set_t allowed_cpus; + + memset(&list, 0, sizeof(list)); + + /* Generate list entries for every usable cpu. 
*/ + sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus); + for (i = 0; i < CPU_SETSIZE; i++) { + if (!CPU_ISSET(i, &allowed_cpus)) + continue; + for (j = 1; j <= 100; j++) { + struct percpu_list_node *node; + + expected_sum += j; + + node = malloc(sizeof(*node)); + assert(node); + node->data = j; + node->next = list.c[i].head; + list.c[i].head = node; + } + } + + for (i = 0; i < 200; i++) + assert(pthread_create(&test_threads[i], NULL, + test_percpu_list_thread, &list) == 0); + + for (i = 0; i < 200; i++) + pthread_join(test_threads[i], NULL); + + for (i = 0; i < CPU_SETSIZE; i++) { + cpu_set_t pin_mask; + struct percpu_list_node *node; + + if (!CPU_ISSET(i, &allowed_cpus)) + continue; + + CPU_ZERO(&pin_mask); + CPU_SET(i, &pin_mask); + sched_setaffinity(0, sizeof(pin_mask), &pin_mask); + + while ((node = percpu_list_pop(&list))) { + sum += node->data; + free(node); + } + } + + /* + * All entries should now be accounted for (unless some external + * actor is interfering with our allowed affinity while this + * test is running). + */ + assert(sum == expected_sum); +} + +int main(int argc, char **argv) +{ + if (rseq_init_lock(&rseq_lock)) { + perror("rseq_init_lock"); + return -1; + } + if (rseq_init_current_thread()) + goto error; + printf("spinlock\n"); + test_percpu_spinlock(); + printf("percpu_list\n"); + test_percpu_list(); + + if (rseq_destroy_lock(&rseq_lock)) { + perror("rseq_destroy_lock"); + return -1; + } + return 0; + +error: + if (rseq_destroy_lock(&rseq_lock)) + perror("rseq_destroy_lock"); + return -1; +} + diff --git a/tools/testing/selftests/rseq/basic_test.c b/tools/testing/selftests/rseq/basic_test.c new file mode 100644 index 000000000000..e8fdcd6ed51c --- /dev/null +++ b/tools/testing/selftests/rseq/basic_test.c @@ -0,0 +1,106 @@ +/* + * Basic test coverage for critical regions and rseq_current_cpu(). + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include + +#include "rseq.h" + +volatile int signals_delivered; +volatile __thread struct rseq_state sigtest_start; +static struct rseq_lock rseq_lock; + +void test_cpu_pointer(void) +{ + cpu_set_t affinity, test_affinity; + int i; + + sched_getaffinity(0, sizeof(affinity), &affinity); + CPU_ZERO(&test_affinity); + for (i = 0; i < CPU_SETSIZE; i++) { + if (CPU_ISSET(i, &affinity)) { + CPU_SET(i, &test_affinity); + sched_setaffinity(0, sizeof(test_affinity), + &test_affinity); + assert(rseq_current_cpu() == sched_getcpu()); + assert(rseq_current_cpu() == i); + CPU_CLR(i, &test_affinity); + } + } + sched_setaffinity(0, sizeof(affinity), &affinity); +} + +/* + * This depends solely on some environmental event triggering a counter + * increase. + */ +void test_critical_section(void) +{ + struct rseq_state start; + uint32_t event_counter; + + start = rseq_start(&rseq_lock); + event_counter = start.event_counter; + do { + start = rseq_start(&rseq_lock); + } while (start.event_counter == event_counter); +} + +void test_signal_interrupt_handler(int signo) +{ + struct rseq_state current; + + current = rseq_start(&rseq_lock); + /* + * The potential critical section bordered by 'start' must be + * invalid. 
+ */ + assert(current.event_counter != sigtest_start.event_counter); + signals_delivered++; +} + +void test_signal_interrupts(void) +{ + struct itimerval it = { { 0, 1 }, { 0, 1 } }; + + setitimer(ITIMER_PROF, &it, NULL); + signal(SIGPROF, test_signal_interrupt_handler); + + do { + sigtest_start = rseq_start(&rseq_lock); + } while (signals_delivered < 10); + setitimer(ITIMER_PROF, NULL, NULL); +} + +int main(int argc, char **argv) +{ + if (rseq_init_lock(&rseq_lock)) { + perror("rseq_init_lock"); + return -1; + } + if (rseq_init_current_thread()) + goto init_thread_error; + printf("testing current cpu\n"); + test_cpu_pointer(); + printf("testing critical section\n"); + test_critical_section(); + printf("testing critical section is interrupted by signal\n"); + test_signal_interrupts(); + + if (rseq_destroy_lock(&rseq_lock)) { + perror("rseq_destroy_lock"); + return -1; + } + return 0; + +init_thread_error: + if (rseq_destroy_lock(&rseq_lock)) + perror("rseq_destroy_lock"); + return -1; +} diff --git a/tools/testing/selftests/rseq/param_test.c b/tools/testing/selftests/rseq/param_test.c new file mode 100644 index 000000000000..f95fba5a1b2a --- /dev/null +++ b/tools/testing/selftests/rseq/param_test.c @@ -0,0 +1,707 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static inline pid_t gettid(void) +{ + return syscall(__NR_gettid); +} + +#define NR_INJECT 9 +static int loop_cnt[NR_INJECT + 1]; + +static int opt_modulo; + +static int opt_yield, opt_signal, opt_sleep, opt_fallback_cnt = 3, + opt_disable_rseq, opt_threads = 200, + opt_reps = 5000, opt_disable_mod = 0, opt_test = 's'; + +static __thread unsigned int signals_delivered; + +static struct rseq_lock rseq_lock; + +#ifndef BENCHMARK + +static __thread unsigned int yield_mod_cnt, nr_retry; + +#define printf_nobench(fmt, ...) printf(fmt, ## __VA_ARGS__) + +#define RSEQ_INJECT_INPUT \ + , [loop_cnt_1]"m"(loop_cnt[1]) \ + , [loop_cnt_2]"m"(loop_cnt[2]) \ + , [loop_cnt_3]"m"(loop_cnt[3]) \ + , [loop_cnt_4]"m"(loop_cnt[4]) + +#if defined(__x86_64__) || defined(__i386__) + +#define INJECT_ASM_REG "eax" + +#define RSEQ_INJECT_CLOBBER \ + , INJECT_ASM_REG + +#define RSEQ_INJECT_ASM(n) \ + "mov %[loop_cnt_" #n "], %%" INJECT_ASM_REG "\n\t" \ + "test %%" INJECT_ASM_REG ",%%" INJECT_ASM_REG "\n\t" \ + "jz 333f\n\t" \ + "222:\n\t" \ + "dec %%" INJECT_ASM_REG "\n\t" \ + "jnz 222b\n\t" \ + "333:\n\t" + +#elif defined(__ARMEL__) + +#define INJECT_ASM_REG "r4" + +#define RSEQ_INJECT_CLOBBER \ + , INJECT_ASM_REG + +#define RSEQ_INJECT_ASM(n) \ + "ldr " INJECT_ASM_REG ", %[loop_cnt_" #n "]\n\t" \ + "cmp " INJECT_ASM_REG ", #0\n\t" \ + "beq 333f\n\t" \ + "222:\n\t" \ + "subs " INJECT_ASM_REG ", #1\n\t" \ + "bne 222b\n\t" \ + "333:\n\t" + +#else +#error unsupported target +#endif + +#define RSEQ_INJECT_FAILED \ + nr_retry++; + +#define RSEQ_INJECT_C(n) \ +{ \ + int loc_i, loc_nr_loops = loop_cnt[n]; \ + \ + for (loc_i = 0; loc_i < loc_nr_loops; loc_i++) { \ + barrier(); \ + } \ + if (loc_nr_loops == -1 && opt_modulo) { \ + if (yield_mod_cnt == opt_modulo - 1) { \ + if (opt_sleep > 0) \ + poll(NULL, 0, opt_sleep); \ + if (opt_yield) \ + sched_yield(); \ + if (opt_signal) \ + raise(SIGUSR1); \ + yield_mod_cnt = 0; \ + } else { \ + yield_mod_cnt++; \ + } \ + } \ +} + +#define RSEQ_FALLBACK_CNT \ + opt_fallback_cnt + +#else + +#define printf_nobench(fmt, ...) 
+ +#endif /* BENCHMARK */ + +#include "rseq.h" + +struct percpu_lock_entry { + intptr_t v; +} __attribute__((aligned(128))); + +struct percpu_lock { + struct percpu_lock_entry c[CPU_SETSIZE]; +}; + +struct test_data_entry { + int count; +} __attribute__((aligned(128))); + +struct spinlock_test_data { + struct percpu_lock lock; + struct test_data_entry c[CPU_SETSIZE]; +}; + +struct spinlock_thread_test_data { + struct spinlock_test_data *data; + int reps; + int reg; +}; + +struct inc_test_data { + struct test_data_entry c[CPU_SETSIZE]; +}; + +struct inc_thread_test_data { + struct inc_test_data *data; + int reps; + int reg; +}; + +struct percpu_list_node { + intptr_t data; + struct percpu_list_node *next; +}; + +struct percpu_list_entry { + struct percpu_list_node *head; +} __attribute__((aligned(128))); + +struct percpu_list { + struct percpu_list_entry c[CPU_SETSIZE]; +}; + +/* A simple percpu spinlock. Returns the cpu lock was acquired on. */ +static int rseq_percpu_lock(struct percpu_lock *lock) +{ + struct rseq_state rseq_state; + intptr_t *targetptr, newval; + int cpu; + bool result; + + for (;;) { + do_rseq(&rseq_lock, rseq_state, cpu, result, targetptr, newval, + { + if (unlikely(lock->c[cpu].v)) { + result = false; + } else { + newval = 1; + targetptr = (intptr_t *)&lock->c[cpu].v; + } + }); + if (likely(result)) + break; + } + /* + * Acquire semantic when taking lock after control dependency. + * Matches smp_store_release(). + */ + smp_acquire__after_ctrl_dep(); + return cpu; +} + +static void rseq_percpu_unlock(struct percpu_lock *lock, int cpu) +{ + assert(lock->c[cpu].v == 1); + /* + * Release lock, with release semantic. Matches + * smp_acquire__after_ctrl_dep(). + */ + smp_store_release(&lock->c[cpu].v, 0); +} + +void *test_percpu_spinlock_thread(void *arg) +{ + struct spinlock_thread_test_data *thread_data = arg; + struct spinlock_test_data *data = thread_data->data; + int i, cpu; + + if (!opt_disable_rseq && thread_data->reg + && rseq_init_current_thread()) + abort(); + for (i = 0; i < thread_data->reps; i++) { + cpu = rseq_percpu_lock(&data->lock); + data->c[cpu].count++; + rseq_percpu_unlock(&data->lock, cpu); +#ifndef BENCHMARK + if (i != 0 && !(i % (thread_data->reps / 10))) + printf("tid %d: count %d\n", (int) gettid(), i); +#endif + } + printf_nobench("tid %d: number of retry: %d, signals delivered: %u, nr_fallback %u, nr_fallback_wait %u\n", + (int) gettid(), nr_retry, signals_delivered, + __rseq_thread_state.fallback_cnt, + __rseq_thread_state.fallback_wait_cnt); + return NULL; +} + +/* + * A simple test which implements a sharded counter using a per-cpu + * lock. Obviously real applications might prefer to simply use a + * per-cpu increment; however, this is reasonable for a test and the + * lock can be extended to synchronize more complicated operations. 
+ */ +void test_percpu_spinlock(void) +{ + const int num_threads = opt_threads; + int i, sum, ret; + pthread_t test_threads[num_threads]; + struct spinlock_test_data data; + struct spinlock_thread_test_data thread_data[num_threads]; + + memset(&data, 0, sizeof(data)); + for (i = 0; i < num_threads; i++) { + thread_data[i].reps = opt_reps; + if (opt_disable_mod <= 0 || (i % opt_disable_mod)) + thread_data[i].reg = 1; + else + thread_data[i].reg = 0; + thread_data[i].data = &data; + ret = pthread_create(&test_threads[i], NULL, + test_percpu_spinlock_thread, &thread_data[i]); + if (ret) { + errno = ret; + perror("pthread_create"); + abort(); + } + } + + for (i = 0; i < num_threads; i++) { + pthread_join(test_threads[i], NULL); + if (ret) { + errno = ret; + perror("pthread_join"); + abort(); + } + } + + sum = 0; + for (i = 0; i < CPU_SETSIZE; i++) + sum += data.c[i].count; + + assert(sum == opt_reps * num_threads); +} + +void *test_percpu_inc_thread(void *arg) +{ + struct inc_thread_test_data *thread_data = arg; + struct inc_test_data *data = thread_data->data; + int i; + + if (!opt_disable_rseq && thread_data->reg + && rseq_init_current_thread()) + abort(); + for (i = 0; i < thread_data->reps; i++) { + struct rseq_state rseq_state; + intptr_t *targetptr, newval; + int cpu; + bool result; + + do_rseq(&rseq_lock, rseq_state, cpu, result, targetptr, newval, + { + newval = (intptr_t)data->c[cpu].count + 1; + targetptr = (intptr_t *)&data->c[cpu].count; + }); + +#ifndef BENCHMARK + if (i != 0 && !(i % (thread_data->reps / 10))) + printf("tid %d: count %d\n", (int) gettid(), i); +#endif + } + printf_nobench("tid %d: number of retry: %d, signals delivered: %u, nr_fallback %u, nr_fallback_wait %u\n", + (int) gettid(), nr_retry, signals_delivered, + __rseq_thread_state.fallback_cnt, + __rseq_thread_state.fallback_wait_cnt); + return NULL; +} + +void test_percpu_inc(void) +{ + const int num_threads = opt_threads; + int i, sum, ret; + pthread_t test_threads[num_threads]; + struct inc_test_data data; + struct inc_thread_test_data thread_data[num_threads]; + + memset(&data, 0, sizeof(data)); + for (i = 0; i < num_threads; i++) { + thread_data[i].reps = opt_reps; + if (opt_disable_mod <= 0 || (i % opt_disable_mod)) + thread_data[i].reg = 1; + else + thread_data[i].reg = 0; + thread_data[i].data = &data; + ret = pthread_create(&test_threads[i], NULL, + test_percpu_inc_thread, &thread_data[i]); + if (ret) { + errno = ret; + perror("pthread_create"); + abort(); + } + } + + for (i = 0; i < num_threads; i++) { + pthread_join(test_threads[i], NULL); + if (ret) { + errno = ret; + perror("pthread_join"); + abort(); + } + } + + sum = 0; + for (i = 0; i < CPU_SETSIZE; i++) + sum += data.c[i].count; + + assert(sum == opt_reps * num_threads); +} + +int percpu_list_push(struct percpu_list *list, struct percpu_list_node *node) +{ + struct rseq_state rseq_state; + intptr_t *targetptr, newval; + int cpu; + bool result; + + do_rseq(&rseq_lock, rseq_state, cpu, result, targetptr, newval, + { + newval = (intptr_t)node; + targetptr = (intptr_t *)&list->c[cpu].head; + node->next = list->c[cpu].head; + }); + + return cpu; +} + +/* + * Unlike a traditional lock-less linked list; the availability of a + * rseq primitive allows us to implement pop without concerns over + * ABA-type races. 
+ */ +struct percpu_list_node *percpu_list_pop(struct percpu_list *list) +{ + struct percpu_list_node *head, *next; + struct rseq_state rseq_state; + intptr_t *targetptr, newval; + int cpu; + bool result; + + do_rseq(&rseq_lock, rseq_state, cpu, result, targetptr, newval, + { + head = list->c[cpu].head; + if (!head) { + result = false; + } else { + next = head->next; + newval = (intptr_t) next; + targetptr = (intptr_t *) &list->c[cpu].head; + } + }); + + return head; +} + +void *test_percpu_list_thread(void *arg) +{ + int i; + struct percpu_list *list = (struct percpu_list *)arg; + + if (rseq_init_current_thread()) + abort(); + + for (i = 0; i < opt_reps; i++) { + struct percpu_list_node *node = percpu_list_pop(list); + + if (opt_yield) + sched_yield(); /* encourage shuffling */ + if (node) + percpu_list_push(list, node); + } + + return NULL; +} + +/* Simultaneous modification to a per-cpu linked list from many threads. */ +void test_percpu_list(void) +{ + const int num_threads = opt_threads; + int i, j, ret; + long sum = 0, expected_sum = 0; + struct percpu_list list; + pthread_t test_threads[num_threads]; + cpu_set_t allowed_cpus; + + memset(&list, 0, sizeof(list)); + + /* Generate list entries for every usable cpu. */ + sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus); + for (i = 0; i < CPU_SETSIZE; i++) { + if (!CPU_ISSET(i, &allowed_cpus)) + continue; + for (j = 1; j <= 100; j++) { + struct percpu_list_node *node; + + expected_sum += j; + + node = malloc(sizeof(*node)); + assert(node); + node->data = j; + node->next = list.c[i].head; + list.c[i].head = node; + } + } + + for (i = 0; i < num_threads; i++) { + ret = pthread_create(&test_threads[i], NULL, + test_percpu_list_thread, &list); + if (ret) { + errno = ret; + perror("pthread_create"); + abort(); + } + } + + for (i = 0; i < num_threads; i++) { + pthread_join(test_threads[i], NULL); + if (ret) { + errno = ret; + perror("pthread_join"); + abort(); + } + } + + for (i = 0; i < CPU_SETSIZE; i++) { + cpu_set_t pin_mask; + struct percpu_list_node *node; + + if (!CPU_ISSET(i, &allowed_cpus)) + continue; + + CPU_ZERO(&pin_mask); + CPU_SET(i, &pin_mask); + sched_setaffinity(0, sizeof(pin_mask), &pin_mask); + + while ((node = percpu_list_pop(&list))) { + sum += node->data; + free(node); + } + } + + /* + * All entries should now be accounted for (unless some external + * actor is interfering with our allowed affinity while this + * test is running). 
+ */ + assert(sum == expected_sum); +} + +static void test_signal_interrupt_handler(int signo) +{ + signals_delivered++; +} + +static int set_signal_handler(void) +{ + int ret = 0; + struct sigaction sa; + sigset_t sigset; + + ret = sigemptyset(&sigset); + if (ret < 0) { + perror("sigemptyset"); + return ret; + } + + sa.sa_handler = test_signal_interrupt_handler; + sa.sa_mask = sigset; + sa.sa_flags = 0; + ret = sigaction(SIGUSR1, &sa, NULL); + if (ret < 0) { + perror("sigaction"); + return ret; + } + + printf_nobench("Signal handler set for SIGUSR1\n"); + + return ret; +} + +static void show_usage(int argc, char **argv) +{ + printf("Usage : %s \n", + argv[0]); + printf("OPTIONS:\n"); + printf(" [-1 loops] Number of loops for delay injection 1\n"); + printf(" [-2 loops] Number of loops for delay injection 2\n"); + printf(" [-3 loops] Number of loops for delay injection 3\n"); + printf(" [-4 loops] Number of loops for delay injection 4\n"); + printf(" [-5 loops] Number of loops for delay injection 5 (-1 to enable -m)\n"); + printf(" [-6 loops] Number of loops for delay injection 6 (-1 to enable -m)\n"); + printf(" [-7 loops] Number of loops for delay injection 7 (-1 to enable -m)\n"); + printf(" [-8 loops] Number of loops for delay injection 8 (-1 to enable -m)\n"); + printf(" [-9 loops] Number of loops for delay injection 9 (-1 to enable -m)\n"); + printf(" [-m N] Yield/sleep/kill every modulo N (default 0: disabled) (>= 0)\n"); + printf(" [-y] Yield\n"); + printf(" [-k] Kill thread with signal\n"); + printf(" [-s S] S: =0: disabled (default), >0: sleep time (ms)\n"); + printf(" [-f N] Use fallback every N failure (>= 1)\n"); + printf(" [-t N] Number of threads (default 200)\n"); + printf(" [-r N] Number of repetitions per thread (default 5000)\n"); + printf(" [-d] Disable rseq system call (no initialization)\n"); + printf(" [-D M] Disable rseq for each M threads\n"); + printf(" [-T test] Choose test: (s)pinlock, (l)ist, (i)ncrement\n"); + printf(" [-h] Show this help.\n"); + printf("\n"); +} + +int main(int argc, char **argv) +{ + int i; + + if (rseq_init_lock(&rseq_lock)) { + perror("rseq_init_lock"); + return -1; + } + if (set_signal_handler()) + goto error; + for (i = 1; i < argc; i++) { + if (argv[i][0] != '-') + continue; + switch (argv[i][1]) { + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + if (argc < i + 2) { + show_usage(argc, argv); + goto error; + } + loop_cnt[argv[i][1] - '0'] = atol(argv[i + 1]); + i++; + break; + case 'm': + if (argc < i + 2) { + show_usage(argc, argv); + goto error; + } + opt_modulo = atol(argv[i + 1]); + if (opt_modulo < 0) { + show_usage(argc, argv); + goto error; + } + i++; + break; + case 's': + if (argc < i + 2) { + show_usage(argc, argv); + goto error; + } + opt_sleep = atol(argv[i + 1]); + if (opt_sleep < 0) { + show_usage(argc, argv); + goto error; + } + i++; + break; + case 'y': + opt_yield = 1; + break; + case 'k': + opt_signal = 1; + break; + case 'd': + opt_disable_rseq = 1; + break; + case 'D': + if (argc < i + 2) { + show_usage(argc, argv); + goto error; + } + opt_disable_mod = atol(argv[i + 1]); + if (opt_disable_mod < 0) { + show_usage(argc, argv); + goto error; + } + i++; + break; + case 'f': + if (argc < i + 2) { + show_usage(argc, argv); + goto error; + } + opt_fallback_cnt = atol(argv[i + 1]); + if (opt_fallback_cnt < 1) { + show_usage(argc, argv); + goto error; + } + i++; + break; + case 't': + if (argc < i + 2) { + show_usage(argc, argv); + goto error; + } + opt_threads = 
atol(argv[i + 1]); + if (opt_threads < 0) { + show_usage(argc, argv); + goto error; + } + i++; + break; + case 'r': + if (argc < i + 2) { + show_usage(argc, argv); + goto error; + } + opt_reps = atol(argv[i + 1]); + if (opt_reps < 0) { + show_usage(argc, argv); + goto error; + } + i++; + break; + case 'h': + show_usage(argc, argv); + goto end; + case 'T': + if (argc < i + 2) { + show_usage(argc, argv); + goto error; + } + opt_test = *argv[i + 1]; + switch (opt_test) { + case 's': + case 'l': + case 'i': + break; + default: + show_usage(argc, argv); + goto error; + } + i++; + break; + default: + show_usage(argc, argv); + goto error; + } + } + + if (!opt_disable_rseq && rseq_init_current_thread()) + goto error; + switch (opt_test) { + case 's': + printf_nobench("spinlock\n"); + test_percpu_spinlock(); + break; + case 'l': + printf_nobench("linked list\n"); + test_percpu_list(); + break; + case 'i': + printf_nobench("counter increment\n"); + test_percpu_inc(); + break; + } +end: + return 0; + +error: + if (rseq_destroy_lock(&rseq_lock)) + perror("rseq_destroy_lock"); + return -1; +} diff --git a/tools/testing/selftests/rseq/rseq.c b/tools/testing/selftests/rseq/rseq.c new file mode 100644 index 000000000000..f411be2c77bc --- /dev/null +++ b/tools/testing/selftests/rseq/rseq.c @@ -0,0 +1,200 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rseq.h" + +#ifdef __NR_membarrier +# define membarrier(...) syscall(__NR_membarrier, __VA_ARGS__) +#else +# define membarrier(...) -ENOSYS +#endif + +__thread volatile struct rseq_thread_state __rseq_thread_state = { + .abi.u.e.cpu_id = -1, +}; + +int rseq_has_sys_membarrier; + +static int sys_rseq(volatile struct rseq *rseq_abi, int flags) +{ + return syscall(__NR_rseq, rseq_abi, flags); +} + +int rseq_init_current_thread(void) +{ + int rc; + + rc = sys_rseq(&__rseq_thread_state.abi, 0); + if (rc) { + fprintf(stderr, "Error: sys_rseq(...) failed(%d): %s\n", + errno, strerror(errno)); + return -1; + } + assert(rseq_current_cpu() >= 0); + return 0; +} + +int rseq_init_lock(struct rseq_lock *rlock) +{ + int ret; + + ret = pthread_mutex_init(&rlock->lock, NULL); + if (ret) { + errno = ret; + return -1; + } + rlock->state = RSEQ_LOCK_STATE_RESTART; + return 0; +} + +int rseq_destroy_lock(struct rseq_lock *rlock) +{ + int ret; + + ret = pthread_mutex_destroy(&rlock->lock); + if (ret) { + errno = ret; + return -1; + } + return 0; +} + +static void signal_off_save(sigset_t *oldset) +{ + sigset_t set; + int ret; + + sigfillset(&set); + ret = pthread_sigmask(SIG_BLOCK, &set, oldset); + if (ret) + abort(); +} + +static void signal_restore(sigset_t oldset) +{ + int ret; + + ret = pthread_sigmask(SIG_SETMASK, &oldset, NULL); + if (ret) + abort(); +} + +static void rseq_fallback_lock(struct rseq_lock *rlock) +{ + signal_off_save((sigset_t *)&__rseq_thread_state.sigmask_saved); + pthread_mutex_lock(&rlock->lock); + __rseq_thread_state.fallback_cnt++; + /* + * For concurrent threads arriving before we set LOCK: + * reading cpu_id after setting the state to LOCK + * ensures they restart. + */ + ACCESS_ONCE(rlock->state) = RSEQ_LOCK_STATE_LOCK; + /* + * For concurrent threads arriving after we set LOCK: + * those will grab the lock, so we are protected by + * mutual exclusion. 
+ */ +} + +void rseq_fallback_wait(struct rseq_lock *rlock) +{ + signal_off_save((sigset_t *)&__rseq_thread_state.sigmask_saved); + pthread_mutex_lock(&rlock->lock); + __rseq_thread_state.fallback_wait_cnt++; + pthread_mutex_unlock(&rlock->lock); + signal_restore(__rseq_thread_state.sigmask_saved); +} + +static void rseq_fallback_unlock(struct rseq_lock *rlock, int cpu_at_start) +{ + /* + * Concurrent rseq arriving before we set state back to RESTART + * grab the lock. Those arriving after we set state back to + * RESTART will perform restartable critical sections. The next + * owner of the lock will take care of making sure it prevents + * concurrent restartable sequences from completing. We may be + * writing from another CPU, so update the state with a store + * release semantic to ensure restartable sections will see our + * side effect (writing to *p) before they enter their + * restartable critical section. + * + * In cases where we observe that we are on the right CPU after the + * critical section, program order ensures that following restartable + * critical sections will see our stores, so we don't have to use + * store-release or membarrier. + * + * Use sys_membarrier when available to remove the memory barrier + * implied by smp_load_acquire(). + */ + barrier(); + if (likely(rseq_current_cpu() == cpu_at_start)) { + ACCESS_ONCE(rlock->state) = RSEQ_LOCK_STATE_RESTART; + } else { + if (!has_fast_acquire_release() && rseq_has_sys_membarrier) { + if (membarrier(MEMBARRIER_CMD_SHARED, 0)) + abort(); + ACCESS_ONCE(rlock->state) = RSEQ_LOCK_STATE_RESTART; + } else { + /* + * Store with release semantic to ensure + * restartable sections will see our side effect + * (writing to *p) before they enter their + * restartable critical section. Matches + * smp_load_acquire() in rseq_start(). + */ + smp_store_release(&rlock->state, + RSEQ_LOCK_STATE_RESTART); + } + } + pthread_mutex_unlock(&rlock->lock); + signal_restore(__rseq_thread_state.sigmask_saved); +} + +int rseq_fallback_current_cpu(void) +{ + int cpu; + + cpu = sched_getcpu(); + if (cpu < 0) { + perror("sched_getcpu()"); + abort(); + } + return cpu; +} + +int rseq_fallback_begin(struct rseq_lock *rlock) +{ + rseq_fallback_lock(rlock); + return rseq_fallback_current_cpu(); +} + +void rseq_fallback_end(struct rseq_lock *rlock, int cpu) +{ + rseq_fallback_unlock(rlock, cpu); +} + +/* Handle non-initialized rseq for this thread. */ +void rseq_fallback_noinit(struct rseq_state *rseq_state) +{ + rseq_state->lock_state = RSEQ_LOCK_STATE_FAIL; + rseq_state->cpu_id = 0; +} + +void __attribute__((constructor)) rseq_init(void) +{ + int ret; + + ret = membarrier(MEMBARRIER_CMD_QUERY, 0); + if (ret >= 0 && (ret & MEMBARRIER_CMD_SHARED)) + rseq_has_sys_membarrier = 1; +} diff --git a/tools/testing/selftests/rseq/rseq.h b/tools/testing/selftests/rseq/rseq.h new file mode 100644 index 000000000000..791e14cf42ae --- /dev/null +++ b/tools/testing/selftests/rseq/rseq.h @@ -0,0 +1,449 @@ +#ifndef RSEQ_H +#define RSEQ_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Empty code injection macros, override when testing. + * It is important to consider that the ASM injection macros need to be + * fully reentrant (e.g. do not modify the stack). 
+ */ +#ifndef RSEQ_INJECT_ASM +#define RSEQ_INJECT_ASM(n) +#endif + +#ifndef RSEQ_INJECT_C +#define RSEQ_INJECT_C(n) +#endif + +#ifndef RSEQ_INJECT_INPUT +#define RSEQ_INJECT_INPUT +#endif + +#ifndef RSEQ_INJECT_CLOBBER +#define RSEQ_INJECT_CLOBBER +#endif + +#ifndef RSEQ_INJECT_FAILED +#define RSEQ_INJECT_FAILED +#endif + +#ifndef RSEQ_FALLBACK_CNT +#define RSEQ_FALLBACK_CNT 3 +#endif + +struct rseq_thread_state { + struct rseq abi; /* Kernel ABI. */ + uint32_t fallback_wait_cnt; + uint32_t fallback_cnt; + sigset_t sigmask_saved; +}; + +extern __thread volatile struct rseq_thread_state __rseq_thread_state; +extern int rseq_has_sys_membarrier; + +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) +#define barrier() __asm__ __volatile__("" : : : "memory") + +#define ACCESS_ONCE(x) (*(__volatile__ __typeof__(x) *)&(x)) +#define WRITE_ONCE(x, v) __extension__ ({ ACCESS_ONCE(x) = (v); }) +#define READ_ONCE(x) ACCESS_ONCE(x) + +#ifdef __x86_64__ + +#define smp_mb() __asm__ __volatile__ ("mfence" : : : "memory") +#define smp_rmb() barrier() +#define smp_wmb() barrier() + +#define smp_load_acquire(p) \ +__extension__ ({ \ + __typeof(*p) ____p1 = READ_ONCE(*p); \ + barrier(); \ + ____p1; \ +}) + +#define smp_acquire__after_ctrl_dep() smp_rmb() + +#define smp_store_release(p, v) \ +do { \ + barrier(); \ + WRITE_ONCE(*p, v); \ +} while (0) + +#define has_fast_acquire_release() 1 +#define has_single_copy_load_64() 1 + +#elif __i386__ + +/* + * Support older 32-bit architectures that do not implement fence + * instructions. + */ +#define smp_mb() \ + __asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory") +#define smp_rmb() \ + __asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory") +#define smp_wmb() \ + __asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory") + +#define smp_load_acquire(p) \ +__extension__ ({ \ + __typeof(*p) ____p1 = READ_ONCE(*p); \ + smp_mb(); \ + ____p1; \ +}) + +#define smp_acquire__after_ctrl_dep() smp_rmb() + +#define smp_store_release(p, v) \ +do { \ + smp_mb(); \ + WRITE_ONCE(*p, v); \ +} while (0) + +#define has_fast_acquire_release() 0 +#define has_single_copy_load_64() 0 + +#elif defined(__ARMEL__) + +#define smp_mb() __asm__ __volatile__ ("dmb" : : : "memory") +#define smp_rmb() __asm__ __volatile__ ("dmb" : : : "memory") +#define smp_wmb() __asm__ __volatile__ ("dmb" : : : "memory") + +#define smp_load_acquire(p) \ +__extension__ ({ \ + __typeof(*p) ____p1 = READ_ONCE(*p); \ + smp_mb(); \ + ____p1; \ +}) + +#define smp_acquire__after_ctrl_dep() smp_rmb() + +#define smp_store_release(p, v) \ +do { \ + smp_mb(); \ + WRITE_ONCE(*p, v); \ +} while (0) + +#define has_fast_acquire_release() 0 +#define has_single_copy_load_64() 1 + +#else +#error unsupported target +#endif + +enum rseq_lock_state { + RSEQ_LOCK_STATE_RESTART = 0, + RSEQ_LOCK_STATE_LOCK = 1, + RSEQ_LOCK_STATE_FAIL = 2, +}; + +struct rseq_lock { + pthread_mutex_t lock; + int32_t state; /* enum rseq_lock_state */ +}; + +/* State returned by rseq_start, passed as argument to rseq_finish. */ +struct rseq_state { + volatile struct rseq_thread_state *rseqp; + int32_t cpu_id; /* cpu_id at start. */ + uint32_t event_counter; /* event_counter at start. */ + int32_t lock_state; /* Lock state at start. */ +}; + +/* + * Initialize rseq for the current thread. Must be called once by any + * thread which uses restartable sequences, before they start using + * restartable sequences. 
If initialization is not invoked, or if it + * fails, the restartable critical sections will fall-back on locking + * (rseq_lock). + */ +int rseq_init_current_thread(void); + +/* + * The fallback lock should be initialized before being used by any + * thread, and destroyed after all threads are done using it. This lock + * should be used by all rseq calls associated with shared data, either + * between threads, or between processes in a shared memory. + * + * There may be many rseq_lock per process, e.g. one per protected data + * structure. + */ +int rseq_init_lock(struct rseq_lock *rlock); +int rseq_destroy_lock(struct rseq_lock *rlock); + +/* + * Restartable sequence fallback prototypes. Fallback on locking when + * rseq is not initialized, not available on the system, or during + * single-stepping to ensure forward progress. + */ +int rseq_fallback_begin(struct rseq_lock *rlock); +void rseq_fallback_end(struct rseq_lock *rlock, int cpu); +void rseq_fallback_wait(struct rseq_lock *rlock); +void rseq_fallback_noinit(struct rseq_state *rseq_state); + +/* + * Restartable sequence fallback for reading the current CPU number. + */ +int rseq_fallback_current_cpu(void); + +static inline int32_t rseq_cpu_at_start(struct rseq_state start_value) +{ + return start_value.cpu_id; +} + +static inline int32_t rseq_current_cpu_raw(void) +{ + return ACCESS_ONCE(__rseq_thread_state.abi.u.e.cpu_id); +} + +static inline int32_t rseq_current_cpu(void) +{ + int32_t cpu; + + cpu = rseq_current_cpu_raw(); + if (unlikely(cpu < 0)) + cpu = rseq_fallback_current_cpu(); + return cpu; +} + +static inline __attribute__((always_inline)) +struct rseq_state rseq_start(struct rseq_lock *rlock) +{ + struct rseq_state result; + + result.rseqp = &__rseq_thread_state; + if (has_single_copy_load_64()) { + union { + struct { + uint32_t cpu_id; + uint32_t event_counter; + } e; + uint64_t v; + } u; + + u.v = ACCESS_ONCE(result.rseqp->abi.u.v); + result.event_counter = u.e.event_counter; + result.cpu_id = u.e.cpu_id; + } else { + result.event_counter = + ACCESS_ONCE(result.rseqp->abi.u.e.event_counter); + /* load event_counter before cpu_id. */ + RSEQ_INJECT_C(5) + result.cpu_id = ACCESS_ONCE(result.rseqp->abi.u.e.cpu_id); + } + /* + * Read event counter before lock state and cpu_id. This ensures + * that when the state changes from RESTART to LOCK, if we have + * some threads that have already seen the RESTART still in + * flight, they will necessarily be preempted/signalled before a + * thread can see the LOCK state for that same CPU. That + * preemption/signalling will cause them to restart, so they + * don't interfere with the lock. + */ + RSEQ_INJECT_C(6) + + if (!has_fast_acquire_release() && likely(rseq_has_sys_membarrier)) { + result.lock_state = ACCESS_ONCE(rlock->state); + barrier(); + } else { + /* + * Load lock state with acquire semantic. Matches + * smp_store_release() in rseq_fallback_end(). + */ + result.lock_state = smp_load_acquire(&rlock->state); + } + if (unlikely(result.cpu_id < 0)) + rseq_fallback_noinit(&result); + /* + * We need to ensure that the compiler does not re-order the + * loads of any protected values before we read the current + * state. 
+ */ + barrier(); + return result; +} + +static inline __attribute__((always_inline)) +bool rseq_finish(struct rseq_lock *rlock, + intptr_t *p, intptr_t to_write, + struct rseq_state start_value) +{ + RSEQ_INJECT_C(9) + + if (unlikely(start_value.lock_state != RSEQ_LOCK_STATE_RESTART)) { + if (start_value.lock_state == RSEQ_LOCK_STATE_LOCK) + rseq_fallback_wait(rlock); + return false; + } + +#ifdef __x86_64__ + /* + * The __rseq_table section can be used by debuggers to better + * handle single-stepping through the restartable critical + * sections. + */ + __asm__ __volatile__ goto ( + ".pushsection __rseq_table, \"aw\"\n\t" + ".balign 8\n\t" + "4:\n\t" + ".quad 1f, 2f, 3f\n\t" + ".popsection\n\t" + "1:\n\t" + RSEQ_INJECT_ASM(1) + "movq $4b, (%[rseq_cs])\n\t" + RSEQ_INJECT_ASM(2) + "cmpl %[start_event_counter], %[current_event_counter]\n\t" + "jnz 3f\n\t" + RSEQ_INJECT_ASM(3) + "movq %[to_write], (%[target])\n\t" + "2:\n\t" + RSEQ_INJECT_ASM(4) + "movq $0, (%[rseq_cs])\n\t" + "jmp %l[succeed]\n\t" + "3: movq $0, (%[rseq_cs])\n\t" + : /* no outputs */ + : [start_event_counter]"r"(start_value.event_counter), + [current_event_counter]"m"(start_value.rseqp->abi.u.e.event_counter), + [to_write]"r"(to_write), + [target]"r"(p), + [rseq_cs]"r"(&start_value.rseqp->abi.rseq_cs) + RSEQ_INJECT_INPUT + : "memory", "cc" + RSEQ_INJECT_CLOBBER + : succeed + ); +#elif defined(__i386__) + /* + * The __rseq_table section can be used by debuggers to better + * handle single-stepping through the restartable critical + * sections. + */ + __asm__ __volatile__ goto ( + ".pushsection __rseq_table, \"aw\"\n\t" + ".balign 8\n\t" + "4:\n\t" + ".long 1f, 0x0, 2f, 0x0, 3f, 0x0\n\t" + ".popsection\n\t" + "1:\n\t" + RSEQ_INJECT_ASM(1) + "movl $4b, (%[rseq_cs])\n\t" + RSEQ_INJECT_ASM(2) + "cmpl %[start_event_counter], %[current_event_counter]\n\t" + "jnz 3f\n\t" + RSEQ_INJECT_ASM(3) + "movl %[to_write], (%[target])\n\t" + "2:\n\t" + RSEQ_INJECT_ASM(4) + "movl $0, (%[rseq_cs])\n\t" + "jmp %l[succeed]\n\t" + "3: movl $0, (%[rseq_cs])\n\t" + : /* no outputs */ + : [start_event_counter]"r"(start_value.event_counter), + [current_event_counter]"m"(start_value.rseqp->abi.u.e.event_counter), + [to_write]"r"(to_write), + [target]"r"(p), + [rseq_cs]"r"(&start_value.rseqp->abi.rseq_cs) + RSEQ_INJECT_INPUT + : "memory", "cc" + RSEQ_INJECT_CLOBBER + : succeed + ); +#elif defined(__ARMEL__) + { + /* + * The __rseq_table section can be used by debuggers to better + * handle single-stepping through the restartable critical + * sections. 
+ */ + __asm__ __volatile__ goto ( + ".pushsection __rseq_table, \"aw\"\n\t" + ".balign 8\n\t" + ".word 1f, 0x0, 2f, 0x0, 3f, 0x0\n\t" + ".popsection\n\t" + "1:\n\t" + RSEQ_INJECT_ASM(1) + "adr r0, 4f\n\t" + "str r0, [%[rseq_cs]]\n\t" + RSEQ_INJECT_ASM(2) + "ldr r0, %[current_event_counter]\n\t" + "mov r1, #0\n\t" + "cmp %[start_event_counter], r0\n\t" + "bne 3f\n\t" + RSEQ_INJECT_ASM(3) + "str %[to_write], [%[target]]\n\t" + "2:\n\t" + RSEQ_INJECT_ASM(4) + "str r1, [%[rseq_cs]]\n\t" + "b %l[succeed]\n\t" + ".balign 8\n\t" + "4:\n\t" + ".word 1b, 0x0, 2b, 0x0, 3f, 0x0\n\t" + "3:\n\t" + "mov r1, #0\n\t" + "str r1, [%[rseq_cs]]\n\t" + : /* no outputs */ + : [start_event_counter]"r"(start_value.event_counter), + [current_event_counter]"m"(start_value.rseqp->abi.u.e.event_counter), + [to_write]"r"(to_write), + [rseq_cs]"r"(&start_value.rseqp->abi.rseq_cs), + [target]"r"(p) + RSEQ_INJECT_INPUT + : "r0", "r1", "memory", "cc" + RSEQ_INJECT_CLOBBER + : succeed + ); + } +#else +#error unsupported target +#endif + RSEQ_INJECT_FAILED + return false; +succeed: + return true; +} + +/* + * Helper macro doing two restartable critical section attempts, and if + * they fail, fallback on locking. + */ +#define do_rseq(_lock, _rseq_state, _cpu, _result, _targetptr, _newval, \ + _code) \ + do { \ + _rseq_state = rseq_start(_lock); \ + _cpu = rseq_cpu_at_start(_rseq_state); \ + _result = true; \ + _code \ + if (unlikely(!_result)) \ + break; \ + if (likely(rseq_finish(_lock, _targetptr, _newval, \ + _rseq_state))) \ + break; \ + _rseq_state = rseq_start(_lock); \ + _cpu = rseq_cpu_at_start(_rseq_state); \ + _result = true; \ + _code \ + if (unlikely(!_result)) \ + break; \ + if (likely(rseq_finish(_lock, _targetptr, _newval, \ + _rseq_state))) \ + break; \ + _cpu = rseq_fallback_begin(_lock); \ + _result = true; \ + _code \ + if (likely(_result)) \ + *(_targetptr) = (_newval); \ + rseq_fallback_end(_lock, _cpu); \ + } while (0) + +#endif /* RSEQ_H_ */ -- 2.34.1
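The helper library shipped with these self-tests is easiest to see in use through a small, self-contained sketch. The program below (hypothetical file percpu_count.c, not part of the patch set) builds a per-cpu counter on top of the do_rseq() helper from rseq.h; it assumes a kernel with the rseq system call wired up as in the patches above, and that it is compiled together with the helper library, e.g. cc -O2 percpu_count.c rseq.c -lpthread. The names counter_lock, counters, percpu_counter_inc, worker, NR_THREADS and NR_INCREMENTS are illustrative only. As in the self-tests, each per-cpu slot is aligned to 128 bytes to avoid false sharing, and do_rseq() falls back on counter_lock after two failed attempts or when rseq registration is unavailable, so the update always completes.

#define _GNU_SOURCE
#include <assert.h>
#include <pthread.h>
#include <sched.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include "rseq.h"

#define NR_THREADS	4
#define NR_INCREMENTS	100000

/* Fallback lock used by do_rseq() when rseq cannot make progress. */
static struct rseq_lock counter_lock;

/* One counter per possible cpu, aligned to avoid false sharing. */
struct counter_entry {
	intptr_t count;
} __attribute__((aligned(128)));

static struct counter_entry counters[CPU_SETSIZE];

/* Increment the counter of the cpu this thread currently runs on. */
static int percpu_counter_inc(void)
{
	struct rseq_state rseq_state;
	intptr_t *targetptr, newval;
	int cpu;
	bool result;

	/*
	 * do_rseq() retries the restartable sequence twice, then falls
	 * back on counter_lock, so the increment always completes.
	 */
	do_rseq(&counter_lock, rseq_state, cpu, result, targetptr, newval,
		{
			newval = counters[cpu].count + 1;
			targetptr = &counters[cpu].count;
		});
	return cpu;
}

static void *worker(void *arg)
{
	int i;

	(void)arg;
	/*
	 * Register this thread with the rseq system call. If this fails
	 * (e.g. the kernel lacks sys_rseq), do_rseq() transparently
	 * falls back on counter_lock.
	 */
	(void)rseq_init_current_thread();
	for (i = 0; i < NR_INCREMENTS; i++)
		percpu_counter_inc();
	return NULL;
}

int main(void)
{
	pthread_t tids[NR_THREADS];
	intptr_t sum = 0;
	int i;

	if (rseq_init_lock(&counter_lock))
		abort();
	for (i = 0; i < NR_THREADS; i++)
		assert(!pthread_create(&tids[i], NULL, worker, NULL));
	for (i = 0; i < NR_THREADS; i++)
		assert(!pthread_join(tids[i], NULL));
	for (i = 0; i < CPU_SETSIZE; i++)
		sum += counters[i].count;
	printf("sum: %ld (expected %d)\n", (long)sum,
	       NR_THREADS * NR_INCREMENTS);
	if (rseq_destroy_lock(&counter_lock))
		abort();
	return 0;
}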