Merge branch 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...

author Linus Torvalds <torvalds@linux-foundation.org>

Fri, 15 Apr 2016 02:53:46 +0000 (19:53 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Fri, 15 Apr 2016 02:53:46 +0000 (19:53 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 15 Apr 2016 02:53:46 +0000 (19:53 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 15 Apr 2016 02:53:46 +0000 (19:53 -0700)
diff --git a/Documentation/x86/protection-keys.txt b/Documentation/x86/protection-keys.txt

new file mode 100644 (file)

index 0000000..c281ded
--- /dev/null
+++ b/Documentation/x86/protection-keys.txt
@@ -0,0 +1,27 @@
+Memory Protection Keys for Userspace (PKU aka PKEYs) is a CPU feature
+which will be found on future Intel CPUs.
+
+Memory Protection Keys provides a mechanism for enforcing page-based
+protections, but without requiring modification of the page tables
+when an application changes protection domains.  It works by
+dedicating 4 previously ignored bits in each page table entry to a
+"protection key", giving 16 possible keys.
+
+There is also a new user-accessible register (PKRU) with two separate
+bits (Access Disable and Write Disable) for each key.  Being a CPU
+register, PKRU is inherently thread-local, potentially giving each
+thread a different set of protections from every other thread.
+
+There are two new instructions (RDPKRU/WRPKRU) for reading and writing
+to the new register.  The feature is only available in 64-bit mode,
+even though there is theoretically space in the PAE PTEs.  These
+permissions are enforced on data access only and have no effect on
+instruction fetches.
+
+=========================== Config Option ===========================
+
+This config option adds approximately 1.5kb of text and 50 bytes of
+data to the executable.  A workload which does large O_DIRECT reads
+of holes in XFS files was run to exercise get_user_pages_fast().  No
+performance delta was observed with the config option
+enabled or disabled.
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile

index 6915ff2bd9962e0b495d0af05c18f51a9382c36e..8774cb23064fe417cf35935c33e876eb9faa9670 100644 (file)
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -26,7 +26,7 @@ targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \
         vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4
  
  KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2
-KBUILD_CFLAGS += -fno-strict-aliasing -fPIC
+KBUILD_CFLAGS += -fno-strict-aliasing $(call cc-option, -fPIE, -fPIC)
  KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
  cflags-$(CONFIG_X86_32) := -march=i386
  cflags-$(CONFIG_X86_64) := -mcmodel=small
@@ -40,6 +40,18 @@ GCOV_PROFILE := n
  UBSAN_SANITIZE :=n
  
  LDFLAGS := -m elf_$(UTS_MACHINE)
+ifeq ($(CONFIG_RELOCATABLE),y)
+# If kernel is relocatable, build compressed kernel as PIE.
+ifeq ($(CONFIG_X86_32),y)
+LDFLAGS += $(call ld-option, -pie) $(call ld-option, --no-dynamic-linker)
+else
+# To build the 64-bit compressed kernel as PIE, we pass the new linker
+# command-line option, -z noreloc-overflow, which disables the relocation
+# overflow check and so avoids relocation overflow errors.
+LDFLAGS += $(shell $(LD) --help 2>&1 | grep -q "\-z noreloc-overflow" \
+       && echo "-z noreloc-overflow -pie --no-dynamic-linker")
+endif
+endif
  LDFLAGS_vmlinux := -T
  
  hostprogs-y    := mkpiggy
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S

index 8ef964ddc18ec656b1e3b0f038adf134484dbdd2..0256064da8da38c69098cbccc75a19cc0493ca82 100644 (file)
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -31,6 +31,34 @@
  #include <asm/asm-offsets.h>
  #include <asm/bootparam.h>
  
+/*
+ * The 32-bit x86 assembler in binutils 2.26 will generate R_386_GOT32X
+ * relocation to get the symbol address in PIC.  When the compressed x86
+ * kernel isn't built as PIC, the linker optimizes R_386_GOT32X
+ * relocations to their fixed symbol addresses.  However, when the
+ * compressed x86 kernel is loaded at a different address, it leads
+ * to the following load failure:
+ *
+ *   Failed to allocate space for phdrs
+ *
+ * during the decompression stage.
+ *
+ * If the compressed x86 kernel is relocatable at run-time, it should be
+ * compiled with -fPIE, instead of -fPIC, if possible and should be built as
+ * Position Independent Executable (PIE) so that linker won't optimize
+ * R_386_GOT32X relocation to its fixed symbol address.  Older
+ * linkers generate R_386_32 relocations against locally defined symbols,
+ * _bss, _ebss, _got and _egot, in PIE.  It isn't wrong, just less
+ * optimal than R_386_RELATIVE.  But the x86 kernel fails to properly handle
+ * R_386_32 relocations when relocating the kernel.  To generate
+ * R_386_RELATIVE relocations, we mark _bss, _ebss, _got and _egot as
+ * hidden:
+ */
+       .hidden _bss
+       .hidden _ebss
+       .hidden _got
+       .hidden _egot
+
         __HEAD
  ENTRY(startup_32)
  #ifdef CONFIG_EFI_STUB
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S

index b0c0d16ef58d1099342c97aff83767dd35c73691..86558a1991393c509bc3005c021c2ab490b26b2a 100644 (file)
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -33,6 +33,14 @@
  #include <asm/asm-offsets.h>
  #include <asm/bootparam.h>
  
+/*
+ * Locally defined symbols should be marked hidden:
+ */
+       .hidden _bss
+       .hidden _ebss
+       .hidden _got
+       .hidden _egot
+
         __HEAD
         .code32
  ENTRY(startup_32)
diff --git a/arch/x86/kernel/cpu/mcheck/mce-genpool.c b/arch/x86/kernel/cpu/mcheck/mce-genpool.c

index 0a850100c5944641717c797e5d4f129b2cf9a79f..2658e2af74ec4c3f433b9958b4498d7f7d603fed 100644 (file)
--- a/arch/x86/kernel/cpu/mcheck/mce-genpool.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-genpool.c
@@ -29,7 +29,7 @@ static char gen_pool_buf[MCE_POOLSZ];
  void mce_gen_pool_process(void)
  {
         struct llist_node *head;
-       struct mce_evt_llist *node;
+       struct mce_evt_llist *node, *tmp;
         struct mce *mce;
  
         head = llist_del_all(&mce_event_llist);
@@ -37,7 +37,7 @@ void mce_gen_pool_process(void)
                 return;
  
         head = llist_reverse_order(head);
-       llist_for_each_entry(node, head, llnode) {
+       llist_for_each_entry_safe(node, tmp, head, llnode) {
                 mce = &node->mce;
                 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
                 gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node));
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c

index eb934b0242e0e17652b7fc17c3255b5e70e40d15..67392b6ab845fbdec7d54798b591c24e544347a9 100644 (file)
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -331,7 +331,7 @@ void set_interrupt(struct lg_cpu *cpu, unsigned int irq)
   * Actually now I think of it, it's possible that Ron *is* half the Plan 9
   * userbase.  Oh well.
   */
-static bool could_be_syscall(unsigned int num)
+bool could_be_syscall(unsigned int num)
  {
         /* Normal Linux IA32_SYSCALL_VECTOR or reserved vector? */
         return num == IA32_SYSCALL_VECTOR || num == syscall_vector;
@@ -416,6 +416,10 @@ bool deliver_trap(struct lg_cpu *cpu, unsigned int num)
   *
   * This routine indicates if a particular trap number could be delivered
   * directly.
+ *
+ * Unfortunately, Linux 4.6 started using an interrupt gate instead of a
+ * trap gate for syscalls, so this trick is ineffective.  See Mastery for
+ * how we could do this anyway...
   */
  static bool direct_trap(unsigned int num)
  {
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h

index ac8ad0461e809db526bfc1db1ca2290ac612a43d..69b3814afd2f63c7fabe82dde698fae44b2d1862 100644 (file)
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -167,6 +167,7 @@ void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta);
  bool send_notify_to_eventfd(struct lg_cpu *cpu);
  void init_clockdev(struct lg_cpu *cpu);
  bool check_syscall_vector(struct lguest *lg);
+bool could_be_syscall(unsigned int num);
  int init_interrupts(void);
  void free_interrupts(void);
  
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c

index 6a4cd771a2be62b4172cc26a178ca85fbf7e6d27..adc162c7040d7ef0a2f8f738e7a21bab1d57ba09 100644 (file)
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -429,8 +429,12 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu)
                         return;
                 break;
         case 32 ... 255:
+               /* This might be a syscall. */
+               if (could_be_syscall(cpu->regs->trapnum))
+                       break;
+
                 /*
-                * These values mean a real interrupt occurred, in which case
+                * Other values mean a real interrupt occurred, in which case
                  * the Host handler has already been run. We just do a
                  * friendly check if another process should now be run, then
                  * return to run the Guest again.
author	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 15 Apr 2016 02:53:46 +0000 (19:53 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 15 Apr 2016 02:53:46 +0000 (19:53 -0700)
Documentation/x86/protection-keys.txt	[new file with mode: 0644]	patch \| blob
arch/x86/boot/compressed/Makefile		patch \| blob \| blame \| history
arch/x86/boot/compressed/head_32.S		patch \| blob \| blame \| history
arch/x86/boot/compressed/head_64.S		patch \| blob \| blame \| history
arch/x86/kernel/cpu/mcheck/mce-genpool.c		patch \| blob \| blame \| history
drivers/lguest/interrupts_and_traps.c		patch \| blob \| blame \| history
drivers/lguest/lg.h		patch \| blob \| blame \| history
drivers/lguest/x86/core.c		patch \| blob \| blame \| history